Beispiel #1
0
 def test_flat_doc(self):
     '''
     Input doc with just top-level text.

     Trims a nine-word, single-element document to every limit from 1
     through 10 and checks that the resulting text holds exactly the
     requested number of words, capped at the nine words available.
     '''
     total_words = 9  # number of words in the sample document below
     x = amara.parse('<a>one two three four five six seven eight nine</a>')
     for i in range(1, 11):
         trimmed_tree = util.trim_word_count(x, i)
         # string(.) yields the text content with markup stripped
         word_count = len(trimmed_tree.xml_select(u'string(.)').split())
         # assertEqual: the assertEquals alias is deprecated
         self.assertEqual(word_count, min(i, total_words))
Beispiel #2
0
 def test_nested_doc(self):
     '''
     Input doc with text in nested elements.

     Trims a nine-word document whose words are spread across nested
     child elements to every limit from 1 through 10 and checks that the
     resulting text holds exactly the requested number of words, capped
     at the nine words available.
     '''
     total_words = 9  # number of words in the sample document below
     x = amara.parse('<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>')
     for i in range(1, 11):
         trimmed_tree = util.trim_word_count(x, i)
         # string(.) yields the text content with markup stripped
         word_count = len(trimmed_tree.xml_select(u'string(.)').split())
         # assertEqual: the assertEquals alias is deprecated
         self.assertEqual(word_count, min(i, total_words))
Beispiel #3
0
 def test_flat_doc(self):
     '''
     Input doc with just top-level text.

     For each word limit from 1 through 10, trim a flat nine-word
     document and verify the word count of the trimmed text equals the
     limit, or nine when the limit exceeds the document length.
     '''
     source_word_total = 9  # words contained in the sample markup
     x = amara.parse('<a>one two three four five six seven eight nine</a>')
     for i in range(1, 11):
         trimmed_tree = util.trim_word_count(x, i)
         # Count words in the text content only (tags excluded)
         word_count = len(trimmed_tree.xml_select(u'string(.)').split())
         # Use assertEqual; assertEquals is a deprecated alias
         self.assertEqual(word_count, min(i, source_word_total))
Beispiel #4
0
 def test_nested_doc(self):
     '''
     Input doc with text in nested elements.

     For each word limit from 1 through 10, trim a nine-word document
     with nested child elements and verify the word count of the trimmed
     text equals the limit, or nine when the limit exceeds the document
     length.
     '''
     source_word_total = 9  # words contained in the sample markup
     x = amara.parse(
         '<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>'
     )
     for i in range(1, 11):
         trimmed_tree = util.trim_word_count(x, i)
         # Count words in the text content only (tags excluded)
         word_count = len(trimmed_tree.xml_select(u'string(.)').split())
         # Use assertEqual; assertEquals is a deprecated alias
         self.assertEqual(word_count, min(i, source_word_total))
Beispiel #5
0
def akara_twc(body, ctype, max=None, html='no'):
    '''
    Trim POSTed markup to a maximum number of words, in a markup-aware way:
    tags do not count as words, and the structure of the sub-elements that
    remain in the result is preserved.

    body - the POSTed markup to trim
    ctype - content type of the request body (not consulted here)
    max (query parameter) - maximum word count of the resulting text;
        500 is used when it is omitted or empty
    html (query parameter) - if 'yes', parse the input as HTML rather than XML

    Raises ValueError if max is given but is not a valid integer.

    Sample request:
    curl --request POST --data-binary "<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>" --header "Content-Type: application/xml" "http://localhost:8880/akara.twc?max=7"
    '''
    # The int() conversion is deliberately unguarded: a bad max surfaces as
    # ValueError to the caller. (Open question from the original author:
    # could Akara offer a "Maybe"-like, assert-style idiom for such errors?)
    word_limit = int(max) if max else 500
    # Pick the parser up front; only the selected module is touched.
    parse = htmldoc.parse if html == 'yes' else amara.parse
    tree = parse(body)
    return trim_word_count(tree, word_limit)
Beispiel #6
0
def akara_twc(body, ctype, max=None, html='no'):
    '''
    Return a word-trimmed version of some POSTed markup. The trimming
    understands markup: tags are not counted as words, and the structure of
    the sub-elements kept in the result is preserved.

    body - the POSTed markup to trim
    ctype - content type of the request body (unused by this function)
    max (query parameter) - the maximum word count of the resulting text;
        falls back to 500 when missing or empty
    html (query parameter) - if 'yes', try to parse the input as HTML

    Raises ValueError when max is supplied but cannot be converted to int.

    Sample request:
    curl --request POST --data-binary "<a>one two <b>three four </b><c>five <d>six seven</d> eight</c> nine</a>" --header "Content-Type: application/xml" "http://localhost:8880/akara.twc?max=7"
    '''
    # Resolve the word limit first so an invalid max fails with ValueError
    # before any parsing work is done.
    if max:
        limit = int(max)
    else:
        limit = 500
    # HTML input goes through the HTML parser; anything else through the
    # regular Amara parser.
    if html == 'yes':
        parsed = htmldoc.parse(body)
    else:
        parsed = amara.parse(body)
    return trim_word_count(parsed, limit)