def emit(word_it):
    """Yield one ``vd`` record per item of ``word_it``.

    Every record is built as ``vd(int, {...})`` from a mapping holding:

    * the word itself, mapped to 1;
    * ``'begin_with'``: a nested ``vd(int, ...)`` mapping the word's first
      element (``word[0]``) to 1;
    * ``'has_size'``: a nested ``vd(int, ...)`` mapping ``len(word)`` to 1.

    ``word[0]`` is evaluated unconditionally, so an empty word raises
    ``IndexError``.
    """
    for token in word_it:
        # Build the two nested records first, in the same order the
        # original dict literal evaluated them.
        initial = vd(int, {token[0]: 1})
        size = vd(int, {len(token): 1})
        record = {token: 1, 'begin_with': initial, 'has_size': size}
        yield vd(int, record)
Exemple #2
0
def word_count( unicode_file ):
    """Accumulate word statistics for a text file into a ``vd``.

    Each line is split on runs of ``.``, ``!``, ``"``, whitespace, ``-``,
    ``,`` and ``'``; every piece is lower-cased and stripped of any
    remaining ``string.punctuation`` characters. Pieces longer than two
    characters are added to the result with ``+=`` as a
    ``vd(int, {...})`` record holding the word itself (count 1), a
    ``'begin_with'`` record keyed by its first character and a
    ``'has_size'`` record keyed by its length.

    :param unicode_file: path handed to ``open()`` (text mode, default
        encoding).
    :return: the ``vd`` accumulator, started as ``vd(int, {})``.
    """
    exclude = set(string.punctuation)

    def _clean(word):
        # Drop every punctuation character still embedded in the token.
        return ''.join(ch for ch in word if ch not in exclude)

    # Raw string: the backslash escapes are meant for the regex engine, not
    # for the Python string parser (which warns on unknown escapes like \s).
    sp_pattern = re.compile(r"""[\.\!\"\s\-\,\']+""", re.M)
    res = vd(int, {})
    # The context manager guarantees the file is closed, even on error.
    with open(unicode_file) as handle:
        for line in handle:
            for word in map(_clean, map(str.lower, sp_pattern.split(line))):
                if len(word) > 2:
                    res += vd(int, {
                        word : 1 ,
                        'begin_with' :
                            vd(int, { word[0] : 1 }) ,
                        'has_size' :
                            vd(int, { len(word) : 1 } )
                        })

    return res