Python TinySegmenter.tokenize Examples

Programming Language: Python

Namespace/Package Name: tinysegmenter

Class/Type: TinySegmenter

Method/Function: tokenize

Examples at hotexamples.com: 4

Python TinySegmenter.tokenize - 4 examples found. These are the top rated real world Python examples of tinysegmenter.TinySegmenter.tokenize extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

TinySegmenter(9)

tokenize(2)

Example #1

Show file

def wrapper_jp(string, width):
    """Wrap a Japanese string by inserting newlines at token boundaries.

    The string is split with ``TinySegmenter().tokenize`` and the tokens are
    packed greedily: a token joins the current line while the line stays at
    most ``width`` characters long, otherwise it starts a new line.

    Args:
        string: Text to wrap.
        width: Maximum number of characters per line.

    Returns:
        The wrapped text with lines separated by ``'\\n'`` and no trailing
        newline; an empty string when there are no tokens.

    Note:
        Tokens are never split.  A single token longer than ``width`` is
        placed on a line of its own (that line then exceeds ``width``)
        instead of stalling the wrap loop.
    """
    tokens = TinySegmenter().tokenize(string)

    lines = []
    current = []      # tokens of the line being built
    current_len = 0   # total character count of ``current``
    for token in tokens:
        # Close the line only when it already holds something; an empty line
        # always accepts the next token, which guarantees forward progress
        # even for a token that is longer than ``width`` on its own.
        if current and current_len + len(token) > width:
            lines.append("".join(current))
            current = []
            current_len = 0
        current.append(token)
        current_len += len(token)

    if current:
        lines.append("".join(current))
    return "\n".join(lines)

Example #2

Show file

File: say-ust.py Project: logimac/SayUstTweet

    def on_status(self, status):
        
        flg = 0
        for buff in exception_ids:
            if (status.author.screen_name == buff) : flg = 1
        
        if not hasattr(status, 'retweeted_status') and flg != 1:
            try:
                print u'\n---{name}/@{screen}---\n   {text}\nvia {src} {created}'.format(
                        name = status.author.name,
                        screen = status.author.screen_name,
                        text = status.text.replace('&amp;','&'),
                        src = status.source,
                        created = status.created_at)
                read_text = str_replace(status.author.name.decode('utf-8')) + 'さん　' + str_replace(status.text.decode('utf-8'))
            
                ts = TinySegmenter()
                result = ts.tokenize(read_text)
                string_jp = ''
                string_en = ''
                for seg in result:
                    seg = re.sub('^\s+', '', seg)
                    if (re.match(u'(?:[^\u0000-\u007F]|[\d+]|^[A-Za-rt-z]{1}$)', seg)) and not re.match(u'^[aA]$', seg) :#日本語が含まれる
                        call(['echo "{text}" | say -v Victoria -r 200 >/dev/null 2>&1'.format(text=string_en)], shell=True)
                        string_en = ''
                        string_jp = string_jp + seg
                    else :
                        call(['SayKotoeri2 -s 110 "{text}" >/dev/null 2>&1'.format(text=string_jp)], shell=True)
                        string_jp = ''
                        string_en = string_en + ' ' + seg

                if(string_jp) :
                    call(['SayKotoeri2 -s 110 "{text}" >/dev/null 2>&1'.format(text=string_jp)], shell=True)
                else :
                    call(['echo "{text}" | say -v Victoria -r 200 >/dev/null 2>&1'.format(text=string_en)], shell=True)

            except Exception, e:
                print >> sys.stderr, 'Encountered Exception:', e
                pass

Example #3

Show file

File: tinysegmenter.py Project: rokujyouhitoma/cython-traning

def demo():
    """Tokenize a sample Japanese sentence and print the tokens joined by ' | '."""
    sample = u"私の名前は中野です"
    pieces = TinySegmenter().tokenize(sample)
    joined = u' | '.join(pieces)
    # The joined text is UTF-8 encoded before being handed to print.
    print(joined.encode('utf-8'))

Example #4

Show file

File: tinysegmenter.py Project: rokujyouhitoma/traning-center

def demo():
    """Print the TinySegmenter tokens of a sample sentence, separated by ' | '."""
    text = u"私の名前は中野です"
    tokenizer = TinySegmenter()
    output = u' | '.join(tokenizer.tokenize(text))
    # Encode to UTF-8 bytes before printing, as the demo has always done.
    print(output.encode('utf-8'))