Example #1
if __name__ == "__main__":
    import sys
    import xml.sax

    # parse xml; AnnotationHandler, tokenizer, and token_offset are
    # defined elsewhere in the source module (see Example #4)
    handler = AnnotationHandler()
    xml.sax.parse(sys.stdin, handler)

    # print header
    print '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>'''

    # convert to token level offsets and output
    for id, sentence in sorted(handler.sentences.items()):
        print '    <sentence id="%d">' % id
        print '        <text>%s</text>' % ' '.join(
            tokenizer.tokenize(sentence))
        print '        <aspectTerms>'
        for item in handler.aspect_terms[id]:
            start_token = token_offset(sentence, item.start)
            end_token = token_offset(sentence, item.end)
            print '            <aspectTerm term="%s" polarity="%s" from="%d" to="%d"/>' % (
                ' '.join(tokenizer.tokenize(item.term, ptbTokenization=True)),
                item.polarity, start_token, end_token)
        print '        </aspectTerms>'
        print '        <aspectCategories>'
        for item in handler.aspect_categories[id]:
            print '            <aspectCategory category="%s" polarity="%s"/>' % (
                item.category, item.polarity)
        print '        </aspectCategories>'
        print '    </sentence>'
    print '</sentences>'
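
The offset conversion above relies on a simple invariant: the number of tokens in the character-level prefix of a sentence equals the token index at that character position. A minimal sketch of the idea, using a plain whitespace split as a stand-in for PTBTokenizer (an assumption; the real tokenizer also splits off punctuation):

def simple_token_offset(sentence, char_offset):
    # tokens fully contained in the prefix == token index at that character
    return len(sentence[:char_offset].split())

sentence = "The battery life is great ."
# the term "battery life" spans characters 4..16
print(simple_token_offset(sentence, 4))   # 1  (first token of the term)
print(simple_token_offset(sentence, 16))  # 3  (one past the last token)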
Example #2

import sys

from util import *
from Tokenizer import PTBTokenizer


assert len(sys.argv) == 1


# main
# loop over sentences and their annotation lines
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()
    if line.startswith("S "):
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
    elif line.startswith("A "):
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        postfix = sentence[:char_end]
        tok_end = len(tokenizer.tokenize(postfix))
        start_end = str(tok_start) + " " + str(tok_end)
        fields[0] = start_end
        # tokenize corrections, remove trailing whitespace
        corrections = [(' '.join(tokenizer.tokenize(c))).strip() for c in fields[2].split('||')]
        fields[2] = '||'.join(corrections)
        # emit the rewritten annotation line
        print ("A " + '|||'.join(fields)).encode("utf8")
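
To see what the loop does to a single annotation, here is a toy run under the same assumptions as the sketch above (a whitespace split in place of PTBTokenizer, and an illustrative M2-style annotation line, not taken from the source):

sentence = u"The battery life is great."
line = u"A 4 16|||Wci|||battery lifetime|||REQUIRED|||-NONE-|||0"
fields = line[2:].split('|||')
char_start, char_end = [int(a) for a in fields[0].split()]
# recompute the span as token offsets by counting tokens in each prefix
fields[0] = "%d %d" % (len(sentence[:char_start].split()),
                       len(sentence[:char_end].split()))
print("A " + '|||'.join(fields))
# -> A 1 3|||Wci|||battery lifetime|||REQUIRED|||-NONE-|||0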
Example #3

import sys
import re
import os
from m2util import *
from Tokenizer import PTBTokenizer

assert len(sys.argv) == 1

# main
# loop over sentences and their annotation lines
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()
    if line.startswith("S "):
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
    elif line.startswith("A "):
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        postfix = sentence[:char_end]
        tok_end = len(tokenizer.tokenize(postfix))
        start_end = str(tok_start) + " " + str(tok_end)
        fields[0] = start_end
        # tokenize corrections, remove trailing whitespace
        corrections = [(' '.join(tokenizer.tokenize(c))).strip()
                       for c in fields[2].split('||')]
        fields[2] = '||'.join(corrections)
        # emit the rewritten annotation line
        print ("A " + '|||'.join(fields)).encode("utf8")
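
Apart from its imports (m2util in place of util, plus re and os), this example performs the same character-to-token offset rewrite as Example #2.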
Example #4
import xml.sax

from Tokenizer import PTBTokenizer

tokenizer = PTBTokenizer()

def token_offset(sentence, offset):
    # the token offset of a character position is the number of tokens
    # in the character-level prefix of the sentence
    return len(tokenizer.tokenize(sentence[:offset], ptbTokenization=True))

if __name__ == "__main__":
    import sys

    # parse xml; AnnotationHandler is defined elsewhere in the source module
    handler = AnnotationHandler()
    xml.sax.parse(sys.stdin, handler)

    # print header
    print '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>'''
    
    # convert to token level offsets and output
    for id, sentence in sorted(handler.sentences.items()):
        print '    <sentence id="%d">' % id
        print '        <text>%s</text>' % ' '.join(tokenizer.tokenize(sentence))
        print '        <aspectTerms>'
        for item in handler.aspect_terms[id]:
            start_token = token_offset(sentence, item.start)
            end_token = token_offset(sentence, item.end)
            print '            <aspectTerm term="%s" polarity="%s" from="%d" to="%d"/>' % (' '.join(tokenizer.tokenize(item.term, ptbTokenization=True)), item.polarity, start_token, end_token)
        print '        </aspectTerms>'
        print '        <aspectCategories>'
        for item in handler.aspect_categories[id]:
            print '            <aspectCategory category="%s" polarity="%s"/>' % (item.category, item.polarity)
        print '        </aspectCategories>'
        print '    </sentence>'
    print '</sentences>'
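
The print statements above fully determine the shape of the output; for a single hypothetical sentence with one aspect term and one category (the values here are illustrative, not from the source), the emitted XML would look like:

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>
    <sentence id="1">
        <text>The battery life is great .</text>
        <aspectTerms>
            <aspectTerm term="battery life" polarity="positive" from="1" to="3"/>
        </aspectTerms>
        <aspectCategories>
            <aspectCategory category="battery" polarity="positive"/>
        </aspectCategories>
    </sentence>
</sentences>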