if __name__ == "__main__":
    import sys

    # Parse the annotated XML arriving on stdin into the handler's
    # per-sentence dictionaries.
    handler = AnnotationHandler()
    xml.sax.parse(sys.stdin, handler)

    # Document header.  A parenthesized single-argument print behaves
    # identically under the Python 2 print statement.
    print('''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>''')

    # Walk sentences in id order, converting character-level offsets to
    # token-level offsets, and emit the re-tokenized document.
    for sid, text in sorted(handler.sentences.items()):
        print(' <sentence id="%d">' % sid)
        print(' <text>%s</text>' % ' '.join(tokenizer.tokenize(text)))
        print(' <aspectTerms>')
        for aspect in handler.aspect_terms[sid]:
            tok_from = token_offset(text, aspect.start)
            tok_to = token_offset(text, aspect.end)
            attrs = (' '.join(tokenizer.tokenize(aspect.term, ptbTokenization=True)),
                     aspect.polarity, tok_from, tok_to)
            print(' <aspectTerm term="%s" polarity="%s" from="%d" to="%d"/>' % attrs)
        print(' </aspectTerms>')
        print(' <aspectCategories>')
        for aspect in handler.aspect_categories[sid]:
            print(' <aspectCategory category="%s" polarity="%s"/>' % (aspect.category, aspect.polarity))
        print(' </aspectCategories>')
        print(' </sentence>')
    print('</sentences>')
from util import *
from Tokenizer import PTBTokenizer

# The script takes no command-line arguments; all input arrives on stdin.
# NOTE(review): `sys` is never imported in this span -- presumably it is
# re-exported by `from util import *`; confirm against util.
assert len(sys.argv) == 1

# main
# Loop over sentences cum annotation: "S " lines carry the raw sentence,
# "A " lines carry one annotation whose character offsets must be remapped
# to token offsets after PTB tokenization.
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()  # Python 2: bytes -> unicode
    if line.startswith("S "):
        # Sentence line: remember the raw text (following "A " lines
        # refer to it) and emit its tokenized form.
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
    elif line.startswith("A "):
        # Annotation line: fields are separated by '|||'; the first field
        # holds "<char_start> <char_end>".
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets: a character offset's token index is the
        # number of tokens in the sentence prefix before that offset.
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        postfix = sentence[:char_end]
        tok_end = len(tokenizer.tokenize(postfix))
        start_end = str(tok_start) + " " + str(tok_end)
        fields[0] = start_end
        # tokenize corrections, remove trailing whitespace; alternative
        # corrections are separated by '||'.
        corrections = [(' '.join(tokenizer.tokenize(c))).strip()
                       for c in fields[2].split('||')]
        fields[2] = '||'.join(corrections)
        # NOTE(review): the rewritten annotation line is never printed in
        # this span -- the emitting code is presumably outside this chunk.
import re
import os
from m2util import *
from Tokenizer import PTBTokenizer

# The script takes no command-line arguments; all input arrives on stdin.
# NOTE(review): `re` and `os` are unused within this span, and `sys` is
# never imported here -- presumably it comes via `from m2util import *`;
# confirm against m2util.
assert len(sys.argv) == 1

# main
# Loop over sentences cum annotation: "S " lines carry the raw sentence,
# "A " lines carry one annotation whose character offsets must be remapped
# to token offsets after PTB tokenization.
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()  # Python 2: bytes -> unicode
    if line.startswith("S "):
        # Sentence line: remember the raw text (following "A " lines
        # refer to it) and emit its tokenized form.
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
    elif line.startswith("A "):
        # Annotation line: fields are separated by '|||'; the first field
        # holds "<char_start> <char_end>".
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets: a character offset's token index is the
        # number of tokens in the sentence prefix before that offset.
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        postfix = sentence[:char_end]
        tok_end = len(tokenizer.tokenize(postfix))
        start_end = str(tok_start) + " " + str(tok_end)
        fields[0] = start_end
        # tokenize corrections, remove trailing whitespace; alternative
        # corrections are separated by '||'.
        corrections = [(' '.join(tokenizer.tokenize(c))).strip()
                       for c in fields[2].split('||')]
        # NOTE(review): `corrections` is not used further in this span --
        # writing it back into `fields` and printing the line presumably
        # happens outside this chunk.
# Escaping helper for the XML emitter below; xml.sax is already required
# by the parsing step in the __main__ block.
from xml.sax.saxutils import escape


def xml_escape(text):
    """Entity-escape `&`, `<`, `>` and `"` in *text* for XML output.

    Bug fix: the original printed raw sentence/term text straight into the
    XML, so any input containing a markup character produced a document
    that is not well-formed.  Double quotes are escaped as well because
    several values are emitted inside double-quoted attributes.
    """
    return escape(text, {'"': '&quot;'})


def token_offset(sentence, offset):
    """Return the token index corresponding to character *offset*.

    Counts the tokens in the sentence prefix up to *offset*.  Relies on a
    module-level `tokenizer` (a PTBTokenizer) defined elsewhere in the
    file.
    """
    return len(tokenizer.tokenize(sentence[:offset], ptbTokenization=True))


if __name__ == "__main__":
    import sys

    # parse xml: the handler collects sentences, aspect terms and aspect
    # categories keyed by sentence id.
    handler = AnnotationHandler()
    xml.sax.parse(sys.stdin, handler)

    # print header.  Parenthesized single-argument print is identical
    # under the Python 2 print statement (and valid Python 3).
    print('''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>''')

    # convert to token level offsets and output, escaping XML special
    # characters in every interpolated value.  `sent_id` avoids shadowing
    # the `id` builtin.
    for sent_id, sentence in sorted(handler.sentences.items()):
        print(' <sentence id="%d">' % sent_id)
        print(' <text>%s</text>' % xml_escape(' '.join(tokenizer.tokenize(sentence))))
        print(' <aspectTerms>')
        for item in handler.aspect_terms[sent_id]:
            start_token = token_offset(sentence, item.start)
            end_token = token_offset(sentence, item.end)
            print(' <aspectTerm term="%s" polarity="%s" from="%d" to="%d"/>' % (
                xml_escape(' '.join(tokenizer.tokenize(item.term, ptbTokenization=True))),
                xml_escape(item.polarity), start_token, end_token))
        print(' </aspectTerms>')
        print(' <aspectCategories>')
        for item in handler.aspect_categories[sent_id]:
            print(' <aspectCategory category="%s" polarity="%s"/>' % (
                xml_escape(item.category), xml_escape(item.polarity)))
        print(' </aspectCategories>')
        print(' </sentence>')
    print('</sentences>')