Example #1
if __name__ == "__main__":
    import sys
    import xml.sax

    # parse xml; AnnotationHandler, tokenizer, and token_offset are
    # defined elsewhere in the source module (see Example #4)
    handler = AnnotationHandler()
    xml.sax.parse(sys.stdin, handler)

    # print header
    print '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>'''

    # convert to token level offsets and output
    for id, sentence in sorted(handler.sentences.items()):
        print '    <sentence id="%d">' % id
        print '        <text>%s</text>' % ' '.join(
            tokenizer.tokenize(sentence))
        print '        <aspectTerms>'
        for item in handler.aspect_terms[id]:
            start_token = token_offset(sentence, item.start)
            end_token = token_offset(sentence, item.end)
            print '            <aspectTerm term="%s" polarity="%s" from="%d" to="%d"/>' % (
                ' '.join(tokenizer.tokenize(item.term, ptbTokenization=True)),
                item.polarity, start_token, end_token)
        print '        </aspectTerms>'
        print '        <aspectCategories>'
        for item in handler.aspect_categories[id]:
            print '            <aspectCategory category="%s" polarity="%s"/>' % (
                item.category, item.polarity)
        print '        </aspectCategories>'
        print '    </sentence>'
    print '</sentences>'
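
The offset conversion above relies on a simple invariant: the number of tokens in the character-level prefix of a sentence equals the token index at that character position. A minimal sketch of the idea, using a plain whitespace split as a stand-in for PTBTokenizer (an assumption; the real tokenizer also splits off punctuation):

def simple_token_offset(sentence, char_offset):
    # tokens fully contained in the prefix == token index at that character
    return len(sentence[:char_offset].split())

sentence = "The battery life is great ."
# the term "battery life" spans characters 4..16
print(simple_token_offset(sentence, 4))   # 1  (first token of the term)
print(simple_token_offset(sentence, 16))  # 3  (one past the last token)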
Example #2

import sys

from util import *
from Tokenizer import PTBTokenizer


assert len(sys.argv) == 1


# main
# loop over sentences and their annotation lines
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()
    if line.startswith("S "):
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
    elif line.startswith("A "):
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        postfix = sentence[:char_end]
        tok_end = len(tokenizer.tokenize(postfix))
        start_end = str(tok_start) + " " + str(tok_end)
        fields[0] = start_end
        # tokenize corrections, remove trailing whitespace
        corrections = [(' '.join(tokenizer.tokenize(c))).strip() for c in fields[2].split('||')]
        fields[2] = '||'.join(corrections)
        # emit the rewritten annotation line
        print ("A " + '|||'.join(fields)).encode("utf8")
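
To see what the loop does to a single annotation, here is a toy run under the same assumptions as the sketch above (a whitespace split in place of PTBTokenizer, and an illustrative M2-style annotation line, not taken from the source):

sentence = u"The battery life is great."
line = u"A 4 16|||Wci|||battery lifetime|||REQUIRED|||-NONE-|||0"
fields = line[2:].split('|||')
char_start, char_end = [int(a) for a in fields[0].split()]
# recompute the span as token offsets by counting tokens in each prefix
fields[0] = "%d %d" % (len(sentence[:char_start].split()),
                       len(sentence[:char_end].split()))
print("A " + '|||'.join(fields))
# -> A 1 3|||Wci|||battery lifetime|||REQUIRED|||-NONE-|||0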
Example #3

import sys
import re
import os
from m2util import *
from Tokenizer import PTBTokenizer

assert len(sys.argv) == 1

# main
# loop over sentences and their annotation lines
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()
    if line.startswith("S "):
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
    elif line.startswith("A "):
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        postfix = sentence[:char_end]
        tok_end = len(tokenizer.tokenize(postfix))
        start_end = str(tok_start) + " " + str(tok_end)
        fields[0] = start_end
        # tokenize corrections, remove trailing whitespace
        corrections = [(' '.join(tokenizer.tokenize(c))).strip()
                       for c in fields[2].split('||')]
        fields[2] = '||'.join(corrections)
        # emit the rewritten annotation line
        print ("A " + '|||'.join(fields)).encode("utf8")
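
Apart from its imports (m2util in place of util, plus re and os), this example performs the same character-to-token offset rewrite as Example #2.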
Example #4
import xml.sax

from Tokenizer import PTBTokenizer

tokenizer = PTBTokenizer()

def token_offset(sentence, offset):
    # the token offset of a character position is the number of tokens
    # in the character-level prefix of the sentence
    return len(tokenizer.tokenize(sentence[:offset], ptbTokenization=True))

if __name__ == "__main__":
    import sys

    # parse xml; AnnotationHandler is defined elsewhere in the source module
    handler = AnnotationHandler()
    xml.sax.parse(sys.stdin, handler)

    # print header
    print '''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>'''
    
    # convert to token level offsets and output
    for id, sentence in sorted(handler.sentences.items()):
        print '    <sentence id="%d">' % id
        print '        <text>%s</text>' % ' '.join(tokenizer.tokenize(sentence))
        print '        <aspectTerms>'
        for item in handler.aspect_terms[id]:
            start_token = token_offset(sentence, item.start)
            end_token = token_offset(sentence, item.end)
            print '            <aspectTerm term="%s" polarity="%s" from="%d" to="%d"/>' % (' '.join(tokenizer.tokenize(item.term, ptbTokenization=True)), item.polarity, start_token, end_token)
        print '        </aspectTerms>'
        print '        <aspectCategories>'
        for item in handler.aspect_categories[id]:
            print '            <aspectCategory category="%s" polarity="%s"/>' % (item.category, item.polarity)
        print '        </aspectCategories>'
        print '    </sentence>'
    print '</sentences>'
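
The print statements above fully determine the shape of the output; for a single hypothetical sentence with one aspect term and one category (the values here are illustrative, not from the source), the emitted XML would look like:

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>
    <sentence id="1">
        <text>The battery life is great .</text>
        <aspectTerms>
            <aspectTerm term="battery life" polarity="positive" from="1" to="3"/>
        </aspectTerms>
        <aspectCategories>
            <aspectCategory category="battery" polarity="positive"/>
        </aspectCategories>
    </sentence>
</sentences>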