if __name__ == "__main__":
    import sys

    # Parse the annotated XML arriving on stdin into the handler's
    # per-sentence dictionaries.
    handler = AnnotationHandler()
    xml.sax.parse(sys.stdin, handler)

    # Document header.  A parenthesized single-argument print behaves
    # identically under the Python 2 print statement.
    print('''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>''')

    # Walk sentences in id order, converting character-level offsets to
    # token-level offsets, and emit the re-tokenized document.
    for sid, text in sorted(handler.sentences.items()):
        print(' <sentence id="%d">' % sid)
        print(' <text>%s</text>' % ' '.join(tokenizer.tokenize(text)))
        print(' <aspectTerms>')
        for aspect in handler.aspect_terms[sid]:
            tok_from = token_offset(text, aspect.start)
            tok_to = token_offset(text, aspect.end)
            attrs = (' '.join(tokenizer.tokenize(aspect.term, ptbTokenization=True)),
                     aspect.polarity, tok_from, tok_to)
            print(' <aspectTerm term="%s" polarity="%s" from="%d" to="%d"/>' % attrs)
        print(' </aspectTerms>')
        print(' <aspectCategories>')
        for aspect in handler.aspect_categories[sid]:
            print(' <aspectCategory category="%s" polarity="%s"/>' % (aspect.category, aspect.polarity))
        print(' </aspectCategories>')
        print(' </sentence>')
    print('</sentences>')
from util import *
from Tokenizer import PTBTokenizer

# The script takes no command-line arguments; all input arrives on stdin.
# NOTE(review): `sys` is never imported in this span -- presumably it is
# re-exported by `from util import *`; confirm against util.
assert len(sys.argv) == 1

# main
# Loop over sentences cum annotation: "S " lines carry the raw sentence,
# "A " lines carry one annotation whose character offsets must be remapped
# to token offsets after PTB tokenization.
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()  # Python 2: bytes -> unicode
    if line.startswith("S "):
        # Sentence line: remember the raw text (following "A " lines
        # refer to it) and emit its tokenized form.
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
    elif line.startswith("A "):
        # Annotation line: fields are separated by '|||'; the first field
        # holds "<char_start> <char_end>".
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets: a character offset's token index is the
        # number of tokens in the sentence prefix before that offset.
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        postfix = sentence[:char_end]
        tok_end = len(tokenizer.tokenize(postfix))
        start_end = str(tok_start) + " " + str(tok_end)
        fields[0] = start_end
        # tokenize corrections, remove trailing whitespace; alternative
        # corrections are separated by '||'.
        corrections = [(' '.join(tokenizer.tokenize(c))).strip()
                       for c in fields[2].split('||')]
        fields[2] = '||'.join(corrections)
        # NOTE(review): the rewritten annotation line is never printed in
        # this span -- the emitting code is presumably outside this chunk.
import re
import os
from m2util import *
from Tokenizer import PTBTokenizer

# The script takes no command-line arguments; all input arrives on stdin.
# NOTE(review): `re` and `os` are unused within this span, and `sys` is
# never imported here -- presumably it comes via `from m2util import *`;
# confirm against m2util.
assert len(sys.argv) == 1

# main
# Loop over sentences cum annotation: "S " lines carry the raw sentence,
# "A " lines carry one annotation whose character offsets must be remapped
# to token offsets after PTB tokenization.
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()  # Python 2: bytes -> unicode
    if line.startswith("S "):
        # Sentence line: remember the raw text (following "A " lines
        # refer to it) and emit its tokenized form.
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
    elif line.startswith("A "):
        # Annotation line: fields are separated by '|||'; the first field
        # holds "<char_start> <char_end>".
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets: a character offset's token index is the
        # number of tokens in the sentence prefix before that offset.
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        postfix = sentence[:char_end]
        tok_end = len(tokenizer.tokenize(postfix))
        start_end = str(tok_start) + " " + str(tok_end)
        fields[0] = start_end
        # tokenize corrections, remove trailing whitespace; alternative
        # corrections are separated by '||'.
        corrections = [(' '.join(tokenizer.tokenize(c))).strip()
                       for c in fields[2].split('||')]
        # NOTE(review): `corrections` is not used further in this span --
        # writing it back into `fields` and printing the line presumably
        # happens outside this chunk.
# Escaping helper for the XML emitter below; xml.sax is already required
# by the parsing step in the __main__ block.
from xml.sax.saxutils import escape


def xml_escape(text):
    """Entity-escape `&`, `<`, `>` and `"` in *text* for XML output.

    Bug fix: the original printed raw sentence/term text straight into the
    XML, so any input containing a markup character produced a document
    that is not well-formed.  Double quotes are escaped as well because
    several values are emitted inside double-quoted attributes.
    """
    return escape(text, {'"': '&quot;'})


def token_offset(sentence, offset):
    """Return the token index corresponding to character *offset*.

    Counts the tokens in the sentence prefix up to *offset*.  Relies on a
    module-level `tokenizer` (a PTBTokenizer) defined elsewhere in the
    file.
    """
    return len(tokenizer.tokenize(sentence[:offset], ptbTokenization=True))


if __name__ == "__main__":
    import sys

    # parse xml: the handler collects sentences, aspect terms and aspect
    # categories keyed by sentence id.
    handler = AnnotationHandler()
    xml.sax.parse(sys.stdin, handler)

    # print header.  Parenthesized single-argument print is identical
    # under the Python 2 print statement (and valid Python 3).
    print('''<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<sentences>''')

    # convert to token level offsets and output, escaping XML special
    # characters in every interpolated value.  `sent_id` avoids shadowing
    # the `id` builtin.
    for sent_id, sentence in sorted(handler.sentences.items()):
        print(' <sentence id="%d">' % sent_id)
        print(' <text>%s</text>' % xml_escape(' '.join(tokenizer.tokenize(sentence))))
        print(' <aspectTerms>')
        for item in handler.aspect_terms[sent_id]:
            start_token = token_offset(sentence, item.start)
            end_token = token_offset(sentence, item.end)
            print(' <aspectTerm term="%s" polarity="%s" from="%d" to="%d"/>' % (
                xml_escape(' '.join(tokenizer.tokenize(item.term, ptbTokenization=True))),
                xml_escape(item.polarity), start_token, end_token))
        print(' </aspectTerms>')
        print(' <aspectCategories>')
        for item in handler.aspect_categories[sent_id]:
            print(' <aspectCategory category="%s" polarity="%s"/>' % (
                xml_escape(item.category), xml_escape(item.polarity)))
        print(' </aspectCategories>')
        print(' </sentence>')
    print('</sentences>')