OUTPUT Output xml file with tokenized text """
# NOTE(review): the line above is the tail of a module docstring whose opening
# triple-quote lies before this excerpt.  Python 2 source (see sibling scripts
# in this file using print statements).

import xml.sax
import sys
from collections import namedtuple, defaultdict
from Tokenizer import PTBTokenizer

# data types for annotation
# aspect_term: one annotated term with its sentiment polarity and its
# character-level start/end offsets in the sentence text.
aspect_term = namedtuple("aspect_term", "term polarity start end")
# aspect_category: a coarse-grained category label with its polarity
# (no character offsets -- categories are sentence-level).
aspect_category = namedtuple("aspect_category", "category polarity")

# create Penn Treebank tokenizer (project-local wrapper; shared by all handlers)
tokenizer = PTBTokenizer()


class AnnotationHandler(xml.sax.ContentHandler):
    """SAX handler that collects sentences and their aspect annotations.

    Accumulates, keyed by the integer sentence id:
      - sentences:          id -> sentence text
      - aspect_terms:       id -> list of aspect_term tuples
      - aspect_categories:  id -> list of aspect_category tuples
    """

    def __init__(self):
        # id -> raw sentence text
        self.sentences = {}
        # id -> [aspect_term, ...]; defaultdict so sentences without
        # annotations need no special-casing
        self.aspect_terms = defaultdict(list)
        # id -> [aspect_category, ...]
        self.aspect_categories = defaultdict(list)
        # character buffer for the <text> element currently being read
        self.text = ""

    def startElement(self, name, attrs):
        if name == "sentence":
            # remember the current sentence id; subsequent child elements
            # attach their data to this id
            self.id = int(attrs['id'])
        elif name == "text":
            # reset the buffer; characters() presumably appends to it
            # (handler continues beyond this excerpt -- TODO confirm)
            self.text = ""
        elif name == "aspectTerm":
            # NOTE(review): fragment truncated here -- aspectTerm attribute
            # extraction continues beyond this excerpt.
# convert character to token offsets, tokenize sentence # # usage: %prog < input > output # import sys import re import os from m2util import * from Tokenizer import PTBTokenizer assert len(sys.argv) == 1 # main # loop over sentences cum annotation tokenizer = PTBTokenizer() sentence = '' for line in sys.stdin: line = line.decode("utf8").strip() if line.startswith("S "): sentence = line[2:] sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence)) print sentence_tok.encode("utf8") elif line.startswith("A "): fields = line[2:].split('|||') start_end = fields[0] char_start, char_end = [int(a) for a in start_end.split()] # calculate token offsets prefix = sentence[:char_start] tok_start = len(tokenizer.tokenize(prefix)) postfix = sentence[:char_end]
"""
# NOTE(review): the line above closes a module docstring whose opening
# triple-quote lies before this excerpt.  Python 2 source.

import xml.sax
import sys
from collections import namedtuple, defaultdict
from Tokenizer import PTBTokenizer

# data types for annotation
# aspect_term: one annotated term with polarity and character-level
# start/end offsets into the sentence text.
aspect_term = namedtuple("aspect_term", "term polarity start end")
# aspect_category: sentence-level category label with polarity (no offsets).
aspect_category = namedtuple("aspect_category", "category polarity")

# create Penn Treebank tokenizer (project-local wrapper)
tokenizer = PTBTokenizer()


class AnnotationHandler(xml.sax.ContentHandler):
    """SAX handler collecting sentences and aspect annotations by sentence id.

    Populates:
      - sentences:          id -> sentence text
      - aspect_terms:       id -> list of aspect_term tuples
      - aspect_categories:  id -> list of aspect_category tuples
    """

    def __init__(self):
        # id -> raw sentence text
        self.sentences = {}
        # id -> [aspect_term, ...]; defaultdict avoids key-existence checks
        self.aspect_terms = defaultdict(list)
        # id -> [aspect_category, ...]
        self.aspect_categories = defaultdict(list)
        # buffer for the <text> element currently being parsed
        self.text = ""

    def startElement(self, name, attrs):
        if name == "sentence":
            # all child elements attach to this id until the next <sentence>
            self.id = int(attrs['id'])
        elif name == "text":
            # reset buffer; characters() presumably fills it -- handler
            # continues beyond this excerpt, TODO confirm
            self.text = ""
        elif name == "aspectTerm":
            # NOTE(review): fragment truncated here -- aspectTerm attribute
            # extraction continues beyond this excerpt.
# import sys import re import os from util import * from Tokenizer import PTBTokenizer assert len(sys.argv) == 1 # main # loop over sentences cum annotation tokenizer = PTBTokenizer() sentence = '' for line in sys.stdin: line = line.decode("utf8").strip() if line.startswith("S "): sentence = line[2:] sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence)) print sentence_tok.encode("utf8") elif line.startswith("A "): fields = line[2:].split('|||') start_end = fields[0] char_start, char_end = [int(a) for a in start_end.split()] # calculate token offsets prefix = sentence[:char_start] tok_start = len(tokenizer.tokenize(prefix)) postfix = sentence[:char_end]
# # convert source xml file and gold annotation to # merged file with sentence-per-line sentences and # annotation. # # usage : %prog [-p] source.xml [gold.xml] > output from Tokenizer import PTBTokenizer import xml.dom.minidom import sys import re import getopt from util import fix_cp1252codes ## global variables tokenizer = PTBTokenizer() def slice_paragraph(text): yield (0, len(text), text) def slice_tokenize(text): import nltk sentence_spliter = nltk.data.load('tokenizers/punkt/english.pickle') last_break = 0 for match in sentence_spliter._lang_vars.period_context_re().finditer( text): context = match.group() + match.group('after_tok') if sentence_spliter.text_contains_sentbreak(context): yield (last_break, match.end(), text[last_break:match.end()])