Example #1
 OUTPUT      Output xml file with tokenized text
"""

import xml.sax
import sys
from collections import namedtuple, defaultdict

from Tokenizer import PTBTokenizer

# data types for annotation
aspect_term = namedtuple("aspect_term", "term polarity start end")
aspect_category = namedtuple("aspect_category", "category polarity")
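# e.g. aspect_term("battery life", "positive", 74, 86): an aspect term with its
# polarity and character span; the values here are illustrative only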

# create Penn Treebank tokenizer
tokenizer = PTBTokenizer()


class AnnotationHandler(xml.sax.ContentHandler):
    def __init__(self):
        self.sentences = {}
        self.aspect_terms = defaultdict(list)
        self.aspect_categories = defaultdict(list)
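        # sentences maps sentence id -> raw text; the two defaultdicts map
        # sentence id -> list of aspect_term / aspect_category records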
        self.text = ""

    def startElement(self, name, attrs):
        if name == "sentence":
            self.id = int(attrs['id'])
        elif name == "text":
            self.text = ""
        elif name == "aspectTerm":
            # assumed completion (the listing is truncated here): SemEval-style
            # aspectTerm attributes 'term', 'polarity', 'from', 'to'
            self.aspect_terms[self.id].append(
                aspect_term(attrs['term'], attrs['polarity'],
                            int(attrs['from']), int(attrs['to'])))
# convert character to token offsets, tokenize sentence
#
# usage: %prog  < input > output
#

import sys
import re
import os
from m2util import *
from Tokenizer import PTBTokenizer

assert len(sys.argv) == 1

# main
# loop over sentences and their annotations
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()
    if line.startswith("S "):
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
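        # e.g. a typical PTB tokenizer turns "S I can't wait." into
        # "S I ca n't wait ." (illustrative; exact output depends on PTBTokenizer)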
    elif line.startswith("A "):
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets: the token index of a character offset is
        # the number of tokens in the text preceding it
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        prefix_to_end = sentence[:char_end]
        tok_end = len(tokenizer.tokenize(prefix_to_end))
        # assumed completion (the listing is truncated here): re-emit the
        # annotation line with token offsets in place of character offsets
        fields[0] = "%d %d" % (tok_start, tok_end)
        annotation_tok = "A " + '|||'.join(fields)
        print annotation_tok.encode("utf8")
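        # illustrative check of the conversion above, assuming PTB-style
        # tokenization: for sentence "The food was great." and character
        # span (4, 8) covering "food", tokenize("The ") gives 1 token and
        # tokenize("The food") gives 2, so the token span becomes (1, 2)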
"""

import xml.sax
import sys
from collections import namedtuple, defaultdict

from Tokenizer import PTBTokenizer


# data types for annotation 
aspect_term = namedtuple("aspect_term", "term polarity start end")
aspect_category = namedtuple("aspect_category", "category polarity")


# create Penn Treebank tokenizer
tokenizer = PTBTokenizer()


class AnnotationHandler(xml.sax.ContentHandler):
    def __init__(self):
	self.sentences = {}
	self.aspect_terms = defaultdict(list)
	self.aspect_categories = defaultdict(list)
        self.text = ""

    def startElement(self, name, attrs):
	if name == "sentence":
	    self.id = int(attrs['id'])
	elif name == "text":
            self.text = ""
	elif name == "aspectTerm":
#


import sys
import re
import os
from util import *
from Tokenizer import PTBTokenizer


assert len(sys.argv) == 1


# main
# loop over sentences cum annotation
tokenizer = PTBTokenizer()
sentence = ''
for line in sys.stdin:
    line = line.decode("utf8").strip()
    if line.startswith("S "):
        sentence = line[2:]
        sentence_tok = "S " + ' '.join(tokenizer.tokenize(sentence))
        print sentence_tok.encode("utf8")
    elif line.startswith("A "):
        fields = line[2:].split('|||')
        start_end = fields[0]
        char_start, char_end = [int(a) for a in start_end.split()]
        # calculate token offsets
        prefix = sentence[:char_start]
        tok_start = len(tokenizer.tokenize(prefix))
        postfix = sentence[:char_end]
#
# convert source xml file and gold annotation to
# merged file with sentence-per-line sentences and
# annotation.
#
# usage: %prog [-p] source.xml [gold.xml] > output

from Tokenizer import PTBTokenizer
import xml.dom.minidom
import sys
import re
import getopt
from util import fix_cp1252codes

## global variables
tokenizer = PTBTokenizer()


def slice_paragraph(text):
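    # treat the whole text as a single slice: (start offset, end offset, text)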
    yield (0, len(text), text)


def slice_tokenize(text):
    import nltk
    sentence_spliter = nltk.data.load('tokenizers/punkt/english.pickle')
    last_break = 0
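    # scan candidate sentence boundaries with punkt's period-context regex
    # (a private nltk API) and yield a (start, end, sentence) span whenever
    # punkt confirms a real sentence break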
    for match in sentence_spliter._lang_vars.period_context_re().finditer(
            text):
        context = match.group() + match.group('after_tok')
        if sentence_spliter.text_contains_sentbreak(context):
            yield (last_break, match.end(), text[last_break:match.end()])