def __init__(self, unit=UNIT_WORD, **property_names):
    """
    Creates a SemcorTokenizer.

    @param unit: one of 'word', 'sentence' or 'paragraph'; indicating
        the level of hierarchy to be processed.
    @type unit: C{String}
    @param property_names: keyword arguments passed through unchanged
        to C{AbstractTokenizer.__init__}.
    @raise ValueError: if C{unit} is not one of
        C{SemcorTokenizer.UNIT_WORD}, C{SemcorTokenizer.UNIT_SENTENCE}
        or C{SemcorTokenizer.UNIT_PARAGRAPH}.
    """
    valid_units = (
        SemcorTokenizer.UNIT_WORD,
        SemcorTokenizer.UNIT_SENTENCE,
        SemcorTokenizer.UNIT_PARAGRAPH,
    )
    # Validate explicitly instead of with ``assert``: assertions are
    # removed when Python runs with -O, which would let an unsupported
    # unit through unnoticed.
    if unit not in valid_units:
        raise ValueError(
            "unit must be one of %r, got %r" % (valid_units, unit)
        )
    self._unit = unit

    # The input is parsed with the SGML parser; if it were valid XML,
    # xml.dom.minidom.parseString could be used here instead.
    self._parse_method = _parseSGMLString

    AbstractTokenizer.__init__(self, **property_names)
def __init__(self, buffer_size=1024, **property_names):
    """
    Sets up the SAX content handler state and the tokenizer base.

    @param buffer_size: value stored on the instance as
        C{self._buffer_size} (presumably a read/chunk size used while
        parsing; confirm against the methods that consume it).
    @type buffer_size: C{int}
    @param property_names: keyword arguments passed through unchanged
        to C{AbstractTokenizer.__init__}.
    """
    # Initialise the SAX handler base first so its own attributes exist.
    xml.sax.ContentHandler.__init__(self)

    # Plain instance attributes; both are set before reset() is called.
    self._buffer_size, self._lemma = buffer_size, ''

    # reset() is defined elsewhere on this class.
    self.reset()

    # The tokenizer base is initialised last, as in the original.
    AbstractTokenizer.__init__(self, **property_names)
def __init__(self, **property_names):
    """
    Creates the tokenizer by delegating to the AbstractTokenizer base.

    @param property_names: keyword arguments passed through unchanged
        to C{AbstractTokenizer.__init__}.
    """
    # No state of its own is set here; everything is handled by the base.
    AbstractTokenizer.__init__(self, **property_names)