def __init__(self, lang='nl', tokens_glue=None, tokens_aligner=None):
    """Initialize the tokenizer.

    :param lang: language code passed through to the base ``Tokenizer``.
    :param tokens_glue: optional glue object used when joining tokens.
    :param tokens_aligner: aligner used to map tokens back to source
        offsets; when ``None`` a fresh default aligner is built.
    """
    # Build the default aligner per-instance: a default-argument value
    # (`tokens_aligner=fTokensAligner(...)`) is evaluated once at function
    # definition time and would be shared by every instance (mutable
    # default argument pitfall).
    if tokens_aligner is None:
        tokens_aligner = fTokensAligner(fTokensAligner.fRegexpTokenFinder())
    Tokenizer.__init__(self, lang)
    # Raw string so `\s` is passed to the regex engine verbatim
    # (an invalid escape in a plain literal on modern Python).
    # Matches "token/TAG" pairs, e.g. "huis/N"; re.U for Unicode classes.
    self.text_parse_re = re.compile(r'([^/\s]+/[A-Z0]+)', re.U)
    self.tokens_aligner = tokens_aligner
    self.glue = tokens_glue
def _tokenize(self, text):
    """Tokenize *text* via the base class, then locate each token's
    character offset and length within the original string.

    :param text: raw input text.
    :return: list of tokens; for each token ``t`` the fields ``t[1]``
        (start offset) and ``t[2]`` (length) are filled in when the
        token could be located.
    """
    tokens = super(NER, self)._tokenize(text)
    finder = fTokensAligner.fRegexpTokenFinder()
    pos = 0  # search cursor: tokens are located left-to-right, never re-scanning
    for t in tokens:
        r = finder(text, t[0], pos)
        if r:
            t[1], t[2] = r[1], r[2]  # start offset, length
            pos = r[1] + r[2]        # resume search just past this token
        else:
            # Deliberately best-effort: warn instead of aborting the whole
            # tokenization. Parenthesized print is valid on both Python 2
            # and Python 3 (the bare print statement is Python-2-only).
            print("Entity '%s' location cannot be identified properly" % t[0])
    return tokens
def _tokenize(self, text):
    """Tokenize *text* via the base class, then locate each token's
    character offset and length within the original string.

    :param text: raw input text.
    :return: list of tokens; for each token ``t`` the fields ``t[1]``
        (start offset) and ``t[2]`` (length) are filled in when the
        token could be located.
    """
    tokens = super(NER, self)._tokenize(text)
    finder = fTokensAligner.fRegexpTokenFinder()
    pos = 0  # search cursor: tokens are located left-to-right, never re-scanning
    for t in tokens:
        r = finder(text, t[0], pos)
        if r:
            t[1], t[2] = r[1], r[2]  # start offset, length
            pos = r[1] + r[2]        # resume search just past this token
        else:
            # Deliberately best-effort: warn instead of aborting the whole
            # tokenization. Parenthesized print is valid on both Python 2
            # and Python 3 (the bare print statement is Python-2-only).
            print("Entity '%s' location cannot be identified properly" % t[0])
    return tokens
def __init__(self, lang="nl"):
    """Initialize the Dutch TNT tagger wrapper.

    :param lang: language code; only ``'nl'`` is supported.
    :raises ValueError: if *lang* is not ``'nl'``.
    """
    # Validate with an explicit raise rather than `assert`: asserts are
    # stripped when Python runs with -O, silently accepting bad input.
    if lang != 'nl':
        raise ValueError("only Dutch ('nl') is supported, got %r" % (lang,))
    TNTExternalTool.__init__(
        self, lang, None,
        fTokensAligner(fTokensAligner.fRegexpTokenFinder()))
    # Map coarse Dutch (CGN-style) tags emitted by TNT onto the internal
    # Token POS constants. VG (conjunction) and TSW (interjection) have no
    # dedicated constant and fall back to POS_UNKNOWN.
    self.tags_map = {
        'LET': Token.POS_PUNCT,    # punctuation
        'N':   Token.POS_NOUN,     # noun
        'ADJ': Token.POS_ADJ,      # adjective
        'WW':  Token.POS_VERB,     # verb
        'TW':  Token.POS_NUM,      # numeral
        'VNW': Token.POS_PRONOUN,  # pronoun
        'VZ':  Token.POS_PREP,     # preposition
        'BW':  Token.POS_ADVERB,   # adverb
        'LID': Token.POS_ART,      # article
        'VG':  Token.POS_UNKNOWN,  # conjunction (no dedicated constant)
        'TSW': Token.POS_UNKNOWN,  # interjection (no dedicated constant)
    }
def __init__(self, lang='nl', tokens_glue=None, tokens_aligner=None):
    """Initialize the tokenizer.

    :param lang: language code passed through to the base ``Tokenizer``.
    :param tokens_glue: optional glue object used when joining tokens.
    :param tokens_aligner: aligner used to map tokens back to source
        offsets; when ``None`` a fresh default aligner is built.
    """
    # Build the default aligner per-instance: a default-argument value
    # (`tokens_aligner=fTokensAligner(...)`) is evaluated once at function
    # definition time and would be shared by every instance (mutable
    # default argument pitfall).
    if tokens_aligner is None:
        tokens_aligner = fTokensAligner(fTokensAligner.fRegexpTokenFinder())
    Tokenizer.__init__(self, lang)
    # Raw string so `\s` is passed to the regex engine verbatim
    # (an invalid escape in a plain literal on modern Python).
    # Matches "token/TAG" pairs, e.g. "huis/N"; re.U for Unicode classes.
    self.text_parse_re = re.compile(r'([^/\s]+/[A-Z0]+)', re.U)
    self.tokens_aligner = tokens_aligner
    self.glue = tokens_glue