def __init__(self, lang='nl', tokens_glue=None, tokens_aligner=None):
    """Initialize the tokenizer.

    :param lang: language code, passed through to the base ``Tokenizer``.
    :param tokens_glue: optional glue object stored as ``self.glue``
        (semantics defined by callers; may be ``None``).
    :param tokens_aligner: callable used to align token offsets back onto
        the original text.  Defaults to a fresh
        ``fTokensAligner(fTokensAligner.fRegexpTokenFinder())``.
    """
    # Build the default aligner per call: putting the fTokensAligner(...)
    # instance in the signature would evaluate it once at class-definition
    # time and share the same object across every instance (the classic
    # mutable-default-argument pitfall).
    if tokens_aligner is None:
        tokens_aligner = fTokensAligner(fTokensAligner.fRegexpTokenFinder())
    Tokenizer.__init__(self, lang)
    # Matches "token/TAG" pairs; raw string so "\s" is not treated as an
    # (invalid) string escape on modern Python.
    self.text_parse_re = re.compile(r'([^/\s]+/[A-Z0]+)', re.U)
    self.tokens_aligner = tokens_aligner
    self.glue = tokens_glue
def test_token_aligner(self):
    """fTokensAligner must rewrite each token's (offset, length) so that it
    addresses the token's span in the original whitespace-noisy text instead
    of the cleaned/processed text it was tokenized from."""
    # Original text: leading spaces plus a tab and a newline inserted.
    # NOTE(review): the expected offsets below imply whitespace runs in
    # otext that may have been collapsed in this copy — verify against VCS.
    otext = " Amsterdam is capital of The \t Netherlands. Den \n,Haag is just a Dutch city."
    # Cleaned text the token offsets were originally computed against.
    ptext = "Amsterdam is capital of The Netherlands. Den Haag is just a Dutch city."
    tokens = (
        ["Amsterdam", 0, 9, 0],
        ["The Netherlands", 24, 15, 0],
        ["Den Haag", 41, 8, 0],
        ["Dutch", 60, 5, 0],
    )
    # Sanity check: each (start, length) really addresses its surface form
    # in the cleaned text before alignment runs.
    for token in tokens:
        self.assertEqual(ptext[token[1]:token[1] + token[2]], token[0])
    fTokensAligner()(otext, ptext, tokens)
    # After alignment the offsets/lengths refer to otext, whitespace included.
    expected = ((2, 9), (26, 18), (46, 11), (68, 5))
    for token, (start, length) in zip(tokens, expected):
        self.assertEqual(token[1], start)
        self.assertEqual(token[2], length)
def test_token_aligner(self):
    """Check that fTokensAligner maps token spans from the processed text
    back to their positions in the raw (whitespace-noisy) original text."""
    # Raw text with extra whitespace, a tab and a newline.
    # NOTE(review): the expected offsets asserted below suggest raw_text
    # originally contained longer whitespace runs than shown here —
    # confirm the literal against the original file.
    raw_text = " Amsterdam is capital of The \t Netherlands. Den \n,Haag is just a Dutch city."
    # Processed text the tokens were produced from.
    clean_text = "Amsterdam is capital of The Netherlands. Den Haag is just a Dutch city."
    tokens = (
        ["Amsterdam", 0, 9, 0],
        ["The Netherlands", 24, 15, 0],
        ["Den Haag", 41, 8, 0],
        ["Dutch", 60, 5, 0],
    )
    # Pre-condition: offsets address the surface forms in clean_text.
    for surface, start, length, _ in tokens:
        self.assertEqual(clean_text[start:start + length], surface)
    fTokensAligner()(raw_text, clean_text, tokens)
    # Post-condition: offsets/lengths rewritten in place to refer to raw_text.
    self.assertEqual(tokens[0][1], 2)
    self.assertEqual(tokens[0][2], 9)
    self.assertEqual(tokens[1][1], 26)
    self.assertEqual(tokens[1][2], 18)
    self.assertEqual(tokens[2][1], 46)
    self.assertEqual(tokens[2][2], 11)
    self.assertEqual(tokens[3][1], 68)
    self.assertEqual(tokens[3][2], 5)
def __init__(self, lang="nl"):
    """Initialize the Dutch TNT tagger wrapper.

    :param lang: language code; only ``'nl'`` is supported.
    :raises ValueError: if *lang* is anything other than ``'nl'``.
    """
    # Raise instead of assert: asserts are stripped under ``python -O``,
    # which would silently let an unsupported language through.
    if lang != 'nl':
        raise ValueError("only Dutch ('nl') is supported, got %r" % (lang,))
    TNTExternalTool.__init__(
        self, lang, None,
        fTokensAligner(fTokensAligner.fRegexpTokenFinder()))
    # Map Dutch tagger POS tags to the internal Token.POS_* codes.
    self.tags_map = {
        'LET': Token.POS_PUNCT,
        'N':   Token.POS_NOUN,
        'ADJ': Token.POS_ADJ,
        'WW':  Token.POS_VERB,
        'TW':  Token.POS_NUM,
        'VNW': Token.POS_PRONOUN,
        'VZ':  Token.POS_PREP,
        'BW':  Token.POS_ADVERB,
        'LID': Token.POS_ART,
        'VG':  Token.POS_UNKNOWN,  # no dedicated internal POS code
        'TSW': Token.POS_UNKNOWN,  # no dedicated internal POS code
    }
def __init__(self, lang='nl', tokens_glue=None, tokens_aligner=None):
    """Initialize the tokenizer.

    :param lang: language code forwarded to ``Tokenizer.__init__``.
    :param tokens_glue: optional glue object stored on ``self.glue``.
    :param tokens_aligner: offset-alignment callable; when ``None`` a new
        ``fTokensAligner(fTokensAligner.fRegexpTokenFinder())`` is created.
    """
    Tokenizer.__init__(self, lang)
    # Raw string: "\s" is not a valid Python string escape, only a regex one.
    # Pattern captures "token/TAG" pairs from the tool's text output.
    self.text_parse_re = re.compile(r'([^/\s]+/[A-Z0]+)', re.U)
    # Create the default aligner here rather than in the signature — a
    # default-argument instance would be shared by every object of this
    # class (mutable-default-argument pitfall).
    if tokens_aligner is None:
        tokens_aligner = fTokensAligner(fTokensAligner.fRegexpTokenFinder())
    self.tokens_aligner = tokens_aligner
    self.glue = tokens_glue