Example #1
0
 def __init__(self,
              lang='nl',
              tokens_glue=None,
              tokens_aligner=None):
     """Initialize the tokenizer.

     Args:
         lang: language code handed to the ``Tokenizer`` base class.
         tokens_glue: optional glue object stored as ``self.glue``
             (semantics defined by callers; may be None).
         tokens_aligner: aligner used to map tokens back onto the
             original text.  Defaults to a fresh
             ``fTokensAligner(fTokensAligner.fRegexpTokenFinder())``.
     """
     # Build the default aligner per call instead of using a mutable
     # default argument: the old `tokens_aligner=fTokensAligner(...)`
     # default was constructed once at definition time and shared by
     # every instance relying on the default.
     if tokens_aligner is None:
         tokens_aligner = fTokensAligner(fTokensAligner.fRegexpTokenFinder())
     assert tokens_aligner
     Tokenizer.__init__(self, lang)
     # Raw string: '\s' is an invalid escape in a plain string literal
     # (SyntaxWarning on modern CPython); the pattern itself is unchanged.
     # Matches "word/TAG" pairs, e.g. "huis/N".
     self.text_parse_re = re.compile(r'([^/\s]+/[A-Z0]+)', re.U)
     self.tokens_aligner = tokens_aligner
     self.glue = tokens_glue
Example #2
0
    def test_token_aligner(self):
        """Aligning pre-tokenized offsets back onto the raw (noisy) text.

        `otext` is the original text with extra whitespace; `ptext` is the
        cleaned text the token offsets refer to.  After alignment every
        token's [start, length] must point into `otext` instead.
        """
        otext = "  Amsterdam is capital of The  \t Netherlands. Den  \n,Haag is just a Dutch city."
        ptext = "Amsterdam is capital of The Netherlands. Den Haag is just a Dutch city."

        # Tokens are [text, start, length, flag] with offsets into ptext.
        tokens = (["Amsterdam", 0, 9, 0],
                  ["The Netherlands", 24, 15, 0],
                  ["Den Haag", 41, 8, 0],
                  ["Dutch", 60, 5, 0])

        # Sanity check: each token's slice of ptext is its own text.
        for word, start, length, _flag in tokens:
            self.assertEqual(ptext[start:start + length], word)

        # The aligner rewrites the offsets in place to point into otext.
        fTokensAligner()(otext, ptext, tokens)

        self.assertEqual(tokens[0][1], 2)
        self.assertEqual(tokens[0][2], 9)
        self.assertEqual(tokens[1][1], 26)
        self.assertEqual(tokens[1][2], 18)
        self.assertEqual(tokens[2][1], 46)
        self.assertEqual(tokens[2][2], 11)
        self.assertEqual(tokens[3][1], 68)
        self.assertEqual(tokens[3][2], 5)
Example #3
0
    def test_token_aligner(self):
        """Check that fTokensAligner maps ptext offsets onto otext.

        The original text (`otext`) contains leading/embedded whitespace
        that the processed text (`ptext`) does not; token [start, length]
        pairs initially index into ptext and must index into otext after
        the aligner has run.
        """
        otext = "  Amsterdam is capital of The  \t Netherlands. Den  \n,Haag is just a Dutch city."
        ptext = "Amsterdam is capital of The Netherlands. Den Haag is just a Dutch city."

        # [text, start, length, flag] tuples, offsets relative to ptext.
        tokens = (["Amsterdam", 0, 9, 0],
                  ["The Netherlands", 24, 15, 0],
                  ["Den Haag", 41, 8, 0],
                  ["Dutch", 60, 5, 0])
        for entry in tokens:
            self.assertEqual(ptext[entry[1]:entry[1] + entry[2]], entry[0])

        fTokensAligner()(otext, ptext, tokens)

        # Expected (start, length) of each token within otext.
        expected = ((2, 9), (26, 18), (46, 11), (68, 5))
        for entry, (start, length) in zip(tokens, expected):
            self.assertEqual(entry[1], start)
            self.assertEqual(entry[2], length)
Example #4
0
 def __init__(self, lang="nl"):
     """Initialize the Dutch TNT-based tagger.

     Args:
         lang: language code; only 'nl' is supported by this tool.
     """
     assert lang == 'nl'
     aligner = fTokensAligner(fTokensAligner.fRegexpTokenFinder())
     TNTExternalTool.__init__(self, lang, None, aligner)
     # Map the tool's Dutch tag set onto the generic Token POS constants.
     self.tags_map = {
         'LET': Token.POS_PUNCT,
         'N': Token.POS_NOUN,
         'ADJ': Token.POS_ADJ,
         'WW': Token.POS_VERB,
         'TW': Token.POS_NUM,
         'VNW': Token.POS_PRONOUN,
         'VZ': Token.POS_PREP,
         'BW': Token.POS_ADVERB,
         'LID': Token.POS_ART,
         'VG': Token.POS_UNKNOWN,
         'TSW': Token.POS_UNKNOWN,
     }
Example #5
0
 def __init__(self, lang='nl', tokens_glue=None, tokens_aligner=None):
     """Initialize the tokenizer.

     Args:
         lang: language code handed to the ``Tokenizer`` base class.
         tokens_glue: optional glue object stored as ``self.glue``
             (semantics defined by callers; may be None).
         tokens_aligner: aligner used to map tokens back onto the
             original text.  Defaults to a fresh
             ``fTokensAligner(fTokensAligner.fRegexpTokenFinder())``.
     """
     # Construct the default aligner at call time: the previous
     # `tokens_aligner=fTokensAligner(...)` default was evaluated once
     # at definition time and shared by every instance using the default.
     if tokens_aligner is None:
         tokens_aligner = fTokensAligner(fTokensAligner.fRegexpTokenFinder())
     assert tokens_aligner
     Tokenizer.__init__(self, lang)
     # Raw string: '\s' is an invalid escape in a plain string literal
     # (SyntaxWarning on modern CPython); the compiled pattern itself is
     # unchanged.  Matches "word/TAG" pairs, e.g. "huis/N".
     self.text_parse_re = re.compile(r'([^/\s]+/[A-Z0]+)', re.U)
     self.tokens_aligner = tokens_aligner
     self.glue = tokens_glue