예제 #1
0
    def createWordLookup(self, foreignSentence):
        """Tokenize a sentence and store one lookup dict per token.

        The sentence is split on runs of non-word characters. Because the
        split pattern uses a capturing group, the separator runs (spaces,
        punctuation) are kept as tokens too, interleaved with the words in
        their original order. A sentence that starts with a separator
        therefore yields a leading empty-string token.

        Args:
            foreignSentence: The sentence to tokenize, either UTF-8 encoded
                bytes or already-decoded text.

        Returns:
            None. The result is stored on ``self.tokenDictList`` as a list of
            dicts, one per token, with the keys:

            * ``'originalToken'``: the token text.
            * ``'spanish_POS'``: the entry for the token in
              ``Corpus().spanishTags()``, or ``None`` when there is none.
            * ``'upper'``: ``True`` when the token's first character is
              uppercase, ``False`` otherwise (including for empty tokens).
        """
        corpus = Corpus()

        # Decode only when handed raw bytes; text that is already decoded
        # is used as-is instead of raising a TypeError.
        if isinstance(foreignSentence, bytes):
            text = foreignSentence.decode('utf-8')
        else:
            text = foreignSentence

        tokens = re.compile(r'(\W+)', re.UNICODE).split(text)

        # re.split leaves a trailing '' only when the sentence ends with a
        # separator. Drop just that artifact; popping unconditionally would
        # discard the last word of a sentence that ends in a word.
        if not tokens[-1]:
            tokens.pop()

        # Fetch the tag mapping once rather than once per token.
        # NOTE(review): assumes spanishTags() returns the same mapping on
        # every call -- confirm against Corpus.
        spanishTags = corpus.spanishTags()

        self.tokenDictList = [
            {
                'originalToken': token,
                'spanish_POS': spanishTags.get(token, None),
                # Slicing keeps this safe for empty tokens:
                # ''.isupper() is False.
                'upper': token[:1].isupper(),
            }
            for token in tokens
        ]