def testFilterNoiseWords(self):
        """Check the string returned by ``_filterNoiseWords``.

        The input mixes words with tokens made only of punctuation
        characters; the expected result contains only the words.
        """
        strInput = u"!-?- hello how !!!! are you *-+$"
        strGt = u"hello how are you"

        f = LMPreparationFormula()
        f.setText(strInput)
        # Keep the result in its own name instead of overwriting the input
        strResult = f._filterNoiseWords()

        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(strGt, strResult)
Exemple #2
0
    def testContractionPrefixes(self):
        """Check ``prepareText`` on inputs containing a detached ``' s``.

        Each case is (input text, expected text, language id). Number
        expansion is switched off before any text is prepared.
        """
        cases = [(r"President' s", r"president's", 3),
                 (r"President' s of", r"president's of", 3)]

        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(False)

        for source, expected, langId in cases:
            formula.setLanguageId(langId)
            formula.setText(source)
            prepared = formula.prepareText()
            # Both sides are compared as utf-8 encoded values
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))
    def testNormalizeCharacters(self):
        strTest = ur"a b c \uff1b , % œ"
        strGt = ur"a b c % oe"

        f = LMPreparationFormula()
        f.setText(strTest)
        f._normalizeUtf8()
        f._normalizePunctuation(self.allPunctList)
        self.assertEquals(strGt, f.getText())
    def testNormalizePunctuationKeepInWords(self):
        """Check ``_normalizePunctuation`` when new words are kept.

        In the expected text the stand-alone ``/`` and ``-`` tokens are
        gone while ``HES-SO`` and ``AdG/LA`` are unchanged.
        """
        f = LMPreparationFormula()
        f.setKeepNewWords(True)

        # A unicode literal replaces the former u"".join(<str>) conversion
        f.setText(u"/ HES-SO und AdG/LA - auch im Winter / Sommer -")
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = "HES-SO und AdG/LA auch im Winter Sommer"
        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(gt, strResult)
Exemple #5
0
    def testNormalizeUtf8(self):
        """Check ``_normalizeUtf8`` against the UTF8MAP entries, per language.

        UTF8MAP rows are unpacked as (match, sub, comment, languageId).
        For every language id the ``match`` values are joined into the
        input text and the ``sub`` values into the expected text.
        """
        # Integer ids: the former string ids ('0', '1', '2') never
        # compared equal to int(languageId), so every list stayed empty
        # and the test only compared two empty strings.
        languages = [0, 1, 2]

        for lang in languages:
            rows = [(match, sub)
                    for match, sub, comment, languageId in UTF8MAP
                    if int(languageId) == lang]
            testList = [match for match, sub in rows]
            gtList = [sub for match, sub in rows]

            strGt = u" ".join(gtList)
            strGt = strGt.strip()
            strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

            # NOTE(review): the formula keeps its default language id, as
            # in the original test - confirm _normalizeUtf8 does not need
            # setLanguageId(lang) to be called first.
            f = LMPreparationFormula()
            f.setText(u" ".join(testList))
            f._normalizeUtf8()
            strResult = f.getText()

            # assertEqual replaces the deprecated assertEquals alias
            self.assertEqual(strGt.encode('utf-8'),
                             strResult.encode('utf-8'))
 def testExpandAbbreviations(self):
     """Check ``_expandAbbreviations`` on every ABBREVIATIONS entry.

     ABBREVIATIONS is iterated as {language id: {abbreviation: expected}}.
     """
     f = LMPreparationFormula()
     for languageId, v in ABBREVIATIONS.items():
         f.setLanguageId(languageId)
         for abbr, gt in v.items():
             # The text attribute is set and read back directly
             f.strText = abbr
             f._expandAbbreviations()
             # assertEqual replaces the deprecated assertEquals alias
             self.assertEqual(gt.encode('utf-8'), f.strText.encode('utf-8'))
    def testContractionPrefixes(self):
        testList =[(ur"President' s", ur"president's", 3),
                   (ur"President' s of", ur"president's of", 3)]

        f = LMPreparationFormula()
        f.setKeepNewWords(True)

        for t, gt, languageId in testList:
            f.setLanguageId(languageId)
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
Exemple #8
0
    def __init__(self, document, sentenceText):
        """Build a cluster around one sentence.

           param document     : stored on the instance as 'document'
           param sentenceText : passed to 'addElement'
        """
        # The class-wide counter provides the key of the new cluster
        TextCluster.ID_COUNTER += 1
        clusterKey = str(TextCluster.ID_COUNTER)

        # The language attribute starts at 0 (unknown language)
        initialAttributes = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        Cluster.__init__(self, clusterKey, initialAttributes)

        self.document = document

        # Sentence data
        self.addElement(sentenceText)

        # Formula used for LM normalization
        self.lmPreparationFormula = LMPreparationFormula()
Exemple #9
0
    def testExpandNumberInWords(self):
        """Check ``_expandNumberInWords`` through ``verifyEqual``.

        The first batch runs on a freshly built formula. Number expansion
        is then switched off and one batch is run per language id.
        """
        formula = LMPreparationFormula()

        defaultCases = [(r"A1", r"A. 1"), (r"P3B", r"P. 3 B."),
                        (r"P5B4", r"P. 5 B. 4"), (r"PPB5", r"PPB 5"),
                        (r"10jährige", r"10 jährige")]
        self.verifyEqual(defaultCases, formula, formula._expandNumberInWords)

        formula.setExpandNumberInWords(False)

        # (language id, cases), run in this order
        perLanguage = [(1, [(r"1er", r"1er")]),
                       (3, [(r"1st", r"1st")]),
                       (2, [(r"18-jähriger", r"18 -jähriger")])]

        for langId, cases in perLanguage:
            formula.setLanguageId(langId)
            self.verifyEqual(cases, formula, formula._expandNumberInWords)
    def testNormalizeUtf8(self):
        """Check ``_normalizeUtf8`` against the UTF8MAP entries, per language.

        UTF8MAP rows are unpacked as (match, sub, comment, languageId).
        For every language id the ``match`` values are joined into the
        input text and the ``sub`` values into the expected text.
        """
        # Integer ids: the former string ids ('0', '1', '2') never
        # compared equal to int(languageId), so every list stayed empty
        # and the test only compared two empty strings.
        languages = [0, 1, 2]

        for lang in languages:
            rows = [(match, sub)
                    for match, sub, comment, languageId in UTF8MAP
                    if int(languageId) == lang]
            testList = [match for match, sub in rows]
            gtList = [sub for match, sub in rows]

            strGt = u" ".join(gtList)
            strGt = strGt.strip()
            strGt = re.sub(SPACEPATTERN, u" ",
                           strGt, flags=re.UNICODE)

            # NOTE(review): the formula keeps its default language id, as
            # in the original test - confirm _normalizeUtf8 does not need
            # setLanguageId(lang) to be called first.
            f = LMPreparationFormula()
            f.setText(u" ".join(testList))
            f._normalizeUtf8()
            strResult = f.getText()

            # assertEqual replaces the deprecated assertEquals alias
            self.assertEqual(strGt.encode('utf-8'),
                             strResult.encode('utf-8'))
 def testExpandAbbreviations(self):
     """Check ``_expandAbbreviations`` on every ABBREVIATIONS entry.

     ABBREVIATIONS is iterated as {language id: {abbreviation: expected}}.
     """
     f = LMPreparationFormula()
     for languageId, v in ABBREVIATIONS.items():
         f.setLanguageId(languageId)
         for abbr, gt in v.items():
             # The text attribute is set and read back directly
             f.strText = abbr
             f._expandAbbreviations()
             # assertEqual replaces the deprecated assertEquals alias
             self.assertEqual(gt.encode('utf-8'), f.strText.encode('utf-8'))
Exemple #12
0
    def testGerman(self):
        testList = [(ur"emmaüs", ur"emmaüs"), (u"mein àrbeit", u"mein àrbeit"),
                    (ur"môchten", ur"môchten"), (ur"mädchen", ur"mädchen"),
                    (ur"meîn", ur"meîn"), (ur"meïn", ur"meïn")]

        f = LMPreparationFormula()
        f.setLanguageId(2)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
    def testExpandNumberInWords(self):
        testList = [(ur"A1", ur"A. 1"),(ur"P3B", ur"P. 3 B."), (ur"P5B4", ur"P. 5 B. 4"),
                     (ur"PPB5",ur"PPB 5")]

        f = LMPreparationFormula()
        self.verifyEqual(testList, f, f._expandNumberInWords)

        f.setKeepNewWords(False)
        testList = [(ur"1er",ur"1er")]
        f.setLanguageId(1)
        self.verifyEqual(testList, f, f._expandNumberInWords)

        testList = [(ur"1st",ur"1st")]
        f.setLanguageId(3)
        self.verifyEqual(testList, f, f._expandNumberInWords)

        testList = [(ur"18-jähriger", ur"18 -jähriger")]
        f.setLanguageId(2)
        self.verifyEqual(testList, f, f._expandNumberInWords)
Exemple #14
0
    def testFrench(self):
        testList = [(ur"à plus tard", ur"à plus tard"),
                    (ur"maîtres", ur"maîtres"), (ur"maïs", ur"maïs"),
                    (ur"emmaüs", ur"emmaüs"), (ur"mäman", ur"mäman")]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
Exemple #15
0
    def testAll(self):
        """Run ``prepareText`` with language id 1 on a list of samples.

        Each case is (input text, expected text, flag); number expansion
        is enabled for a case exactly when its flag is False.
        """
        cases = [
            ("A dix heures", "à dix heures", False),
            ("1. Election", "premièrement election", False),
            ("R1", "r. un", False),
            (r"A1", r"a. un", False),
            (r"P3B", r"p. trois b.", False),
            (r"P5B4", r"p. cinq b. quatre", False),
            (r"PPB5", r"p. p. b. cinq", False),
            (r"rte", r"route", False),
            (r"Constantin, p. l. r., président de",
             r"constantin p. l. r. président de", False),
            (r"/ HES-SO und AdG/LA - auch im Winter / Sommer -",
             r"hes-so und adg/la auch im winter sommer", True),
        ]

        formula = LMPreparationFormula()
        formula.setLanguageId(1)

        for source, expected, flag in cases:
            formula.setText(source)
            expandNumbers = not flag
            formula.setExpandNumberInWords(expandNumbers)
            prepared = formula.prepareText()
            self.assertEqual(expected, prepared)
Exemple #16
0
    def testAll(self):
        testList = [(u"A dix heures", u"à dix heures", False),
                    (u"1. Election", u"premièrement election", False),
                    (u"R1", u"r. un", False), (ur"A1", ur"a. un", False),
                    (ur"P3B", ur"p. trois b.", False),
                    (ur"P5B4", ur"p. cinq b. quatre", False),
                    (ur"PPB5", ur"p. p. b.  cinq", False),
                    (ur"rte", ur"route", False),
                    (ur"Constantin, p. l. r., président de",
                     ur"constantin p. l. r. président de", False),
                    (ur"/ HES-SO und AdG/LA - auch im Winter / Sommer -",
                     ur"hes-so und adg/la auch im winter sommer", True)]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt, knw in testList:
            f.setText(t)
            f.setKeepNewWords(knw)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
    def testAll(self):
        testList =[(u"A dix heures", u"à dix heures", False),
                   (u"1. Election",u"premièrement election", False),
                   (u"R1",u"r. un", False), (ur"A1", ur"a. un", False),(ur"P3B", ur"p. trois b.", False),
                   (ur"P5B4", ur"p. cinq b. quatre", False),
                   (ur"PPB5",ur"p. p. b.  cinq", False),
                   (ur"rte",ur"route", False),
                   (ur"Constantin, p. l. r., président de",ur"constantin p. l. r. président de", False),
                   (ur"/ HES-SO und AdG/LA - auch im Winter / Sommer -",ur"hes-so und adg/la auch im winter sommer", True)]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt, knw in testList:
            f.setText(t)
            f.setKeepNewWords(knw)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
    def testAll(self):
        testList =[(u"A dix heures", u"à dix heures"),
                   (u"1. Election",u"premièrement election"),
                   (u"R1",u"r. un"), (ur"A1", ur"a. un"),(ur"P3B", ur"p. trois b."),
                   (ur"P5B4", ur"p. cinq b. quatre"), 
                   (ur"PPB5",ur"p. p. b.  cinq"),
                   (ur"rte",ur"route"),
                   (ur"Constantin, p. l. r., président de",ur"constantin p. l. r. président de")]

        f = LMPreparationFormula()
        f.setLanguageId(1)
        
        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
    def __init__(self, document, sentenceText):
        """Build a cluster around one sentence.

           param document     : stored on the instance as 'document'
           param sentenceText : passed to 'addElement'
        """
        # The class-wide counter provides the key of the new cluster
        TextCluster.ID_COUNTER += 1
        clusterKey = str(TextCluster.ID_COUNTER)

        # The language attribute starts at 0 (unknown language)
        initialAttributes = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        Cluster.__init__(self, clusterKey, initialAttributes)

        self.document = document

        # Sentence data
        self.addElement(sentenceText)

        # Formula used for LM normalization
        self.lmPreparationFormula = LMPreparationFormula()
    def testAll(self):
        testList =[(u"A dix heures", u"à dix heures"),
                   (u"1. Election",u"premièrement election"),
                   (u"R1",u"r. un"), (ur"A1", ur"a. un"),(ur"P3B", ur"p. trois b."),
                   (ur"P5B4", ur"p. cinq b. quatre"), 
                   (ur"PPB5",ur"p. p. b.  cinq"),
                   (ur"rte",ur"route")]

        f = LMPreparationFormula()
        f.setLanguageId(1)
        
        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
    def testNormalizeUtf8(self):
        """Check ``_normalizeUtf8`` against all UTF8MAP entries at once.

        UTF8MAP rows are unpacked as (match, sub, comment, languageId).
        The ``match`` values joined by spaces form the input text; the
        ``sub`` values, joined, stripped and passed through the
        SPACEPATTERN substitution, form the expected text.
        """
        testList = [match for match, sub, comment, languageId in UTF8MAP]
        gtList = [sub for match, sub, comment, languageId in UTF8MAP]

        # strip() alone covers the former rstrip().strip() chain
        strGt = u" ".join(gtList).strip()
        strGt = re.sub(SPACEPATTERN, u" ",
                       strGt, flags=re.UNICODE)

        f = LMPreparationFormula()
        f.setText(u" ".join(testList))
        f._normalizeUtf8()
        strResult = f.getText()

        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))
    def testNormalizeUtf8(self):
        """Check ``_normalizeUtf8`` against all UTF8MAP entries at once.

        UTF8MAP rows are unpacked as (match, sub, comment, languageId).
        The ``match`` values joined by spaces form the input text; the
        ``sub`` values, joined, stripped and passed through the
        SPACEPATTERN substitution, form the expected text.
        """
        testList = [match for match, sub, comment, languageId in UTF8MAP]
        gtList = [sub for match, sub, comment, languageId in UTF8MAP]

        # strip() alone covers the former rstrip().strip() chain
        strGt = u" ".join(gtList).strip()
        strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

        f = LMPreparationFormula()
        f.setText(u" ".join(testList))
        f._normalizeUtf8()
        strResult = f.getText()

        # assertEqual replaces the deprecated assertEquals alias
        self.assertEqual(strGt.encode("utf-8"), strResult.encode("utf-8"))
    def testFrench(self):
        testList =[(ur"à plus tard",ur"à plus tard"),
                   (ur"maîtres",ur"maîtres"),
                   (ur"maïs",ur"maïs"),
                   (ur"emmaüs",ur"emmaüs"),
                   (ur"mäman",ur"mäman"),
                   (ur"1er", ur"premier"),
                   (ur"20ème", ur"vingtième"),
                   (ur"18-age", ur"dix huit age")]

        #No new words are kepts, hyphens are removed
        f = LMPreparationFormula()
        f.setKeepNewWords(False)
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))

        # Keep new words implies keep hyphens in words
        f.setKeepNewWords(True)

        testList =[(ur"18-age", ur"18-age")]
        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
Exemple #24
0
    def testExpandAcronyms(self):
        """Check ``_expandAcronyms`` through ``verifyEqual``.

        Each case is (input text, expected text).
        """
        formula = LMPreparationFormula()
        acronymCases = [
            (u"PDCB.", u"p. d. c. b."),
            (u"PDC:", u"p. d. c."),
        ]
        self.verifyEqual(acronymCases, formula, formula._expandAcronyms)
Exemple #25
0
    def testExpandNumberInWords(self):
        testList = [(ur"A1", ur"A. 1"), (ur"P3B", ur"P. 3 B."),
                    (ur"P5B4", ur"P. 5 B. 4"), (ur"PPB5", ur"PPB 5")]

        f = LMPreparationFormula()
        self.verifyEqual(testList, f, f._expandNumberInWords)
Exemple #26
0
class TextCluster(Cluster):
    """Concrete type representing a text sentence from
       a bilingual pdf document.

       Sentences are stored in utf-8 encoding.
    """

    logger = logging.getLogger("Asrt.TextCluster")

    # Name of the cluster attribute that holds the language
    LANGUAGE_ATTRIBUTE = 'language'
    # Class-wide counter, incremented for each instance to build its key
    ID_COUNTER = 0

    def __init__(self, document, sentenceText):
        """Constructor.

           param document     : stored as 'self.document'; its
                                'expandNumberInWords' value configures
                                the LM preparation formula
           param sentenceText : text added as cluster element
        """
        # Give a unique id to cluster
        TextCluster.ID_COUNTER += 1

        # Unknown language
        attributesList = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        # The key is the string form of the counter
        Cluster.__init__(self, str(TextCluster.ID_COUNTER), attributesList)

        self.document = document

        # Actual data
        self.addElement(sentenceText)

        # LM normalization
        self.lmPreparationFormula = LMPreparationFormula()
        self.lmPreparationFormula.setExpandNumberInWords(
            document.expandNumberInWords)

    #####################
    # Getters and setters
    #
    def getTextSentence(self, noPunctuation=False, debug=False):
        """Return the associated utf-8 text sentence.

           param noPunctuation : not used by this implementation
           param debug         : when True, return every element, in
                                 reverse order and one per line, after
                                 a '---' line

           Return an empty string when the cluster has no element,
           otherwise the first element (unless 'debug' is set).
        """
        if len(self.elementList) == 0:
            return ""

        if debug:
            return "---\n%s\n" % "\n".join(reversed(self.elementList))

        return self.elementList[0]

    def getLanguageId(self):
        """Get the cluster language id.

           Return an integer between 0-4
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        strLanguage = self.getAttribute(self.LANGUAGE_ATTRIBUTE)
        return LANGUAGE2ID[strLanguage]

    def setTextSentence(self, textSentence):
        """Set the new text.

           param textSentence: an utf-8 encoded string

           The first element of the cluster is replaced.
        """
        self.elementList[0] = textSentence

    def setLanguage(self, languageId):
        """Language for sentence.

           param 'languageId' : a value between 0 and 4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4

           Raise an Exception when 'languageId' is outside 0-4.
        """
        if languageId > 4 or languageId < 0:
            raise Exception("Unknown language")

        strLanguage = LANGUAGEID2LABELS[languageId]
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, strLanguage)

    #####################
    # Public interface
    #
    def clean(self):
        """Perform text cleaning.

           Control characters of the sentence are replaced by spaces
           (see 'removeControlCharacters'). No space normalization or
           stripping is done here.
        """
        strText = self.getTextSentence()
        strText = TextCluster.removeControlCharacters(strText)
        self.setTextSentence(strText)

    def classify(self, classifier):
        """Set the language attribute from a classifier.

           param classifier : object whose 'classify' method returns a
                              (label, score) pair for the sentence;
                              only the label is used
        """
        label, _score = classifier.classify(self.getTextSentence())
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, label)

    def removeTextPunctuation(self):
        """Remove punctuation symbols.

           Delegates to 'LanguageClassifier.removePunctuation' with
           'removeDots' set to False.
        """
        strText = self.getTextSentence()
        strText = LanguageClassifier.removePunctuation(strText=strText,
                                                       removeDots=False)
        self.setTextSentence(strText)

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.

           Raise an Exception for any other language.
        """
        if not self.isFrench():
            raise Exception(
                "Text verbalization is only implemented for French!")

        p = Punctuation()
        self.setTextSentence(p.replaceText(self.getTextSentence()))

    def prepareLM(self):
        """Prepare for language modeling.

           The sentence and the cluster language id are given to the
           LM preparation formula and the prepared text replaces the
           sentence.
        """
        strText = self.getTextSentence()
        languageId = self.getLanguageId()

        self.lmPreparationFormula.setText(strText)
        self.lmPreparationFormula.setLanguageId(languageId)
        strText = self.lmPreparationFormula.prepareText()

        self.setTextSentence(strText)

    #####################
    # Predicates
    #
    def isValid(self):
        """Check validity of sentence.

           Heuristic is:
            - sentence length
            - number of words
            - number of digits groups
            - regex filters of the document (see '_isTextValid')

           Return True when every check passes. The reason for a
           failed length, word count or digit check is logged.
        """
        strText = self.getTextSentence()

        # Nb characters
        if len(strText) > MAX_SENTENCE_LENGTH or\
           len(strText) < MIN_SENTENCE_LENGTH:
            # The format has two placeholders: the sentence has to be
            # supplied with its length, otherwise '%' raises a TypeError
            TextCluster.logger.info(
                "Discard sentence: inappropriate length: %d! '%s'" %
                (len(strText), strText))
            return False

        # Nb words
        nbWords = len(strText.split(' '))
        if nbWords < MIN_WORDS_COUNT or \
           nbWords > MAX_WORDS_COUNT:
            TextCluster.logger.info(
                "Discard sentence, not enough or to many words, wordsNum = %d! '%s'"
                % (nbWords, strText))
            return False

        # Nb digit groups: splitting on digit runs yields one more
        # piece than there are runs
        if len(re.split(r"\d+", strText)) > MAX_DIGITS_GROUPS:
            TextCluster.logger.info(
                "Discard sentence, to many groups of digits! '%s'" % strText)
            return False

        # Use some regex
        if not self._isTextValid(strText):
            return False

        return True

    def isValid2ndStage(self):
        """Check validity of sentence, this is the 2nd stage checking.
           Added by Yang WANG on Aug 9, 2016

           Remove web address and check German orthography https://en.wikipedia.org/wiki/German_orthography .

           Return False when the sentence contains one of the web
           address markers, when a word contains a character outside
           the allowed set, or when a word is longer than
           MAX_WORD_LENGTH. The reason is logged.
        """
        strText = self.getTextSentence()

        # To filter out web addresses
        webMarkers = ("http", "www", "html", "URL")
        if any(marker in strText for marker in webMarkers):
            TextCluster.logger.info("Discard sentence, web address! '%s'" %
                                    strText)
            return False

        # Allowed characters, following German orthography:
        # https://en.wikipedia.org/wiki/German_orthography
        # Letters (with umlauts and sharp s), dots and apostrophes
        pattern = "^[a-zA-ZäöüÄÖÜß.']+$"

        recmped = re.compile(pattern)  # re compiled
        for word in strText.split():
            # German orthography check
            if recmped.match(word) is None:
                TextCluster.logger.info(
                    "Discard sentence, disobey German orthography rule (%s)! '%s' in '%s'"
                    % (pattern, word, strText))
                return False

            # Check for too long word
            if len(word) > MAX_WORD_LENGTH:
                TextCluster.logger.info(
                    "Discard sentence, too long word '%s' of length '%d'! In '%s'"
                    % (word, len(word), strText))
                return False

        return True

    def isFrench(self):
        """Content is French
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
            FRENCH_LABEL

    def isGerman(self):
        """Content is German
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
            GERMAN_LABEL

    def isItalian(self):
        """Content is Italian
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
            ITALIAN_LABEL

    def isEnglish(self):
        """Content is English
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
            ENGLISH_LABEL

    def getClusterInfo(self):
        """Return the key in brackets followed by the sentence.
        """
        return "[%s] %s" % (self.key, self.getTextSentence())

    ########################
    # Implementation
    #
    def _isTextValid(self, strText):
        """Assess the validity of the text using
           a set of regex rules.

           'strText' is in utf-8 encoding

           The rules are the (regex, language id) pairs of
           'self.document.regex_filter_list'. A rule applies when its
           language id is 0 or equals the cluster language id. Return
           False as soon as an applicable rule matches.
        """
        clusterLanguageId = self.getLanguageId()

        # Some regex
        for regex, regexLanguageId in self.document.regex_filter_list:
            regexLanguageId = int(regexLanguageId)
            # Does it match the text language
            if regexLanguageId != clusterLanguageId and \
               regexLanguageId != 0:
                continue
            # The search is case sensitive
            if re.search(regex, strText, flags=re.UNICODE) is not None:
                TextCluster.logger.info("Discard:%s\n%s" % (regex, strText))
                return False

        return True

    def __str__(self):
        """Override built in method.
        """
        return self.getClusterInfo()

    ########################
    # Static helpers
    #
    @staticmethod
    def normalizeText(textUtterance):
        """Normalize text:

           - remove new line character at the end
           - remove prepended and trailing spaces
        """
        # strip() removes leading and trailing whitespace, newlines
        # included
        return textUtterance.strip()

    @staticmethod
    def removeControlCharacters(str):
        """Control characters are replaced
           by spaces.

           A character counts as control character when its unicode
           category starts with "C".
        """
        # The parameter name shadows the builtin; it is kept so that
        # keyword callers are not broken
        return "".join(
            ' ' if unicodedata.category(ch)[0] == "C" else ch
            for ch in str)
Exemple #27
0
 def testIsNoise(self):
     """Each punctuation character repeated four times must be noise."""
     for symbol in string.punctuation:
         repeated = symbol * 4
         self.assertTrue(LMPreparationFormula._isNoise(repeated))
Exemple #28
0
class TextCluster(Cluster):
    """Concrete type representing a text sentence from
       a bilingual pdf document.

       Sentences are stored in utf-8 encoding.
    """

    logger = logging.getLogger("Asrt.TextCluster")

    LANGUAGE_ATTRIBUTE = 'language'
    ID_COUNTER = 0

    def __init__(self, document, sentenceText):
        """Constructor.
        """
        #Give a unique id to cluster
        TextCluster.ID_COUNTER += 1

        #Unknown language
        attributesList = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        #Key is mlf pattern
        Cluster.__init__(self, str(TextCluster.ID_COUNTER), attributesList)

        self.document = document

        #Actual data
        self.addElement(sentenceText)

        #LM normalization
        self.lmPreparationFormula = LMPreparationFormula()

    #####################
    #Getters and setters
    #
    def getTextSentence(self, noPunctuation=False, debug=False):
        """Return the associated utf-8 text sentence.
        """
        if len(self.elementList) == 0:
            return ""

        if debug:
            return u"---\n%s\n" % u"\n".join(reversed(self.elementList))

        return self.elementList[0]

    def getLanguageId(self):
        """Get the cluster language id.

           Return an integer between 0-4
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        strLanguage = self.getAttribute(self.LANGUAGE_ATTRIBUTE)
        return LANGUAGE2ID[strLanguage]

    def setTextSentence(self, textSentence):
        """Set the new text.

           param textSentence: an utf-8 encoded string
        """
        self.elementList[0] = textSentence

    def setLanguage(self, languageId):
        """Language for sentence.

           param 'languageId' : a value between 0 and 4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4
        """
        if languageId > 4 or languageId < 0:
            raise Exception("Unknown language")

        strLanguage = LANGUAGEID2LABELS[languageId]
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, strLanguage)

    #####################
    #Public interface
    #
    def clean(self):
        """Perform text cleaning.

           Heuristic is:
              - remove control characters
              - normalize spaces to one space
              - strip spaces from beginning and end of string
        """
        strText = self.getTextSentence()
        strText = TextCluster.removeControlCharacters(strText)
        self.setTextSentence(strText)

    def classify(self, classifier):
        """Classify between french and german.
        """
        l, score = classifier.classify(self.getTextSentence())
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, l)

    def removeTextPunctuation(self):
        """Remove punctuation symbols.
        """
        strText = self.getTextSentence()
        strText = LanguageClassifier.removePunctuation(strText=strText,
                                                       removeDots=False)
        self.setTextSentence(strText)

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.
        """
        if self.isFrench():
            p = Punctuation()
            self.setTextSentence(p.replaceText(self.getTextSentence()))
        else:
            raise Exception(
                "Text verbalization is only implemented for French!")

    def prepareLM(self):
        """Prepare for language modeling.
        """
        strText = self.getTextSentence()
        languageId = self.getLanguageId()

        self.lmPreparationFormula.setText(strText)
        self.lmPreparationFormula.setLanguageId(languageId)
        strText = self.lmPreparationFormula.prepareText()

        self.setTextSentence(strText)

    #####################
    #Predicates
    #
    def isValid(self):
        """Check validity of sentence.

           Heuristic is:
            - sentence length
            - number of digits groups
        """
        strText = self.getTextSentence()

        #Nb characters
        if len(strText) > MAX_SENTENCE_LENGTH or\
           len(strText) < MIN_SENTENCE_LENGTH:
            #print strText.encode('utf-8')
            TextCluster.logger.info(
                "Discard sentence: inappropriate length: %d!" % len(strText))
            return False

        #Nb words
        nbWords = len(strText.split(' '))
        if nbWords < MIN_WORDS_COUNT or \
           nbWords > MAX_WORDS_COUNT:
            #print strText.encode('utf-8')
            TextCluster.logger.info(
                "Discard sentence, not enough or to many words!")
            return False

        #Nb digit groups
        if len(re.split("\d+", strText)) > MAX_DIGITS_GROUPS:
            #print strText.encode('utf-8')
            TextCluster.logger.info(
                "Discard sentence, to many groups of digits!")
            return False

        #Try decode
        #Use some regex
        if not self._isTextValid(strText):
            return False

        return True

    def isFrench(self):
        """Content is French
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     FRENCH_LABEL

    def isGerman(self):
        """Content is German
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     GERMAN_LABEL

    def isItalian(self):
        """Content is Italian
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ITALIAN_LABEL

    def isEnglish(self):
        """Content is English
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ENGLISH_LABEL

    def getClusterInfo(self):
        """Return key.
        """
        return "[%s] %s" % (self.key, self.getTextSentence())

    ########################
    # Implementation
    #
    def _isTextValid(self, strText):
        """Run the regex filters of the document against
           the text.

           param 'strText' : the text to check

           return False as soon as one applicable filter
                  matches the text, True otherwise
        """
        ownLanguageId = self.getLanguageId()

        for pattern, patternLanguageId in self.document.regex_filter_list:
            patternLanguageId = int(patternLanguageId)

            #A filter applies when it carries the cluster language
            #id or the id 0
            if patternLanguageId == ownLanguageId or \
               patternLanguageId == 0:
                #Matching is case sensitive
                found = re.search(pattern, strText, flags=re.UNICODE)
                if found is not None:
                    TextCluster.logger.info(
                        "Discard:%s\n%s" %
                        (pattern.encode("utf-8"), strText.encode("utf-8")))
                    return False

        return True

    def __str__(self):
        """Built in string conversion: the cluster info
           encoded in utf-8.
        """
        encodedInfo = self.getClusterInfo().encode('utf-8')
        return str(encodedInfo)

    ########################
    # Implementation
    #
    @staticmethod
    def normalizeText(textUtterance):
        """Strip the utterance.

           Whitespace, which includes the new line character,
           is removed from both ends of the text.
        """
        return textUtterance.strip()

    @staticmethod
    def removeControlCharacters(str):
        """Return a copy of the text where each character
           whose unicode category starts with 'C' is replaced
           by one space.
        """
        return "".join(
            ' ' if unicodedata.category(ch)[0] == "C" else ch
            for ch in str)
Exemple #29
0
    def testFrench(self):
        """Check text preparation with language id 1, first
           with number expansion switched on, then off.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(1)

        # Expansion on: accented words are expected unchanged,
        # numbers are expected in words and without hyphen
        expandedCases = [(r"à plus tard", r"à plus tard"),
                         (r"maîtres", r"maîtres"),
                         (r"maïs", r"maïs"),
                         (r"emmaüs", r"emmaüs"),
                         (r"mäman", r"mäman"),
                         (r"1er", r"premier"),
                         (r"20ème", r"vingtième"),
                         (r"18-age", r"dix huit age")]

        for rawText, expected in expandedCases:
            formula.setText(rawText)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

        # Expansion off: the hyphenated token is expected unchanged
        formula.setExpandNumberInWords(False)

        for rawText, expected in [(r"18-age", r"18-age")]:
            formula.setText(rawText)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))
 def testIsNoise(self):
     """Each punctuation symbol repeated four times is
        expected to be reported as noise.
     """
     for symbol in string.punctuation:
         repeated = symbol * 4
         self.assertTrue(LMPreparationFormula._isNoise(repeated))
Exemple #31
0
    def testNormalizePunctuation(self):
        """Normalize all punctuation symbols plus the per mille
           sign, first with the default language, then with
           language id 1.
        """
        formula = LMPreparationFormula()
        formula.setText(u"".join(string.punctuation + u"‰"))

        # Default language: only a few symbols are expected to remain
        formula._normalizePunctuation(self.allPunctList)
        remaining = formula.getText()
        self.assertEqual(u"$%&'@\u2030", remaining)

        # Language id 1: the remaining symbols are expected in words
        formula.setLanguageId(1)
        formula._normalizePunctuation(self.allPunctList)
        verbalized = formula.getText()
        self.assertEqual("dollars pourcent et ' at pour mille", verbalized)
    def testNormalizePunctuation(self):
        """Normalize all punctuation symbols plus the per mille
           sign, first with the default language, then with
           language id 1. Hyphen and slash are expected to be kept.
        """
        formula = LMPreparationFormula()
        formula.setText(u"".join(string.punctuation + u"‰"))

        # Default language: only a few symbols are expected to remain
        formula._normalizePunctuation(self.allPunctList)
        remaining = formula.getText()
        self.assertEqual(u"$%&'-/@\u2030", remaining)

        # Language id 1: part of the remaining symbols is expected in words
        formula.setLanguageId(1)
        formula._normalizePunctuation(self.allPunctList)
        verbalized = formula.getText()
        self.assertEqual("dollars pourcent et '-/ at pour mille", verbalized)
Exemple #33
0
    def testGerman(self):
        """Check text preparation with language id 2, first
           with number expansion switched on, then off.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(2)

        # Expansion on: accented words are expected unchanged,
        # the number is expected in words and without hyphen
        expandedCases = [(r"emmaüs", r"emmaüs"),
                         ("mein àrbeit", "mein àrbeit"),
                         (r"môchten", r"môchten"),
                         (r"mädchen", r"mädchen"),
                         (r"meîn", r"meîn"),
                         (r"meïn", r"meïn"),
                         (r"18-jähriger", r"achtzehn jähriger")]

        for rawText, expected in expandedCases:
            formula.setText(rawText)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

        # Expansion off: the hyphenated token is expected unchanged
        formula.setExpandNumberInWords(False)

        for rawText, expected in [(r"18-jähriger", r"18-jähriger")]:
            formula.setText(rawText)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))
    def testEnglish(self):
        testList =[(ur"object 5",ur"object five"),
                   (ur"1st", ur"first")]

        f = LMPreparationFormula()
        f.setKeepNewWords(False)
        f.setLanguageId(3)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))


        testList =[(ur"18-year-old", ur"18-year-old")]
        f.setKeepNewWords(True)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
Exemple #35
0
    def testEnglish(self):
        """Check text preparation with language id 3, first
           with number expansion switched on, then off.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(3)

        # Expansion on: numbers are expected in words
        expandedCases = [(r"object 5", r"object five"),
                         (r"1st", r"first")]

        for rawText, expected in expandedCases:
            formula.setText(rawText)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

        # Expansion off: the hyphenated token is expected unchanged
        formula.setExpandNumberInWords(False)

        for rawText, expected in [(r"18-year-old", r"18-year-old")]:
            formula.setText(rawText)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))
Exemple #36
0
    def testNormalizePunctuation(self):
        """Normalize all punctuation symbols plus the per mille
           sign with number expansion off, first with the default
           language, then with language id 1.
        """
        formula = LMPreparationFormula()
        formula.setText("".join(string.punctuation + "‰"))
        formula.setExpandNumberInWords(False)

        # Default language: only a few symbols are expected to remain
        formula._normalizePunctuation(self.allPunctList)
        remaining = formula.getText()
        self.assertEqual("$%&'-/@‰", remaining)

        # Language id 1: part of the remaining symbols is expected in words
        formula.setLanguageId(1)
        formula._normalizePunctuation(self.allPunctList)
        verbalized = formula.getText()
        self.assertEqual("dollars pourcent et '-/ at pour mille", verbalized)
    def testGerman(self):
        testList =[(ur"emmaüs",ur"emmaüs"),
                   (u"mein àrbeit", u"mein àrbeit"),
                   (ur"môchten",ur"môchten"),
                   (ur"mädchen",ur"mädchen"),
                   (ur"meîn",ur"meîn"),
                   (ur"meïn",ur"meïn"),
                   (ur"18-jähriger", ur"achtzehn jähriger")]

        #No new words are kepts
        f = LMPreparationFormula()
        f.setKeepNewWords(False)
        f.setLanguageId(2)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))

        #New words are kept
        testList =[(ur"18-jähriger", ur"18-jähriger")]
        f.setKeepNewWords(True)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEquals(gt.encode('utf-8'), r.encode('utf-8'))
    def testNormalizePunctuation(self):
        """Normalize all punctuation symbols, first with the
           default language, then with language id 1.
        """
        formula = LMPreparationFormula()
        formula.setText(u"".join(string.punctuation))

        #Default language: only a few symbols are expected to remain
        formula._normalizePunctuation(self.allPunctList)
        remaining = formula.getText()
        self.assertEqual("$%&' @", remaining)

        #Language id 1: the remaining symbols are expected in words
        formula.setLanguageId(1)
        formula._normalizePunctuation(self.allPunctList)
        verbalized = formula.getText()
        self.assertEqual("dollars pourcent et ' at", verbalized)
Exemple #39
0
class TextCluster(Cluster):
    """Concrete type representing a text sentence from
       a bilingual pdf document.

       Sentences are stored in utf-8 encoding.
    """

    logger = logging.getLogger("Asrt.TextCluster")

    LANGUAGE_ATTRIBUTE      = 'language'
    #Incremented by each constructor call, its string
    #form is used as cluster key
    ID_COUNTER              = 0

    def __init__(self, document, sentenceText):
        """Constructor.

           param 'document'     : the owning document, its attributes
                                  'keepNewWords' and 'regex_filter_list'
                                  are read by this class
           param 'sentenceText' : the sentence, stored as first element
        """
        #Give a unique id to cluster
        TextCluster.ID_COUNTER += 1

        #Unknown language
        attributesList = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        #Key is mlf pattern
        Cluster.__init__(self, str(TextCluster.ID_COUNTER), attributesList)

        self.document = document

        #Actual data
        self.addElement(sentenceText)

        #LM normalization
        self.lmPreparationFormula = LMPreparationFormula()
        self.lmPreparationFormula.setKeepNewWords(document.keepNewWords)

    #####################
    #Getters and setters
    #
    def getTextSentence(self, noPunctuation = False, debug=False):
        """Return the associated utf-8 text sentence.

           param 'noPunctuation' : not used by this method
           param 'debug'         : when set, return all elements in
                                   reverse order, one per line, after
                                   a '---' header line

           return the first element, or an empty string when
                  there is no element
        """
        if len(self.elementList) == 0:
            return ""

        if debug:
            return u"---\n%s\n" % u"\n".join(reversed(self.elementList))

        return self.elementList[0]

    def getLanguageId(self):
        """Get the cluster language id.

           Return an integer between 0-4
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        strLanguage = self.getAttribute(self.LANGUAGE_ATTRIBUTE)
        return LANGUAGE2ID[strLanguage]

    def setTextSentence(self, textSentence):
        """Set the new text, it replaces the first element.

           param textSentence: an utf-8 encoded string
        """
        self.elementList[0] = textSentence

    def setLanguage(self, languageId):
        """Language for sentence.

           param 'languageId' : a value between 0 and 4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4

           raise Exception when 'languageId' is outside of 0-4
        """
        if languageId > 4 or languageId < 0:
            raise Exception("Unknown language")

        strLanguage = LANGUAGEID2LABELS[languageId]
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, strLanguage)

    #####################
    #Public interface
    #
    def clean(self):
        """Perform text cleaning.

           Control characters are replaced by spaces, see
           'removeControlCharacters'. Spaces are neither
           collapsed nor stripped by this method.
        """
        strText = self.getTextSentence()
        strText = TextCluster.removeControlCharacters(strText)
        self.setTextSentence(strText)

    def classify(self, classifier):
        """Set the language attribute to the label returned
           by the classifier for the text sentence.

           param 'classifier' : object whose 'classify' method
                                returns a (label, score) pair,
                                the score is ignored
        """
        label, _ = classifier.classify(self.getTextSentence())
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, label)

    def removeTextPunctuation(self):
        """Remove punctuation symbols, dots are kept.
        """
        strText = self.getTextSentence()
        strText = LanguageClassifier.removePunctuation(strText = strText,
                                                       removeDots = False)
        self.setTextSentence(strText)

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.

           raise Exception when the cluster is not French
        """
        if not self.isFrench():
            raise Exception("Text verbalization is only implemented for French!")

        p = Punctuation()
        self.setTextSentence(p.replaceText(self.getTextSentence()))

    def prepareLM(self):
        """Prepare for language modeling.

           The sentence is replaced by the output of the
           LM preparation formula run with the cluster
           language id.
        """
        strText = self.getTextSentence()
        languageId = self.getLanguageId()

        self.lmPreparationFormula.setText(strText)
        self.lmPreparationFormula.setLanguageId(languageId)
        strText = self.lmPreparationFormula.prepareText()

        self.setTextSentence(strText)

    #####################
    #Predicates
    #
    def isValid(self):
        """Check validity of sentence.

           Heuristic is:
            - sentence length
            - number of words
            - number of digits groups
            - regex filters of the document

           return False as soon as one check fails, the
                  reason is logged
        """
        strText = self.getTextSentence()

        #Nb characters
        nbChars = len(strText)
        if nbChars > MAX_SENTENCE_LENGTH or\
           nbChars < MIN_SENTENCE_LENGTH:
            TextCluster.logger.info(
                "Discard sentence: inappropriate length: %d! '%s'",
                nbChars, strText.encode("utf-8"))
            return False

        #Nb words
        nbWords = len(strText.split(' '))
        if nbWords < MIN_WORDS_COUNT or \
           nbWords > MAX_WORDS_COUNT:
            TextCluster.logger.info(
                "Discard sentence, not enough or to many words, wordsNum = %d! '%s'",
                nbWords, strText.encode("utf-8"))
            return False

        #Nb digit groups, the compared value is the number of
        #pieces obtained by splitting on the digit groups
        if len(re.split(r"\d+", strText)) > MAX_DIGITS_GROUPS:
            TextCluster.logger.info(
                "Discard sentence, to many groups of digits! '%s'",
                strText.encode("utf-8"))
            return False

        #Use some regex
        return self._isTextValid(strText)

    def isValid2ndStage(self):
        """Check validity of sentence, this is the 2nd stage checking.
           Added by Yang WANG on Aug 9, 2016

           The sentence is discarded when it contains a web
           address marker, when one of its words does not match
           the German orthography pattern
           (https://en.wikipedia.org/wiki/German_orthography),
           or when one of its words is longer than MAX_WORD_LENGTH.

           return False as soon as one check fails, the
                  reason is logged
        """
        strText = self.getTextSentence()

        # To filter out web addresses
        webMarkers = ("http", "www", "html", "URL")
        if any(marker in strText for marker in webMarkers):
            TextCluster.logger.info("Discard sentence, web address! '%s'",
                                    strText.encode("utf-8"))
            return False

        # Letters, umlauts, sharp s, dot and apostrophe are the only
        # characters allowed in a word
        pattern = u"^[a-zA-ZäöüÄÖÜß.']+$"
        encodedPattern = pattern.encode("utf-8")

        # Matching is done on the utf-8 encoded forms
        recmped = re.compile(encodedPattern)

        for word in strText.split():
            encodedWord = word.encode("utf-8")

            # German orthography check
            if recmped.match(encodedWord) is None:
                TextCluster.logger.info(
                    "Discard sentence, disobey German orthography rule (%s)! '%s' in '%s'",
                    encodedPattern, encodedWord, strText.encode("utf-8"))
                return False

            # Check for too long word, the length is the one of
            # the utf-8 encoded word
            if len(encodedWord) > MAX_WORD_LENGTH:
                TextCluster.logger.info(
                    "Discard sentence, too long word '%s' of length '%d'! In '%s'",
                    encodedWord, len(encodedWord), strText.encode("utf-8"))
                return False

        return True

    def isFrench(self):
        """Content is French
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     FRENCH_LABEL

    def isGerman(self):
        """Content is German
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     GERMAN_LABEL

    def isItalian(self):
        """Content is Italian
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ITALIAN_LABEL

    def isEnglish(self):
        """Content is English
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ENGLISH_LABEL

    def getClusterInfo(self):
        """Return the key between square brackets followed
           by the text sentence.
        """
        return "[%s] %s" % (self.key, self.getTextSentence())

    ########################
    # Implementation
    #
    def _isTextValid(self, strText):
        """Assess the validity of the text using
           a set of regex rules.

           'strText' is in utf-8 encoding

           return False as soon as one applicable regex of the
                  document matches the text, True otherwise
        """
        clusterLanguageId = self.getLanguageId()

        #Some regex
        for regex, regexLanguageId in self.document.regex_filter_list:
            regexLanguageId = int(regexLanguageId)
            #Does it match the text language, id 0 applies
            #to every cluster
            if regexLanguageId != clusterLanguageId and \
               regexLanguageId != 0:
                continue
            #Matching is case sensitive
            if re.search(regex, strText, flags=re.UNICODE) is not None:
                TextCluster.logger.info("Discard:%s\n%s",
                                        regex.encode("utf-8"),
                                        strText.encode("utf-8"))
                return False

        return True

    def __str__(self):
        """Override built in method, return the cluster
           info encoded in utf-8.
        """
        key = self.getClusterInfo()
        return str(key.encode('utf-8'))

    ########################
    # Implementation
    #
    @staticmethod
    def normalizeText(textUtterance):
        """Normalize text:

           - remove new line character at the end
           - remove prepended and trailing spaces
        """
        return textUtterance.strip()

    @staticmethod
    def removeControlCharacters(str):
        """Control characters are replaced
           by spaces.

           A character is replaced when its unicode
           category starts with 'C'.
        """
        return "".join(
            ' ' if unicodedata.category(ch)[0] == "C" else ch
            for ch in str)
class TextCluster(Cluster):
    """Concrete type representing a text sentence from
       a bilingual pdf document.

       Sentences are stored in utf-8 encoding.
    """

    logger = logging.getLogger("Asrt.TextCluster")

    LANGUAGE_ATTRIBUTE      = 'language'
    #Incremented by each constructor call, its string
    #form is used as cluster key
    ID_COUNTER              = 0

    def __init__(self, document, sentenceText):
        """Constructor.

           param 'document'     : the owning document, its attribute
                                  'regex_filter_list' is read by
                                  this class
           param 'sentenceText' : the sentence, stored as first element
        """
        #Give a unique id to cluster
        TextCluster.ID_COUNTER += 1

        #Unknown language
        attributesList = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        #Key is mlf pattern
        Cluster.__init__(self, str(TextCluster.ID_COUNTER), attributesList)

        self.document = document

        #Actual data
        self.addElement(sentenceText)

        #LM normalization
        self.lmPreparationFormula = LMPreparationFormula()

    #####################
    #Getters and setters
    #
    def getTextSentence(self, noPunctuation = False, debug=False):
        """Return the associated utf-8 text sentence.

           param 'noPunctuation' : not used by this method
           param 'debug'         : when set, return all elements in
                                   reverse order, one per line, after
                                   a '---' header line

           return the first element, or an empty string when
                  there is no element
        """
        if len(self.elementList) == 0:
            return ""

        if debug:
            return u"---\n%s\n" % u"\n".join(reversed(self.elementList))

        return self.elementList[0]

    def getLanguageId(self):
        """Get the cluster language id.

           Return an integer between 0-4
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        strLanguage = self.getAttribute(self.LANGUAGE_ATTRIBUTE)
        return LANGUAGE2ID[strLanguage]

    def setTextSentence(self, textSentence):
        """Set the new text, it replaces the first element.

           param textSentence: an utf-8 encoded string
        """
        self.elementList[0] = textSentence

    def setLanguage(self, languageId):
        """Language for sentence.

           param 'languageId' : a value between 0 and 4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4

           raise Exception when 'languageId' is outside of 0-4
        """
        if languageId > 4 or languageId < 0:
            raise Exception("Unknown language")

        strLanguage = LANGUAGEID2LABELS[languageId]
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, strLanguage)

    #####################
    #Public interface
    #
    def clean(self):
        """Perform text cleaning.

           Control characters are replaced by spaces, see
           'removeControlCharacters'. Spaces are neither
           collapsed nor stripped by this method.
        """
        strText = self.getTextSentence()
        strText = TextCluster.removeControlCharacters(strText)
        self.setTextSentence(strText)

    def classify(self, classifier):
        """Set the language attribute to the label returned
           by the classifier for the text sentence.

           param 'classifier' : object whose 'classify' method
                                returns a (label, score) pair,
                                the score is ignored
        """
        label, _ = classifier.classify(self.getTextSentence())
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, label)

    def removeTextPunctuation(self):
        """Remove punctuation symbols, dots are kept.
        """
        strText = self.getTextSentence()
        strText = LanguageClassifier.removePunctuation(strText = strText,
                                                       removeDots = False)
        self.setTextSentence(strText)

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.

           raise Exception when the cluster is not French
        """
        if not self.isFrench():
            raise Exception("Text verbalization is only implemented for French!")

        p = Punctuation()
        self.setTextSentence(p.replaceText(self.getTextSentence()))

    def prepareLM(self):
        """Prepare for language modeling.

           The sentence is replaced by the output of the
           LM preparation formula run with the cluster
           language id.
        """
        strText = self.getTextSentence()
        languageId = self.getLanguageId()

        self.lmPreparationFormula.setText(strText)
        self.lmPreparationFormula.setLanguageId(languageId)
        strText = self.lmPreparationFormula.prepareText()

        self.setTextSentence(strText)

    #####################
    #Predicates
    #
    def isValid(self):
        """Check validity of sentence.

           Heuristic is:
            - sentence length
            - number of words
            - number of digits groups
            - regex filters of the document

           return False as soon as one check fails, the
                  reason is logged
        """
        strText = self.getTextSentence()

        #Nb characters
        nbChars = len(strText)
        if nbChars > MAX_SENTENCE_LENGTH or\
           nbChars < MIN_SENTENCE_LENGTH:
            TextCluster.logger.info(
                "Discard sentence: inappropriate length: %d!", nbChars)
            return False

        #Nb words
        nbWords = len(strText.split(' '))
        if nbWords < MIN_WORDS_COUNT or \
           nbWords > MAX_WORDS_COUNT:
            TextCluster.logger.info("Discard sentence, not enough or to many words!")
            return False

        #Nb digit groups, the compared value is the number of
        #pieces obtained by splitting on the digit groups
        if len(re.split(r"\d+", strText)) > MAX_DIGITS_GROUPS:
            TextCluster.logger.info("Discard sentence, to many groups of digits!")
            return False

        #Use some regex
        return self._isTextValid(strText)

    def isFrench(self):
        """Content is French
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     FRENCH_LABEL

    def isGerman(self):
        """Content is German
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     GERMAN_LABEL

    def isItalian(self):
        """Content is Italian
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ITALIAN_LABEL

    def isEnglish(self):
        """Content is English
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ENGLISH_LABEL

    def getClusterInfo(self):
        """Return the key between square brackets followed
           by the text sentence.
        """
        return "[%s] %s" % (self.key, self.getTextSentence())

    ########################
    # Implementation
    #
    def _isTextValid(self, strText):
        """Assess the validity of the text using
           a set of regex rules.

           'strText' is in utf-8 encoding

           return False as soon as one applicable regex of the
                  document matches the text, True otherwise
        """
        clusterLanguageId = self.getLanguageId()

        #Some regex
        for regex, regexLanguageId in self.document.regex_filter_list:
            regexLanguageId = int(regexLanguageId)
            #Does it match the text language, id 0 applies
            #to every cluster
            if regexLanguageId != clusterLanguageId and \
               regexLanguageId != 0:
                continue
            #Matching is case sensitive
            if re.search(regex, strText, flags=re.UNICODE) is not None:
                TextCluster.logger.info("Discard:%s\n%s",
                                        regex.encode("utf-8"),
                                        strText.encode("utf-8"))
                return False

        return True

    def __str__(self):
        """Override built in method, return the cluster
           info encoded in utf-8.
        """
        key = self.getClusterInfo()
        return str(key.encode('utf-8'))

    ########################
    # Implementation
    #
    @staticmethod
    def normalizeText(textUtterance):
        """Normalize text:

           - remove new line character at the end
           - remove prepended and trailing spaces
        """
        return textUtterance.strip()

    @staticmethod
    def removeControlCharacters(str):
        """Control characters are replaced
           by spaces.

           A character is replaced when its unicode
           category starts with 'C'.
        """
        return "".join(
            ' ' if unicodedata.category(ch)[0] == "C" else ch
            for ch in str)