Python LMPreparationFormula Exemples, asrt.common.formula.FormulaLMPreparation.LMPreparationFormula Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testFilterNoiseWords(self):
        # Punctuation-only tokens mixed with real words are expected to be
        # removed by _filterNoiseWords(), whose return value is checked.
        strTest = u"!-?- hello how !!!! are you *-+$"
        strGt = u"hello how are you"

        f = LMPreparationFormula()
        f.setText(strTest)
        # Keep the result in its own name instead of overwriting the input
        strResult = f._filterNoiseWords()

        # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
        self.assertEqual(strGt, strResult)

Exemple #2

0

Afficher le fichier

    def testContractionPrefixes(self):
        # Each case is (input text, expected prepared text, language id).
        cases = (
            (r"President' s", r"president's", 3),
            (r"President' s of", r"president's of", 3),
        )

        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(False)

        for rawText, expected, langId in cases:
            formula.setLanguageId(langId)
            formula.setText(rawText)
            prepared = formula.prepareText()
            # Both sides are compared in their utf-8 encoded form
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

Exemple #3

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testNormalizeCharacters(self):
        # Applies utf-8 normalization, then punctuation normalization, and
        # compares the formula text with the expected string.
        #
        # u"" replaces ur"": the ur prefix is a syntax error on Python 3,
        # and on Python 2 both prefixes decode \uff1b to the same character.
        strTest = u"a b c \uff1b , % œ"
        strGt = u"a b c % oe"

        f = LMPreparationFormula()
        f.setText(strTest)
        f._normalizeUtf8()
        f._normalizePunctuation(self.allPunctList)
        # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
        self.assertEqual(strGt, f.getText())

Exemple #4

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testNormalizePunctuationKeepInWords(self):
        # With keep-new-words enabled, the expected text still contains the
        # '-' and '/' that sit inside words, while the standalone ones are
        # gone.
        f = LMPreparationFormula()
        f.setKeepNewWords(True)

        # Plain unicode literal; u"".join("...") produced the same value
        f.setText(u"/ HES-SO und AdG/LA - auch im Winter / Sommer -")
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = "HES-SO und AdG/LA auch im Winter Sommer"
        # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
        self.assertEqual(gt, strResult)

Exemple #5

0

Afficher le fichier

    def testNormalizeUtf8(self):
        # For each language id, join the UTF8MAP 'match' strings registered
        # for that language, normalize them, and expect the joined 'sub'
        # strings.
        #
        # Language ids are kept as integers: the former string ids were
        # compared with int(languageId), which is never equal, so every
        # list stayed empty and the test could not fail.
        languages = [0, 1, 2]
        testList = dict((lang, []) for lang in languages)
        gtList = dict((lang, []) for lang in languages)

        for match, sub, comment, languageId in UTF8MAP:
            langKey = int(languageId)
            if langKey in testList:
                testList[langKey].append(match)
                gtList[langKey].append(sub)

        for lang in languages:
            strGt = u" ".join(gtList[lang]).strip()
            strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

            f = LMPreparationFormula()
            # NOTE(review): assumes _normalizeUtf8() selects mappings from
            # the formula's current language id - confirm.
            f.setLanguageId(lang)
            f.setText(u" ".join(testList[lang]))
            f._normalizeUtf8()
            strResult = f.getText()

            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))

Exemple #6

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : hdubey/asrt

 def testExpandAbbreviations(self):
     # ABBREVIATIONS is walked as {language id: {abbreviation: expansion}};
     # each abbreviation is expanded on its own under its language id.
     f = LMPreparationFormula()
     for languageId, v in ABBREVIATIONS.items():
         f.setLanguageId(languageId)
         for abbr, gt in v.items():
             f.strText = abbr
             f._expandAbbreviations()
             # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
             self.assertEqual(gt.encode('utf-8'), f.strText.encode('utf-8'))

Exemple #7

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testContractionPrefixes(self):
        # Each case is (input text, expected prepared text, language id).
        # u"" replaces ur"" (a syntax error on Python 3); the literals have
        # no backslashes, so their values are unchanged.
        testList = [(u"President' s", u"president's", 3),
                    (u"President' s of", u"president's of", 3)]

        f = LMPreparationFormula()
        f.setKeepNewWords(True)

        for t, gt, languageId in testList:
            f.setLanguageId(languageId)
            f.setText(t)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #8

0

Afficher le fichier

    def __init__(self, document, sentenceText):
        """Create a cluster for one sentence of 'document'.

           param document     : kept as 'self.document'
           param sentenceText : added to the cluster with addElement()
        """
        # The class-wide counter provides the key of the new cluster
        TextCluster.ID_COUNTER += 1
        clusterKey = str(TextCluster.ID_COUNTER)

        # The language attribute starts at 0 (unknown language)
        Cluster.__init__(self, clusterKey,
                         [(TextCluster.LANGUAGE_ATTRIBUTE, 0)])

        self.document = document

        # The sentence itself
        self.addElement(sentenceText)

        # Formula object used for LM normalization
        self.lmPreparationFormula = LMPreparationFormula()

Exemple #9

0

Afficher le fichier

    def testExpandNumberInWords(self):
        # Every case is (input text, expected text); verifyEqual receives
        # the case list, the formula and the method under test.
        formula = LMPreparationFormula()

        defaultCases = [
            (r"A1", r"A. 1"),
            (r"P3B", r"P. 3 B."),
            (r"P5B4", r"P. 5 B. 4"),
            (r"PPB5", r"PPB 5"),
            (r"10jährige", r"10 jährige"),
        ]
        self.verifyEqual(defaultCases, formula, formula._expandNumberInWords)

        # With number expansion switched off, one case list is checked per
        # language id, in the same order as before (1, 3, 2).
        formula.setExpandNumberInWords(False)
        casesByLanguage = (
            (1, [(r"1er", r"1er")]),
            (3, [(r"1st", r"1st")]),
            (2, [(r"18-jähriger", r"18 -jähriger")]),
        )
        for langId, cases in casesByLanguage:
            formula.setLanguageId(langId)
            self.verifyEqual(cases, formula, formula._expandNumberInWords)

Exemple #10

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testNormalizeUtf8(self):
        # Per language id: the UTF8MAP 'match' strings of that language are
        # joined and normalized, and the result must equal the joined 'sub'
        # strings.
        #
        # Integer language ids are used throughout. The previous string ids
        # never compared equal to int(languageId), which left all lists
        # empty and made the assertion trivially true.
        languages = [0, 1, 2]
        testList = {}
        gtList = {}
        for lang in languages:
            testList[lang] = []
            gtList[lang] = []

        for match, sub, comment, languageId in UTF8MAP:
            mapLang = int(languageId)
            if mapLang in testList:
                testList[mapLang].append(match)
                gtList[mapLang].append(sub)

        for lang in languages:
            strGt = u" ".join(gtList[lang]).strip()
            strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

            f = LMPreparationFormula()
            # NOTE(review): presumably _normalizeUtf8() picks its mappings
            # from the formula's language id - verify.
            f.setLanguageId(lang)
            f.setText(u" ".join(testList[lang]))
            f._normalizeUtf8()
            strResult = f.getText()

            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))

Exemple #11

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

 def testExpandAbbreviations(self):
     # Iterates ABBREVIATIONS as {language id: {abbreviation: expansion}}
     # and checks the expansion of every abbreviation taken alone.
     f = LMPreparationFormula()
     for languageId, v in ABBREVIATIONS.items():
         f.setLanguageId(languageId)
         for abbr, gt in v.items():
             f.strText = abbr
             f._expandAbbreviations()
             # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
             self.assertEqual(gt.encode('utf-8'), f.strText.encode('utf-8'))

Exemple #12

0

Afficher le fichier

    def testGerman(self):
        # With language id 2, every input is expected to come back
        # unchanged from prepareText(), accented characters included.
        # u"" replaces ur"" (a syntax error on Python 3); there are no
        # backslashes, so the values are the same.
        testList = [(u"emmaüs", u"emmaüs"), (u"mein àrbeit", u"mein àrbeit"),
                    (u"môchten", u"môchten"), (u"mädchen", u"mädchen"),
                    (u"meîn", u"meîn"), (u"meïn", u"meïn")]

        f = LMPreparationFormula()
        f.setLanguageId(2)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #13

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testExpandNumberInWords(self):
        # Cases are (input text, expected text) and are checked through
        # verifyEqual with the formula and the method under test.
        # u"" replaces ur"" (a syntax error on Python 3); there are no
        # backslashes, so the values are the same.
        testList = [(u"A1", u"A. 1"), (u"P3B", u"P. 3 B."),
                    (u"P5B4", u"P. 5 B. 4"), (u"PPB5", u"PPB 5")]

        f = LMPreparationFormula()
        self.verifyEqual(testList, f, f._expandNumberInWords)

        # Keep-new-words off: one case per language id (1, 3, then 2)
        f.setKeepNewWords(False)
        testList = [(u"1er", u"1er")]
        f.setLanguageId(1)
        self.verifyEqual(testList, f, f._expandNumberInWords)

        testList = [(u"1st", u"1st")]
        f.setLanguageId(3)
        self.verifyEqual(testList, f, f._expandNumberInWords)

        testList = [(u"18-jähriger", u"18 -jähriger")]
        f.setLanguageId(2)
        self.verifyEqual(testList, f, f._expandNumberInWords)

Exemple #14

0

Afficher le fichier

    def testFrench(self):
        # With language id 1, every input is expected to come back
        # unchanged from prepareText(), accented characters included.
        # u"" replaces ur"" (a syntax error on Python 3); there are no
        # backslashes, so the values are the same.
        testList = [(u"à plus tard", u"à plus tard"),
                    (u"maîtres", u"maîtres"), (u"maïs", u"maïs"),
                    (u"emmaüs", u"emmaüs"), (u"mäman", u"mäman")]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #15

0

Afficher le fichier

    def testAll(self):
        # Each case is (input text, expected prepared text, flag); the
        # negated flag is handed to setExpandNumberInWords() before the
        # text is prepared.
        cases = [
            ("A dix heures", "à dix heures", False),
            ("1. Election", "premièrement election", False),
            ("R1", "r. un", False),
            (r"A1", r"a. un", False),
            (r"P3B", r"p. trois b.", False),
            (r"P5B4", r"p. cinq b. quatre", False),
            (r"PPB5", r"p. p. b. cinq", False),
            (r"rte", r"route", False),
            (r"Constantin, p. l. r., président de",
             r"constantin p. l. r. président de", False),
            (r"/ HES-SO und AdG/LA - auch im Winter / Sommer -",
             r"hes-so und adg/la auch im winter sommer", True),
        ]

        formula = LMPreparationFormula()
        formula.setLanguageId(1)

        for rawText, expected, knw in cases:
            formula.setText(rawText)
            formula.setExpandNumberInWords(not knw)
            prepared = formula.prepareText()
            self.assertEqual(expected, prepared)

Exemple #16

0

Afficher le fichier

    def testAll(self):
        # Each case is (input text, expected prepared text, keep-new-words
        # flag passed to setKeepNewWords()); language id 1 is used.
        # u"" replaces ur"" (a syntax error on Python 3); there are no
        # backslashes, so the values are the same.
        testList = [(u"A dix heures", u"à dix heures", False),
                    (u"1. Election", u"premièrement election", False),
                    (u"R1", u"r. un", False), (u"A1", u"a. un", False),
                    (u"P3B", u"p. trois b.", False),
                    (u"P5B4", u"p. cinq b. quatre", False),
                    (u"PPB5", u"p. p. b.  cinq", False),
                    (u"rte", u"route", False),
                    (u"Constantin, p. l. r., président de",
                     u"constantin p. l. r. président de", False),
                    (u"/ HES-SO und AdG/LA - auch im Winter / Sommer -",
                     u"hes-so und adg/la auch im winter sommer", True)]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt, knw in testList:
            f.setText(t)
            f.setKeepNewWords(knw)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #17

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testAll(self):
        # Cases: (input text, expected prepared text, value given to
        # setKeepNewWords()). The formula runs with language id 1.
        # u"" replaces ur"" (a syntax error on Python 3); there are no
        # backslashes, so the values are the same.
        testList = [(u"A dix heures", u"à dix heures", False),
                    (u"1. Election", u"premièrement election", False),
                    (u"R1", u"r. un", False),
                    (u"A1", u"a. un", False),
                    (u"P3B", u"p. trois b.", False),
                    (u"P5B4", u"p. cinq b. quatre", False),
                    (u"PPB5", u"p. p. b.  cinq", False),
                    (u"rte", u"route", False),
                    (u"Constantin, p. l. r., président de",
                     u"constantin p. l. r. président de", False),
                    (u"/ HES-SO und AdG/LA - auch im Winter / Sommer -",
                     u"hes-so und adg/la auch im winter sommer", True)]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt, knw in testList:
            f.setText(t)
            f.setKeepNewWords(knw)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #18

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : hdubey/asrt

    def testAll(self):
        # Cases are (input text, expected prepared text) for language id 1.
        # u"" replaces ur"" (a syntax error on Python 3); there are no
        # backslashes, so the values are the same.
        testList = [(u"A dix heures", u"à dix heures"),
                    (u"1. Election", u"premièrement election"),
                    (u"R1", u"r. un"),
                    (u"A1", u"a. un"),
                    (u"P3B", u"p. trois b."),
                    (u"P5B4", u"p. cinq b. quatre"),
                    (u"PPB5", u"p. p. b.  cinq"),
                    (u"rte", u"route"),
                    (u"Constantin, p. l. r., président de",
                     u"constantin p. l. r. président de")]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #19

0

Afficher le fichier

Fichier : TextCluster.py Projet : d-unknown-processor/asrt

    def __init__(self, document, sentenceText):
        """Set up a new cluster around a single sentence.

           param document     : stored on the instance as 'document'
           param sentenceText : handed to addElement() as the cluster data
        """
        # Bump the shared counter; its new value becomes the cluster key
        TextCluster.ID_COUNTER += 1
        newKey = str(TextCluster.ID_COUNTER)

        # Language attribute is initialised to 0, i.e. unknown
        initialAttributes = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]
        Cluster.__init__(self, newKey, initialAttributes)

        self.document = document

        # Sentence payload
        self.addElement(sentenceText)

        # Helper performing the LM normalization
        self.lmPreparationFormula = LMPreparationFormula()

Exemple #20

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : wolverineq/asrt

    def testAll(self):
        # Cases are (input text, expected prepared text) for language id 1.
        # u"" replaces ur"" (a syntax error on Python 3); there are no
        # backslashes, so the values are the same.
        testList = [(u"A dix heures", u"à dix heures"),
                    (u"1. Election", u"premièrement election"),
                    (u"R1", u"r. un"),
                    (u"A1", u"a. un"),
                    (u"P3B", u"p. trois b."),
                    (u"P5B4", u"p. cinq b. quatre"),
                    (u"PPB5", u"p. p. b.  cinq"),
                    (u"rte", u"route")]

        f = LMPreparationFormula()
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #21

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : hdubey/asrt

    def testNormalizeUtf8(self):
        # All UTF8MAP 'match' strings are joined and normalized; the result
        # must equal the joined 'sub' strings after trimming and after
        # replacing SPACEPATTERN matches by one space.
        testList = [match for match, sub, comment, languageId in UTF8MAP]
        gtList = [sub for match, sub, comment, languageId in UTF8MAP]

        # strip() alone covers the former rstrip().strip() chain
        strGt = u" ".join(gtList).strip()
        strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

        f = LMPreparationFormula()
        f.setText(u" ".join(testList))
        f._normalizeUtf8()
        strResult = f.getText()

        # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
        self.assertEqual(strGt.encode('utf-8'), strResult.encode('utf-8'))

Exemple #22

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : d-unknown-processor/asrt

    def testNormalizeUtf8(self):
        # Feeds every UTF8MAP 'match' string (space separated) through
        # _normalizeUtf8() and expects the space separated 'sub' strings,
        # trimmed and with SPACEPATTERN matches replaced by one space.
        testList = []
        gtList = []
        for match, sub, comment, languageId in UTF8MAP:
            testList.append(match)
            gtList.append(sub)

        # strip() alone covers the former rstrip().strip() chain
        strGt = u" ".join(gtList).strip()
        strGt = re.sub(SPACEPATTERN, u" ", strGt, flags=re.UNICODE)

        f = LMPreparationFormula()
        f.setText(u" ".join(testList))
        f._normalizeUtf8()
        strResult = f.getText()

        # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
        self.assertEqual(strGt.encode("utf-8"), strResult.encode("utf-8"))

Exemple #23

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testFrench(self):
        # Cases are (input text, expected prepared text) for language id 1.
        # u"" replaces ur"" (a syntax error on Python 3); there are no
        # backslashes, so the values are the same.
        testList = [(u"à plus tard", u"à plus tard"),
                    (u"maîtres", u"maîtres"),
                    (u"maïs", u"maïs"),
                    (u"emmaüs", u"emmaüs"),
                    (u"mäman", u"mäman"),
                    (u"1er", u"premier"),
                    (u"20ème", u"vingtième"),
                    (u"18-age", u"dix huit age")]

        # No new words are kept, hyphens are removed
        f = LMPreparationFormula()
        f.setKeepNewWords(False)
        f.setLanguageId(1)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual: assertEquals is a deprecated alias (gone in 3.12)
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

        # Keep new words implies keep hyphens in words
        f.setKeepNewWords(True)

        testList = [(u"18-age", u"18-age")]
        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #24

0

Afficher le fichier

    def testExpandAcronyms(self):
        # (input text, expected text) pairs checked through verifyEqual
        # against the formula's _expandAcronyms method.
        acronymCases = [
            (u"PDCB.", u"p. d. c. b."),
            (u"PDC:", u"p. d. c."),
        ]

        formula = LMPreparationFormula()
        self.verifyEqual(acronymCases, formula, formula._expandAcronyms)

Exemple #25

0

Afficher le fichier

    def testExpandNumberInWords(self):
        # (input text, expected text) pairs checked through verifyEqual
        # against the formula's _expandNumberInWords method.
        # u"" replaces ur"" (a syntax error on Python 3); there are no
        # backslashes, so the values are the same.
        testList = [(u"A1", u"A. 1"), (u"P3B", u"P. 3 B."),
                    (u"P5B4", u"P. 5 B. 4"), (u"PPB5", u"PPB 5")]

        f = LMPreparationFormula()
        self.verifyEqual(testList, f, f._expandNumberInWords)

Exemple #26

0

Afficher le fichier

Fichier : TextCluster.py Projet : colincwilson/asrt

class TextCluster(Cluster):
    """Concrete type representing a text sentence from
       a bilingual pdf document.

       Sentences are stored in utf-8 encoding.
    """

    logger = logging.getLogger("Asrt.TextCluster")

    LANGUAGE_ATTRIBUTE = 'language'
    ID_COUNTER = 0

    def __init__(self, document, sentenceText):
        """Constructor.

           param document     : owning document; its 'expandNumberInWords'
                                attribute configures the LM formula and
                                its 'regex_filter_list' is read by
                                _isTextValid()
           param sentenceText : sentence text, added with addElement()
        """
        # Give a unique id to cluster
        TextCluster.ID_COUNTER += 1

        # Unknown language
        attributesList = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        # Key is the counter value as a string
        Cluster.__init__(self, str(TextCluster.ID_COUNTER), attributesList)

        self.document = document

        # Actual data
        self.addElement(sentenceText)

        # LM normalization
        self.lmPreparationFormula = LMPreparationFormula()
        self.lmPreparationFormula.setExpandNumberInWords(
            document.expandNumberInWords)

    #####################
    # Getters and setters
    #
    def getTextSentence(self, noPunctuation=False, debug=False):
        """Return the associated utf-8 text sentence.

           param noPunctuation : accepted but not used by this method
           param debug         : when True, return all elements in reverse
                                 order, one per line, after a '---' line

           Return "" when the cluster has no element.
        """
        if len(self.elementList) == 0:
            return ""

        if debug:
            return "---\n%s\n" % "\n".join(reversed(self.elementList))

        return self.elementList[0]

    def getLanguageId(self):
        """Get the cluster language id.

           Return an integer between 0-4
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        strLanguage = self.getAttribute(self.LANGUAGE_ATTRIBUTE)
        return LANGUAGE2ID[strLanguage]

    def setTextSentence(self, textSentence):
        """Set the new text.

           param textSentence: an utf-8 encoded string
        """
        self.elementList[0] = textSentence

    def setLanguage(self, languageId):
        """Language for sentence.

           param 'languageId' : a value between 0 and 4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4

           Raise an Exception when 'languageId' is outside 0-4.
        """
        if languageId > 4 or languageId < 0:
            raise Exception("Unknown language")

        strLanguage = LANGUAGEID2LABELS[languageId]
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, strLanguage)

    #####################
    # Public interface
    #
    def clean(self):
        """Perform text cleaning.

           The only step performed here is replacing control characters
           by spaces (see removeControlCharacters).
        """
        strText = self.getTextSentence()
        strText = TextCluster.removeControlCharacters(strText)
        self.setTextSentence(strText)

    def classify(self, classifier):
        """Store the language returned by 'classifier' for this sentence.

           'classifier.classify' must return a (language, score) pair;
           only the language is kept.
        """
        language, _score = classifier.classify(self.getTextSentence())
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, language)

    def removeTextPunctuation(self):
        """Remove punctuation symbols.
        """
        strText = self.getTextSentence()
        strText = LanguageClassifier.removePunctuation(strText=strText,
                                                       removeDots=False)
        self.setTextSentence(strText)

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.

           Raise an Exception for any other language.
        """
        if self.isFrench():
            p = Punctuation()
            self.setTextSentence(p.replaceText(self.getTextSentence()))
        else:
            raise Exception(
                "Text verbalization is only implemented for French!")

    def prepareLM(self):
        """Prepare for language modeling.

           The sentence is replaced by the output of the LM preparation
           formula run with the cluster language id.
        """
        strText = self.getTextSentence()
        languageId = self.getLanguageId()

        self.lmPreparationFormula.setText(strText)
        self.lmPreparationFormula.setLanguageId(languageId)
        strText = self.lmPreparationFormula.prepareText()

        self.setTextSentence(strText)

    #####################
    # Predicates
    #
    def isValid(self):
        """Check validity of sentence.

           Heuristic is:
            - sentence length
            - number of words
            - number of digits groups
            - regex filters of the document (see _isTextValid)

           Return True when every check passes; each rejection is logged
           at info level.
        """
        strText = self.getTextSentence()

        # Nb characters
        if len(strText) > MAX_SENTENCE_LENGTH or \
           len(strText) < MIN_SENTENCE_LENGTH:
            # Both placeholders need a value: the text used to be missing,
            # which raised TypeError instead of logging the rejection.
            TextCluster.logger.info(
                "Discard sentence: inappropriate length: %d! '%s'",
                len(strText), strText)
            return False

        # Nb words
        nbWords = len(strText.split(' '))
        if nbWords < MIN_WORDS_COUNT or \
           nbWords > MAX_WORDS_COUNT:
            TextCluster.logger.info(
                "Discard sentence, not enough or to many words, wordsNum = %d! '%s'",
                nbWords, strText)
            return False

        # Nb digit groups. The split yields one more piece than there are
        # digit runs; the comparison is kept as it was.
        if len(re.split(r"\d+", strText)) > MAX_DIGITS_GROUPS:
            TextCluster.logger.info(
                "Discard sentence, to many groups of digits! '%s'", strText)
            return False

        # Use some regex
        if not self._isTextValid(strText):
            return False

        return True

    def isValid2ndStage(self):
        """Check validity of sentence, this is the 2nd stage checking.
           Added by Yang WANG on Aug 9, 2016

           Remove web address and check German orthography https://en.wikipedia.org/wiki/German_orthography .

           Return False when the text contains a web address marker, a
           word with a character outside the allowed set, or a word
           longer than MAX_WORD_LENGTH; True otherwise.
        """
        strText = self.getTextSentence()

        # To filter out web addresses
        if any(marker in strText
               for marker in ("http", "www", "html", "URL")):
            TextCluster.logger.info("Discard sentence, web address! '%s'",
                                    strText)
            return False

        # Regular expression verification by German orthography:
        # https://en.wikipedia.org/wiki/German_orthography
        # A word may only contain letters (umlauts and sharp s included),
        # dots and apostrophes.
        pattern = "^[a-zA-ZäöüÄÖÜß.']+$"
        recmped = re.compile(pattern)

        for word in strText.split():
            # German orthography check
            if recmped.match(word) is None:
                TextCluster.logger.info(
                    "Discard sentence, disobey German orthography rule (%s)! '%s' in '%s'",
                    pattern, word, strText)
                return False

            # Check for too long word
            if len(word) > MAX_WORD_LENGTH:
                TextCluster.logger.info(
                    "Discard sentence, too long word '%s' of length '%d'! In '%s'",
                    word, len(word), strText)
                return False

        return True

    def isFrench(self):
        """Content is French
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) == \
            FRENCH_LABEL

    def isGerman(self):
        """Content is German
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) == \
            GERMAN_LABEL

    def isItalian(self):
        """Content is Italian
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) == \
            ITALIAN_LABEL

    def isEnglish(self):
        """Content is English
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) == \
            ENGLISH_LABEL

    def getClusterInfo(self):
        """Return the cluster key in brackets followed by the sentence.
        """
        return "[%s] %s" % (self.key, self.getTextSentence())

    ########################
    # Implementation
    #
    def _isTextValid(self, strText):
        """Assess the validity of the text using
           a set of regex rules.

           'strText' is in utf-8 encoding

           Rules come from 'self.document.regex_filter_list' as
           (regex, language id) pairs. A rule is applied when its language
           id is 0 or equals the cluster language id; the first applied
           rule that matches makes the text invalid.
        """
        clusterLanguageId = self.getLanguageId()

        for regex, regexLanguageId in self.document.regex_filter_list:
            regexLanguageId = int(regexLanguageId)
            # Skip rules written for another language
            if regexLanguageId != clusterLanguageId and \
               regexLanguageId != 0:
                continue
            if re.search(regex, strText, flags=re.UNICODE) is not None:
                TextCluster.logger.info("Discard:%s\n%s", regex, strText)
                return False

        return True

    def __str__(self):
        """Override built in method.
        """
        return self.getClusterInfo()

    ########################
    # Implementation
    #
    @staticmethod
    def normalizeText(textUtterance):
        """Normalize text:

           - remove new line character at the end
           - remove prepended and trailing spaces
        """
        # strip() trims both ends, so a prior rstrip() adds nothing
        return textUtterance.strip()

    @staticmethod
    def removeControlCharacters(str):
        """Control characters are replaced
           by spaces.

           A character counts as control when its unicode category
           starts with "C".
        """
        # The parameter name shadows the builtin; it is kept for callers
        # that pass it by keyword.
        return "".join(
            ch if unicodedata.category(ch)[0] != "C" else ' '
            for ch in str)

Exemple #27

0

Afficher le fichier

 def testIsNoise(self):
     # Every punctuation character, repeated four times, must be reported
     # as noise by LMPreparationFormula._isNoise().
     for symbol in string.punctuation:
         repeated = symbol * 4
         self.assertTrue(LMPreparationFormula._isNoise(repeated))

Exemple #28

0

Afficher le fichier

class TextCluster(Cluster):
    """Concrete type representing a text sentence from
       a bilingual pdf document.

       The sentence is kept as the first element of the cluster
       and its language as a cluster attribute.
    """

    logger = logging.getLogger("Asrt.TextCluster")

    LANGUAGE_ATTRIBUTE = 'language'
    # Incremented for every instance, its value is used as cluster key
    ID_COUNTER = 0

    def __init__(self, document, sentenceText):
        """Constructor.

           param 'document'     : owning document, its 'regex_filter_list'
                                  is read by _isTextValid()
           param 'sentenceText' : the sentence to store
        """
        # Give a unique id to cluster
        TextCluster.ID_COUNTER += 1

        # Unknown language
        # NOTE(review): the initial value is the integer 0 whereas
        # setLanguage() stores a label, presumably LANGUAGE2ID accepts
        # both - confirm
        attributesList = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        # Key is the counter value
        Cluster.__init__(self, str(TextCluster.ID_COUNTER), attributesList)

        self.document = document

        # Actual data
        self.addElement(sentenceText)

        # LM normalization
        self.lmPreparationFormula = LMPreparationFormula()

    #####################
    # Getters and setters
    #
    def getTextSentence(self, noPunctuation=False, debug=False):
        """Return the associated text sentence.

           param 'noPunctuation' : not used by this method
           param 'debug'         : when True, return all stored elements
                                   in reverse order, one per line,
                                   preceded by a '---' line
           return the first stored element or "" for an empty cluster
        """
        if len(self.elementList) == 0:
            return ""

        if debug:
            return u"---\n%s\n" % u"\n".join(reversed(self.elementList))

        return self.elementList[0]

    def getLanguageId(self):
        """Get the cluster language id.

           The language attribute is translated with LANGUAGE2ID.
           Ids are expected to be:
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        strLanguage = self.getAttribute(self.LANGUAGE_ATTRIBUTE)
        return LANGUAGE2ID[strLanguage]

    def setTextSentence(self, textSentence):
        """Replace the stored sentence, i.e. the first element.

           param textSentence: the new sentence
        """
        self.elementList[0] = textSentence

    def setLanguage(self, languageId):
        """Language for sentence.

           param 'languageId' : a value between 0 and 4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4

           raise Exception when 'languageId' is outside of 0-4
        """
        if languageId > 4 or languageId < 0:
            raise Exception("Unknown language")

        strLanguage = LANGUAGEID2LABELS[languageId]
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, strLanguage)

    #####################
    # Public interface
    #
    def clean(self):
        """Perform text cleaning.

           Control characters are replaced by spaces. Spaces are
           neither collapsed nor stripped by this method.
        """
        strText = self.getTextSentence()
        strText = TextCluster.removeControlCharacters(strText)
        self.setTextSentence(strText)

    def classify(self, classifier):
        """Store the language returned by 'classifier' for the sentence.

           param 'classifier' : object whose classify(text) returns
                                a (language, score) pair, the score
                                is ignored
        """
        l, score = classifier.classify(self.getTextSentence())
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, l)

    def removeTextPunctuation(self):
        """Remove punctuation symbols using
           LanguageClassifier.removePunctuation with removeDots=False.
        """
        strText = self.getTextSentence()
        strText = LanguageClassifier.removePunctuation(strText=strText,
                                                       removeDots=False)
        self.setTextSentence(strText)

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.

           raise Exception when the sentence is not French
        """
        if self.isFrench():
            p = Punctuation()
            self.setTextSentence(p.replaceText(self.getTextSentence()))
        else:
            raise Exception(
                "Text verbalization is only implemented for French!")

    def prepareLM(self):
        """Prepare for language modeling.

           The sentence and its language id are given to the
           LMPreparationFormula, the prepared text replaces the sentence.
        """
        strText = self.getTextSentence()
        languageId = self.getLanguageId()

        self.lmPreparationFormula.setText(strText)
        self.lmPreparationFormula.setLanguageId(languageId)
        strText = self.lmPreparationFormula.prepareText()

        self.setTextSentence(strText)

    #####################
    # Predicates
    #
    def isValid(self):
        """Check validity of sentence.

           Heuristic is:
            - sentence length
            - number of words
            - number of digits groups
            - regex filters of the document

           return True when all checks pass
        """
        strText = self.getTextSentence()

        # Nb characters
        if len(strText) > MAX_SENTENCE_LENGTH or\
           len(strText) < MIN_SENTENCE_LENGTH:
            TextCluster.logger.info(
                "Discard sentence: inappropriate length: %d!" % len(strText))
            return False

        # Nb words, consecutive spaces produce empty words
        # which are counted too
        nbWords = len(strText.split(' '))
        if nbWords < MIN_WORDS_COUNT or \
           nbWords > MAX_WORDS_COUNT:
            TextCluster.logger.info(
                "Discard sentence, not enough or to many words!")
            return False

        # Nb digit groups, the split returns one more piece
        # than there are runs of digits
        if len(re.split(r"\d+", strText)) > MAX_DIGITS_GROUPS:
            TextCluster.logger.info(
                "Discard sentence, to many groups of digits!")
            return False

        # Use some regex
        if not self._isTextValid(strText):
            return False

        return True

    def isFrench(self):
        """Content is French
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     FRENCH_LABEL

    def isGerman(self):
        """Content is German
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     GERMAN_LABEL

    def isItalian(self):
        """Content is Italian
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ITALIAN_LABEL

    def isEnglish(self):
        """Content is English
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ENGLISH_LABEL

    def getClusterInfo(self):
        """Return the key between brackets followed by the sentence.
        """
        return "[%s] %s" % (self.key, self.getTextSentence())

    ########################
    # Implementation
    #
    def _isTextValid(self, strText):
        """Assess the validity of the text using
           a set of regex rules.

           Each entry of self.document.regex_filter_list is a
           (regex, languageId) pair. A regex is applied when its
           language id is 0 or the one of this cluster.

           return False as soon as an applied regex matches
        """
        clusterLanguageId = self.getLanguageId()

        # Some regex
        for regex, regexLanguageId in self.document.regex_filter_list:
            regexLanguageId = int(regexLanguageId)
            # Does it match the text language
            if regexLanguageId != clusterLanguageId and \
               regexLanguageId != 0:
                continue
            # The search is case sensitive (re.IGNORECASE is not set)
            if re.search(regex, strText, flags=re.UNICODE) is not None:
                TextCluster.logger.info(
                    "Discard:%s\n%s" %
                    (regex.encode("utf-8"), strText.encode("utf-8")))
                return False

        return True

    def __str__(self):
        """Override built in method.

           Return the cluster info encoded in utf-8.
        """
        key = self.getClusterInfo()
        return str(key.encode('utf-8'))

    ########################
    # Implementation
    #
    @staticmethod
    def normalizeText(textUtterance):
        """Normalize text:

           - remove new line character at the end
           - remove prepended and trailing spaces
        """
        # strip() trims both ends, a separate rstrip() is not needed
        return textUtterance.strip()

    @staticmethod
    def removeControlCharacters(str):
        """Control characters, i.e. characters whose Unicode
           category starts with "C", are replaced by spaces.
        """
        lineList = []
        for ch in str:
            if unicodedata.category(ch)[0] != "C":
                lineList.append(ch)
            else:
                lineList.append(' ')

        return "".join(lineList)

Exemple #29

0

Afficher le fichier

    def testFrench(self):
        """French preparation with and without number expansion.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(1)

        # Accented words are unchanged, numbers are spelled out
        # and the hyphen of '18-age' is dropped
        expandedCases = (
            (r"à plus tard", r"à plus tard"),
            (r"maîtres", r"maîtres"),
            (r"maïs", r"maïs"),
            (r"emmaüs", r"emmaüs"),
            (r"mäman", r"mäman"),
            (r"1er", r"premier"),
            (r"20ème", r"vingtième"),
            (r"18-age", r"dix huit age"),
        )
        for source, expected in expandedCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

        # Without number expansion the hyphenated token is kept as is
        formula.setExpandNumberInWords(False)

        keptCases = ((r"18-age", r"18-age"),)
        for source, expected in keptCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

Exemple #30

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

 def testIsNoise(self):
     """Four times the same punctuation symbol must be seen as noise.
     """
     for mark in string.punctuation:
         noisyText = 4 * mark
         self.assertTrue(LMPreparationFormula._isNoise(noisyText))

Exemple #31

0

Afficher le fichier

    def testNormalizePunctuation(self):
        """Punctuation normalization, first without language
           then with language id 1.
        """
        f = LMPreparationFormula()
        f.setText(u"".join(string.punctuation + u"‰"))
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        # Only these symbols are expected to remain
        gt = u"$%&'@\u2030"
        # assertEqual is used, assertEquals is a deprecated alias
        self.assertEqual(gt, strResult)

        # Second pass on the already normalized text with language id 1,
        # the remaining symbols are expected to be replaced by words
        f.setLanguageId(1)
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = "dollars pourcent et ' at pour mille"
        self.assertEqual(gt, strResult)

Exemple #32

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testNormalizePunctuation(self):
        """Punctuation normalization, first without language
           then with language id 1.
        """
        f = LMPreparationFormula()
        f.setText(u"".join(string.punctuation + u"‰"))
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        # Hyphen and slash are expected to remain as well
        gt = u"$%&'-/@\u2030"
        # assertEqual is used, assertEquals is a deprecated alias
        self.assertEqual(gt, strResult)

        # Second pass on the already normalized text with language id 1,
        # the expected result has words in place of most symbols
        f.setLanguageId(1)
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = "dollars pourcent et '-/ at pour mille"
        self.assertEqual(gt, strResult)

Exemple #33

0

Afficher le fichier

    def testGerman(self):
        """German preparation with and without number expansion.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(2)

        # Accented words are unchanged, the number of '18-jähriger'
        # is spelled out and its hyphen dropped
        expandedCases = (
            (r"emmaüs", r"emmaüs"),
            ("mein àrbeit", "mein àrbeit"),
            (r"môchten", r"môchten"),
            (r"mädchen", r"mädchen"),
            (r"meîn", r"meîn"),
            (r"meïn", r"meïn"),
            (r"18-jähriger", r"achtzehn jähriger"),
        )
        for source, expected in expandedCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

        # Without number expansion the hyphenated token is kept as is
        keptCases = ((r"18-jähriger", r"18-jähriger"),)
        formula.setExpandNumberInWords(False)

        for source, expected in keptCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

Exemple #34

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testEnglish(self):
        """English preparation with and without keeping new words.
        """
        # Plain u"" literals are used: the ur"" prefix is a syntax
        # error on Python 3 and no literal contains a backslash
        testList = [(u"object 5", u"object five"),
                    (u"1st", u"first")]

        # New words are not kept, numbers are spelled out
        f = LMPreparationFormula()
        f.setKeepNewWords(False)
        f.setLanguageId(3)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual is used, assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

        # New words are kept, the hyphenated token is unchanged
        testList = [(u"18-year-old", u"18-year-old")]
        f.setKeepNewWords(True)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #35

0

Afficher le fichier

    def testEnglish(self):
        """English preparation with and without number expansion.
        """
        formula = LMPreparationFormula()
        formula.setExpandNumberInWords(True)
        formula.setLanguageId(3)

        # Cardinal and ordinal numbers are spelled out
        expandedCases = (
            (r"object 5", r"object five"),
            (r"1st", r"first"),
        )
        for source, expected in expandedCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

        # Without number expansion the hyphenated token is kept as is
        keptCases = ((r"18-year-old", r"18-year-old"),)
        formula.setExpandNumberInWords(False)

        for source, expected in keptCases:
            formula.setText(source)
            prepared = formula.prepareText()
            self.assertEqual(expected.encode('utf-8'),
                             prepared.encode('utf-8'))

Exemple #36

0

Afficher le fichier

    def testNormalizePunctuation(self):
        """Punctuation normalization, first without language
           then with language id 1.
        """
        formula = LMPreparationFormula()
        # Joining the characters of a string on "" gives the string back
        formula.setText(string.punctuation + "‰")
        formula.setExpandNumberInWords(False)
        formula._normalizePunctuation(self.allPunctList)

        # Only these symbols are expected to remain
        expected = "$%&'-/@‰"
        self.assertEqual(expected, formula.getText())

        # Second pass on the already normalized text with language id 1
        formula.setLanguageId(1)
        formula._normalizePunctuation(self.allPunctList)

        expected = "dollars pourcent et '-/ at pour mille"
        self.assertEqual(expected, formula.getText())

Exemple #37

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : idiap/asrt

    def testGerman(self):
        """German preparation with and without keeping new words.
        """
        # Plain u"" literals are used: the ur"" prefix is a syntax
        # error on Python 3 and no literal contains a backslash
        testList = [(u"emmaüs", u"emmaüs"),
                    (u"mein àrbeit", u"mein àrbeit"),
                    (u"môchten", u"môchten"),
                    (u"mädchen", u"mädchen"),
                    (u"meîn", u"meîn"),
                    (u"meïn", u"meïn"),
                    (u"18-jähriger", u"achtzehn jähriger")]

        # No new words are kept
        f = LMPreparationFormula()
        f.setKeepNewWords(False)
        f.setLanguageId(2)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            # assertEqual is used, assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

        # New words are kept
        testList = [(u"18-jähriger", u"18-jähriger")]
        f.setKeepNewWords(True)

        for t, gt in testList:
            f.setText(t)
            r = f.prepareText()
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

Exemple #38

0

Afficher le fichier

Fichier : FormulaLMPreparationUnitTest.py Projet : d-unknown-processor/asrt

    def testNormalizePunctuation(self):
        """Punctuation normalization, first without language
           then with language id 1.
        """
        f = LMPreparationFormula()
        f.setText(u"".join(string.punctuation))
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        # Only these symbols are expected to remain
        gt = "$%&' @"
        # assertEqual is used, assertEquals is a deprecated alias
        self.assertEqual(gt, strResult)

        # Second pass on the already normalized text with language id 1
        f.setLanguageId(1)
        f._normalizePunctuation(self.allPunctList)
        strResult = f.getText()

        gt = "dollars pourcent et ' at"
        self.assertEqual(gt, strResult)

Exemple #39

0

Afficher le fichier

Fichier : TextCluster.py Projet : idiap/asrt

class TextCluster(Cluster):
    """Concrete type representing a text sentence from
       a bilingual pdf document.

       The sentence is kept as the first element of the cluster
       and its language as a cluster attribute.
    """

    logger = logging.getLogger("Asrt.TextCluster")

    LANGUAGE_ATTRIBUTE = 'language'
    # Incremented for every instance, its value is used as cluster key
    ID_COUNTER = 0

    def __init__(self, document, sentenceText):
        """Constructor.

           param 'document'     : owning document, its 'keepNewWords'
                                  and 'regex_filter_list' are read
           param 'sentenceText' : the sentence to store
        """
        # Give a unique id to cluster
        TextCluster.ID_COUNTER += 1

        # Unknown language
        # NOTE(review): the initial value is the integer 0 whereas
        # setLanguage() stores a label, presumably LANGUAGE2ID accepts
        # both - confirm
        attributesList = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        # Key is the counter value
        Cluster.__init__(self, str(TextCluster.ID_COUNTER), attributesList)

        self.document = document

        # Actual data
        self.addElement(sentenceText)

        # LM normalization
        self.lmPreparationFormula = LMPreparationFormula()
        self.lmPreparationFormula.setKeepNewWords(document.keepNewWords)

    #####################
    # Getters and setters
    #
    def getTextSentence(self, noPunctuation=False, debug=False):
        """Return the associated text sentence.

           param 'noPunctuation' : not used by this method
           param 'debug'         : when True, return all stored elements
                                   in reverse order, one per line,
                                   preceded by a '---' line
           return the first stored element or "" for an empty cluster
        """
        if len(self.elementList) == 0:
            return ""

        if debug:
            return u"---\n%s\n" % u"\n".join(reversed(self.elementList))

        return self.elementList[0]

    def getLanguageId(self):
        """Get the cluster language id.

           The language attribute is translated with LANGUAGE2ID.
           Ids are expected to be:
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        strLanguage = self.getAttribute(self.LANGUAGE_ATTRIBUTE)
        return LANGUAGE2ID[strLanguage]

    def setTextSentence(self, textSentence):
        """Replace the stored sentence, i.e. the first element.

           param textSentence: the new sentence
        """
        self.elementList[0] = textSentence

    def setLanguage(self, languageId):
        """Language for sentence.

           param 'languageId' : a value between 0 and 4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4

           raise Exception when 'languageId' is outside of 0-4
        """
        if languageId > 4 or languageId < 0:
            raise Exception("Unknown language")

        strLanguage = LANGUAGEID2LABELS[languageId]
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, strLanguage)

    #####################
    # Public interface
    #
    def clean(self):
        """Perform text cleaning.

           Control characters are replaced by spaces. Spaces are
           neither collapsed nor stripped by this method.
        """
        strText = self.getTextSentence()
        strText = TextCluster.removeControlCharacters(strText)
        self.setTextSentence(strText)

    def classify(self, classifier):
        """Store the language returned by 'classifier' for the sentence.

           param 'classifier' : object whose classify(text) returns
                                a (language, score) pair, the score
                                is ignored
        """
        l, score = classifier.classify(self.getTextSentence())
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, l)

    def removeTextPunctuation(self):
        """Remove punctuation symbols using
           LanguageClassifier.removePunctuation with removeDots=False.
        """
        strText = self.getTextSentence()
        strText = LanguageClassifier.removePunctuation(strText=strText,
                                                       removeDots=False)
        self.setTextSentence(strText)

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.

           raise Exception when the sentence is not French
        """
        if self.isFrench():
            p = Punctuation()
            self.setTextSentence(p.replaceText(self.getTextSentence()))
        else:
            raise Exception("Text verbalization is only implemented for French!")

    def prepareLM(self):
        """Prepare for language modeling.

           The sentence and its language id are given to the
           LMPreparationFormula, the prepared text replaces the sentence.
        """
        strText = self.getTextSentence()
        languageId = self.getLanguageId()

        self.lmPreparationFormula.setText(strText)
        self.lmPreparationFormula.setLanguageId(languageId)
        strText = self.lmPreparationFormula.prepareText()

        self.setTextSentence(strText)

    #####################
    # Predicates
    #
    def isValid(self):
        """Check validity of sentence.

           Heuristic is:
            - sentence length
            - number of words
            - number of digits groups
            - regex filters of the document

           return True when all checks pass
        """
        strText = self.getTextSentence()

        # Nb characters
        if len(strText) > MAX_SENTENCE_LENGTH or\
           len(strText) < MIN_SENTENCE_LENGTH:
            TextCluster.logger.info("Discard sentence: inappropriate length: %d! '%s'" % (len(strText), strText.encode("utf-8") ) )
            return False

        # Nb words, consecutive spaces produce empty words
        # which are counted too
        nbWords = len(strText.split(' '))
        if nbWords < MIN_WORDS_COUNT or \
           nbWords > MAX_WORDS_COUNT:
            TextCluster.logger.info("Discard sentence, not enough or to many words, wordsNum = %d! '%s'" % ( nbWords, strText.encode("utf-8" ) ) )
            return False

        # Nb digit groups, the split returns one more piece
        # than there are runs of digits
        if len(re.split(r"\d+", strText)) > MAX_DIGITS_GROUPS:
            TextCluster.logger.info("Discard sentence, to many groups of digits! '%s'" % strText.encode("utf-8" ) )
            return False

        # Use some regex
        if not self._isTextValid(strText):
            return False

        return True

    def isValid2ndStage(self):
        """Check validity of sentence, this is the 2nd stage checking.
           Added by Yang WANG on Aug 9, 2016

           The sentence is rejected when:
            - it contains "http", "www", "html" or "URL"
            - one of its whitespace separated words has a character
              other than a-z, A-Z, German umlauts, sharp s, dot and
              apostrophe, see
              https://en.wikipedia.org/wiki/German_orthography
            - the utf-8 encoding of one of its words is longer
              than MAX_WORD_LENGTH

           return True when all checks pass
        """
        strText = self.getTextSentence()

        # To filter out web addresses
        if any(marker in strText
               for marker in ("http", "www", "html", "URL")):
            TextCluster.logger.info("Discard sentence, web address! '%s'" % strText.encode("utf-8" ) )
            return False

        # A word may only be made of letters, dots and apostrophes
        pattern = u"^[a-zA-ZäöüÄÖÜß.']+$"

        # Matching is done on utf-8 encoded values
        recmped = re.compile(pattern.encode("utf-8"))
        for word in strText.split():
            # Encoded once, used by both checks
            encodedWord = word.encode("utf-8")

            # German orthography check
            if recmped.match(encodedWord) is None:
                TextCluster.logger.info("Discard sentence, disobey German orthography rule (%s)! '%s' in '%s'" \
                    % ( pattern.encode( "utf-8" ), encodedWord, strText.encode("utf-8" ) ) )
                return False

            # Check for too long word, the length is the one
            # of the encoded value
            if len(encodedWord) > MAX_WORD_LENGTH:
                TextCluster.logger.info("Discard sentence, too long word '%s' of length '%d'! In '%s'" \
                    % ( encodedWord, len(encodedWord), strText.encode("utf-8" ) ) )
                return False

        return True

    def isFrench(self):
        """Content is French
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     FRENCH_LABEL

    def isGerman(self):
        """Content is German
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     GERMAN_LABEL

    def isItalian(self):
        """Content is Italian
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ITALIAN_LABEL

    def isEnglish(self):
        """Content is English
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ENGLISH_LABEL

    def getClusterInfo(self):
        """Return the key between brackets followed by the sentence.
        """
        return "[%s] %s" % (self.key, self.getTextSentence())

    ########################
    # Implementation
    #
    def _isTextValid(self, strText):
        """Assess the validity of the text using
           a set of regex rules.

           Each entry of self.document.regex_filter_list is a
           (regex, languageId) pair. A regex is applied when its
           language id is 0 or the one of this cluster.

           return False as soon as an applied regex matches
        """
        clusterLanguageId = self.getLanguageId()

        # Some regex
        for regex, regexLanguageId in self.document.regex_filter_list:
            regexLanguageId = int(regexLanguageId)
            # Does it match the text language
            if regexLanguageId != clusterLanguageId and \
               regexLanguageId != 0:
                continue
            # The search is case sensitive (re.IGNORECASE is not set)
            if re.search(regex, strText, flags=re.UNICODE) is not None:
                TextCluster.logger.info("Discard:%s\n%s" % (regex.encode("utf-8"), strText.encode("utf-8")))
                return False

        return True

    def __str__(self):
        """Override built in method.

           Return the cluster info encoded in utf-8.
        """
        key = self.getClusterInfo()
        return str(key.encode('utf-8'))

    ########################
    # Implementation
    #
    @staticmethod
    def normalizeText(textUtterance):
        """Normalize text:

           - remove new line character at the end
           - remove prepended and trailing spaces
        """
        # strip() trims both ends, a separate rstrip() is not needed
        return textUtterance.strip()

    @staticmethod
    def removeControlCharacters(str):
        """Control characters, i.e. characters whose Unicode
           category starts with "C", are replaced by spaces.
        """
        lineList = []
        for ch in str:
            if unicodedata.category(ch)[0] != "C":
                lineList.append(ch)
            else:
                lineList.append(' ')

        return "".join(lineList)

Exemple #40

0

Afficher le fichier

Fichier : TextCluster.py Projet : d-unknown-processor/asrt

class TextCluster(Cluster):
    """Concrete type representing a text sentence from
       a bilingual pdf document.

       The sentence is kept as the first element of the cluster
       and its language as a cluster attribute.
    """

    logger = logging.getLogger("Asrt.TextCluster")

    LANGUAGE_ATTRIBUTE = 'language'
    # Incremented for every instance, its value is used as cluster key
    ID_COUNTER = 0

    def __init__(self, document, sentenceText):
        """Constructor.

           param 'document'     : owning document, its 'regex_filter_list'
                                  is read by _isTextValid()
           param 'sentenceText' : the sentence to store
        """
        # Give a unique id to cluster
        TextCluster.ID_COUNTER += 1

        # Unknown language
        # NOTE(review): the initial value is the integer 0 whereas
        # setLanguage() stores a label, presumably LANGUAGE2ID accepts
        # both - confirm
        attributesList = [(TextCluster.LANGUAGE_ATTRIBUTE, 0)]

        # Key is the counter value
        Cluster.__init__(self, str(TextCluster.ID_COUNTER), attributesList)

        self.document = document

        # Actual data
        self.addElement(sentenceText)

        # LM normalization
        self.lmPreparationFormula = LMPreparationFormula()

    #####################
    # Getters and setters
    #
    def getTextSentence(self, noPunctuation=False, debug=False):
        """Return the associated text sentence.

           param 'noPunctuation' : not used by this method
           param 'debug'         : when True, return all stored elements
                                   in reverse order, one per line,
                                   preceded by a '---' line
           return the first stored element or "" for an empty cluster
        """
        if len(self.elementList) == 0:
            return ""

        if debug:
            return u"---\n%s\n" % u"\n".join(reversed(self.elementList))

        return self.elementList[0]

    def getLanguageId(self):
        """Get the cluster language id.

           The language attribute is translated with LANGUAGE2ID.
           Ids are expected to be:
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        strLanguage = self.getAttribute(self.LANGUAGE_ATTRIBUTE)
        return LANGUAGE2ID[strLanguage]

    def setTextSentence(self, textSentence):
        """Replace the stored sentence, i.e. the first element.

           param textSentence: the new sentence
        """
        self.elementList[0] = textSentence

    def setLanguage(self, languageId):
        """Language for sentence.

           param 'languageId' : a value between 0 and 4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4

           raise Exception when 'languageId' is outside of 0-4
        """
        if languageId > 4 or languageId < 0:
            raise Exception("Unknown language")

        strLanguage = LANGUAGEID2LABELS[languageId]
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, strLanguage)

    #####################
    # Public interface
    #
    def clean(self):
        """Perform text cleaning.

           Control characters are replaced by spaces. Spaces are
           neither collapsed nor stripped by this method.
        """
        strText = self.getTextSentence()
        strText = TextCluster.removeControlCharacters(strText)
        self.setTextSentence(strText)

    def classify(self, classifier):
        """Store the language returned by 'classifier' for the sentence.

           param 'classifier' : object whose classify(text) returns
                                a (language, score) pair, the score
                                is ignored
        """
        l, score = classifier.classify(self.getTextSentence())
        self.setAttribute(TextCluster.LANGUAGE_ATTRIBUTE, l)

    def removeTextPunctuation(self):
        """Remove punctuation symbols using
           LanguageClassifier.removePunctuation with removeDots=False.
        """
        strText = self.getTextSentence()
        strText = LanguageClassifier.removePunctuation(strText=strText,
                                                       removeDots=False)
        self.setTextSentence(strText)

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.

           raise Exception when the sentence is not French
        """
        if self.isFrench():
            p = Punctuation()
            self.setTextSentence(p.replaceText(self.getTextSentence()))
        else:
            raise Exception("Text verbalization is only implemented for French!")

    def prepareLM(self):
        """Prepare for language modeling.

           The sentence and its language id are given to the
           LMPreparationFormula, the prepared text replaces the sentence.
        """
        strText = self.getTextSentence()
        languageId = self.getLanguageId()

        self.lmPreparationFormula.setText(strText)
        self.lmPreparationFormula.setLanguageId(languageId)
        strText = self.lmPreparationFormula.prepareText()

        self.setTextSentence(strText)

    #####################
    # Predicates
    #
    def isValid(self):
        """Check validity of sentence.

           Heuristic is:
            - sentence length
            - number of words
            - number of digits groups
            - regex filters of the document

           return True when all checks pass
        """
        strText = self.getTextSentence()

        # Nb characters
        if len(strText) > MAX_SENTENCE_LENGTH or\
           len(strText) < MIN_SENTENCE_LENGTH:
            TextCluster.logger.info("Discard sentence: inappropriate length: %d!" % len(strText))
            return False

        # Nb words, consecutive spaces produce empty words
        # which are counted too
        nbWords = len(strText.split(' '))
        if nbWords < MIN_WORDS_COUNT or \
           nbWords > MAX_WORDS_COUNT:
            TextCluster.logger.info("Discard sentence, not enough or to many words!")
            return False

        # Nb digit groups, the split returns one more piece
        # than there are runs of digits
        if len(re.split(r"\d+", strText)) > MAX_DIGITS_GROUPS:
            TextCluster.logger.info("Discard sentence, to many groups of digits!")
            return False

        # Use some regex
        if not self._isTextValid(strText):
            return False

        return True

    def isFrench(self):
        """Content is French
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     FRENCH_LABEL

    def isGerman(self):
        """Content is German
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     GERMAN_LABEL

    def isItalian(self):
        """Content is Italian
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ITALIAN_LABEL

    def isEnglish(self):
        """Content is English
        """
        return self.getAttribute(TextCluster.LANGUAGE_ATTRIBUTE) ==\
                     ENGLISH_LABEL

    def getClusterInfo(self):
        """Return the key between brackets followed by the sentence.
        """
        return "[%s] %s" % (self.key, self.getTextSentence())

    ########################
    # Implementation
    #
    def _isTextValid(self, strText):
        """Assess the validity of the text using
           a set of regex rules.

           Each entry of self.document.regex_filter_list is a
           (regex, languageId) pair. A regex is applied when its
           language id is 0 or the one of this cluster.

           return False as soon as an applied regex matches
        """
        clusterLanguageId = self.getLanguageId()

        # Some regex
        for regex, regexLanguageId in self.document.regex_filter_list:
            regexLanguageId = int(regexLanguageId)
            # Does it match the text language
            if regexLanguageId != clusterLanguageId and \
               regexLanguageId != 0:
                continue
            # The search is case sensitive (re.IGNORECASE is not set)
            if re.search(regex, strText, flags=re.UNICODE) is not None:
                TextCluster.logger.info("Discard:%s\n%s" % (regex.encode("utf-8"), strText.encode("utf-8")))
                return False

        return True

    def __str__(self):
        """Override built in method.

           Return the cluster info encoded in utf-8.
        """
        key = self.getClusterInfo()
        return str(key.encode('utf-8'))

    ########################
    # Implementation
    #
    @staticmethod
    def normalizeText(textUtterance):
        """Normalize text:

           - remove new line character at the end
           - remove prepended and trailing spaces
        """
        # strip() trims both ends, a separate rstrip() is not needed
        return textUtterance.strip()

    @staticmethod
    def removeControlCharacters(str):
        """Control characters, i.e. characters whose Unicode
           category starts with "C", are replaced by spaces.
        """
        lineList = []
        for ch in str:
            if unicodedata.category(ch)[0] != "C":
                lineList.append(ch)
            else:
                lineList.append(' ')

        return "".join(lineList)