Python RegexList 예제들, asrt.common.RegularExpressionList.RegexList Python 예제들

예제 #1

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: wolverineq/asrt

    def testApostrophe(self):
        """Check the apostrophe rules on the single sample "d'avant"."""
        apostropheRules = RegexList.removeComments(APOSTHROPHELIST)
        formula = RegularExpressionFormula(None, apostropheRules)

        #Presumably (input, expected output) pairs -- confirm in verifyEqual
        expectedPairs = [(u"d'avant", u"d' avant")]

        #Last argument presumably is the language id -- confirm in verifyEqual
        self.verifyEqual(expectedPairs, formula, 1)

예제 #2

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: hdubey/asrt

    def testApostrophe(self):
        """Run the apostrophe formula against one sample, "d'avant"."""
        rules = RegexList.removeComments(APOSTHROPHELIST)
        apostropheFormula = RegularExpressionFormula(None, rules)

        #Presumably (input, expected output) pairs -- confirm in verifyEqual
        cases = [
            (u"d'avant", u"d' avant"),
        ]

        #Last argument presumably is the language id -- confirm in verifyEqual
        self.verifyEqual(cases, apostropheFormula, 1)

예제 #3

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: wolverineq/asrt

    def testDates(self):
        """Check the date rules on dot and slash separated dates."""
        dateRules = RegexList.removeComments(DATEREGEXLIST)
        formula = RegularExpressionFormula(None, dateRules)

        #Presumably (input, expected output) pairs -- confirm in verifyEqual
        expectedPairs = [
            (u"01.01.2015", u"01 01 2015"),
            (u"01/01/2015", u"01 01 2015"),
            (u"01.01.15", u"01 01 15"),
        ]

        #Last argument presumably is the language id -- confirm in verifyEqual
        self.verifyEqual(expectedPairs, formula, 0)

예제 #4

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: hdubey/asrt

    def testDates(self):
        """Run the date formula on dates written with '.' and '/'."""
        rules = RegexList.removeComments(DATEREGEXLIST)
        dateFormula = RegularExpressionFormula(None, rules)

        #Presumably (input, expected output) pairs -- confirm in verifyEqual
        cases = [(u"01.01.2015", u"01 01 2015"),
                 (u"01/01/2015", u"01 01 2015"),
                 (u"01.01.15", u"01 01 15")]

        #Last argument presumably is the language id -- confirm in verifyEqual
        self.verifyEqual(cases, dateFormula, 0)

예제 #5

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: wolverineq/asrt

    def testAcronyms(self):
        """Check the acronym rules.

           The formula output is compared with the expected
           string once everything matching ACRONYMDELIMITER
           has been removed from it.
        """
        f = RegularExpressionFormula(None,
                RegexList.removeComments(ACRONYMREGEXLIST))

        #(input, expected output) pairs
        testList = [(u"ADG SPO PS",u"a. d. g.  s. p. o.  p. s."),
                    (u"ADG SPO PS PDCC",u"a. d. g.  s. p. o.  p. s.  p. d. c. c."),
                    (u"A ADG SPO PS PDCCC",u"A a. d. g.  s. p. o.  p. s.  p. d. c. c. c."),
                    (u"ABCDs ABCs ABs",u"a. b. c. d. s.  a. b. c. s.  a. b. s.")]

        for t, gt in testList:
            resultString = f.apply(t, 0, False)
            #Strip whatever matches the acronym delimiter before comparing
            resultString = re.sub(ACRONYMDELIMITER, u"", resultString, flags=re.UNICODE)
            #assertEqual is used as assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), resultString.encode('utf-8'))

예제 #6

0

파일 보기

파일: FormulaRegularExpression.py 프로젝트: idiap/asrt

    def apply(self, strText, languageId, debug=False):
        """Apply regular expressions to 'strText'.

           When no substitution pattern is available yet, the
           patterns are first loaded from 'self.rulesFile'.

           param strText: the text to process
           param languageId: forwarded to 'applyRegularExpressions'
           param debug: forwarded to 'applyRegularExpressions'

           return 'strText' untouched when there is neither a
                  pattern nor a rules file, otherwise the value
                  returned by 'applyRegularExpressions'
        """
        if not self.substitutionPatternList:
            #Nothing to apply and nothing to load
            if self.rulesFile is None:
                return strText

            self.logger.info("Loading regexes from %s", self.rulesFile)
            self.substitutionPatternList = \
                RegexList.loadFromFile(self.rulesFile)

        return RegularExpressionFormula.applyRegularExpressions(strText,
                        self.substitutionPatternList, languageId, debug)

예제 #7

0

파일 보기

파일: DataPreparationAPI.py 프로젝트: ondrejklejch/asrt

    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.

           Nothing is done when no regex file was given or
           when regexes are already present, i.e. the
           substitution formula has patterns or the validation
           pattern list is not empty.

           The loaded list is handed over to 'setRegexList'.
        """
        #User did not specify any rules
        if self.regexFile is None:
            return

        #Regexes are already loaded in the API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

예제 #8

0

파일 보기

파일: FormulaRegularExpression.py 프로젝트: ondrejklejch/asrt

    def apply(self, strText, languageId, debug=False):
        """Apply regular expressions to 'strText'.

           The substitution patterns are loaded from
           'self.rulesFile' the first time they are needed.

           param strText: the text to process
           param languageId: forwarded to 'applyRegularExpressions'
           param debug: forwarded to 'applyRegularExpressions'

           return 'strText' as is when no pattern and no rules
                  file are available, otherwise the value
                  returned by 'applyRegularExpressions'
        """
        if not self.substitutionPatternList:
            if self.rulesFile is None:
                #No pattern and no file to load them from
                return strText

            self.logger.info("Loading regexes from %s", self.rulesFile)
            self.substitutionPatternList = \
                RegexList.loadFromFile(self.rulesFile)

        return RegularExpressionFormula.applyRegularExpressions(
            strText, self.substitutionPatternList, languageId, debug)

예제 #9

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: hdubey/asrt

    def testRegexTypes(self):
        """Check one substitution rule against various
           contexts surrounding the matched word "ADG".
        """
        #Plain u"" literals: same values as the former ur"" ones
        #(no backslash involved) and also valid Python 3 syntax
        TYPEREGEXLIST = [(u"ADG", u"a. d. g.", u"6", u"0", u"")]

        #(input, expected output) pairs
        TESTLIST = [(u"ADG", u"a. d. g."), (u"ADG/LA", u"ADG/LA"),
                    (u"a ADG b", u"a a. d. g. b"), (u"l ADG ", u"l a. d. g. "),
                    (u"l'ADG'", u"l'a. d. g.'"), (u"\"ADG\"", u"\"a. d. g.\""),
                    (u"\"ADG", u"\"a. d. g."), (u"e-ADG-", u"e-a. d. g.-"),
                    (u"l'ADG,", u"l'a. d. g.,"), (u"l'ADG.", u"l'a. d. g.."),
                    (u"l'ADG?", u"l'a. d. g.?"), (u"l'ADG!", u"l'a. d. g.!"),
                    (u"l'ADG;", u"l'a. d. g.;"), (u"l'ADG:", u"l'a. d. g.:")]

        f = RegularExpressionFormula(None,
                                     RegexList.removeComments(TYPEREGEXLIST))

        for t, gt in TESTLIST:
            r = f.apply(t, 0)
            #assertEqual is used as assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

예제 #10

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: wolverineq/asrt

    def testContractionPrefixes(self):
        """Check the contraction prefixes rules, first on
           the rules themselves, then on a list of samples.
        """
        f = RegularExpressionFormula(None,
                RegexList.removeComments(CONTRACTIONPREFIXELIST))

        for p, s, _t, _i, _c in CONTRACTIONPREFIXELIST:
            #NOTE(review): only patterns starting with "gr1" are checked,
            #which is exactly what the former 'not p.find("gr1")' selected
            #(find returns 0 for a match at index 0, -1 when absent).
            #If the intent was to skip patterns containing "gr1", this
            #should read '"gr1" not in p' -- confirm against the rules.
            if p.startswith("gr1"):
                resultString = f.apply(p, 1, False)
                self.assertEqual(s.encode('utf-8'),
                                 resultString.encode('utf-8'))

        #(input, expected output) pairs. Plain u"" literals: same values
        #as the former ur"" ones (no backslash involved)
        testList = [(u"d une",u"d' une"),(u"j' ai",u"j' ai"), (u"l' y ",u"l' y "),
                    (u"m' a",u"m' a"), (u"n' est",u"n' est"),(u"n' a",u"n' a"),
                    (u"d' y",u"d' y"),(u"c' en",u"c' en"), (u"qu' y",u"qu' y"),
                    (u"qu' en",u"qu' en"), (u"-t-on",u" -t-on")]

        for p, gt in testList:
            resultString = f.apply(p, 1, False)
            #assertEqual is used as assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'),
                             resultString.encode('utf-8'))

예제 #11

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: hdubey/asrt

    def testAcronyms(self):
        """Check the acronym rules.

           Everything matching ACRONYMDELIMITER is removed
           from the formula output before it is compared
           with the expected string.
        """
        f = RegularExpressionFormula(
            None, RegexList.removeComments(ACRONYMREGEXLIST))

        #(input, expected output) pairs
        testList = [
            (u"ADG SPO PS", u"a. d. g.  s. p. o.  p. s."),
            (u"ADG SPO PS PDCC", u"a. d. g.  s. p. o.  p. s.  p. d. c. c."),
            (u"A ADG SPO PS PDCCC",
             u"A a. d. g.  s. p. o.  p. s.  p. d. c. c. c."),
            (u"ABCDs ABCs ABs", u"a. b. c. d. s.  a. b. c. s.  a. b. s.")
        ]

        for t, gt in testList:
            resultString = f.apply(t, 0, False)
            #Strip whatever matches the acronym delimiter before comparing
            resultString = re.sub(ACRONYMDELIMITER,
                                  u"",
                                  resultString,
                                  flags=re.UNICODE)
            #assertEqual is used as assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), resultString.encode('utf-8'))

예제 #12

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: hdubey/asrt

    def testContractionPrefixes(self):
        """Check the contraction prefixes rules, first on
           the rules themselves, then on a list of samples.
        """
        f = RegularExpressionFormula(
            None, RegexList.removeComments(CONTRACTIONPREFIXELIST))

        for p, s, _t, _i, _c in CONTRACTIONPREFIXELIST:
            #NOTE(review): only patterns starting with "gr1" are checked,
            #which is exactly what the former 'not p.find("gr1")' selected
            #(find returns 0 for a match at index 0, -1 when absent).
            #If the intent was to skip patterns containing "gr1", this
            #should read '"gr1" not in p' -- confirm against the rules.
            if p.startswith("gr1"):
                resultString = f.apply(p, 1, False)
                self.assertEqual(s.encode('utf-8'),
                                 resultString.encode('utf-8'))

        #(input, expected output) pairs. Plain u"" literals: same values
        #as the former ur"" ones (no backslash involved)
        testList = [(u"d une", u"d' une"), (u"j' ai", u"j' ai"),
                    (u"l' y ", u"l' y "), (u"m' a", u"m' a"),
                    (u"n' est", u"n' est"), (u"n' a", u"n' a"),
                    (u"d' y", u"d' y"), (u"c' en", u"c' en"),
                    (u"qu' y", u"qu' y"), (u"qu' en", u"qu' en"),
                    (u"-t-on", u" -t-on")]

        for p, gt in testList:
            resultString = f.apply(p, 1, False)
            #assertEqual is used as assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), resultString.encode('utf-8'))

예제 #13

0

파일 보기

파일: FormulaRegexUnitTest.py 프로젝트: idiap/asrt

    def testRegexTypes(self):
        """Check one substitution rule against various
           contexts surrounding the matched word "ADG".
        """
        #Plain u"" literals: same values as the former ur"" ones
        #(no backslash involved) and also valid Python 3 syntax
        TYPEREGEXLIST = [(u"ADG", u"a. d. g.",u"6",u"0",u"")]

        #(input, expected output) pairs
        TESTLIST = [(u"ADG",u"a. d. g."),
                    (u"ADG/LA",u"ADG/LA"),
                    (u"a ADG b",u"a a. d. g. b"),
                    (u"l ADG ",u"l a. d. g. "),
                    (u"l'ADG'",u"l'a. d. g.'"),
                    (u"\"ADG\"",u"\"a. d. g.\""),
                    (u"\"ADG",u"\"a. d. g."),
                    (u"e-ADG-",u"e-ADG-"),
                    (u"l'ADG,",u"l'a. d. g.,"),
                    (u"l'ADG.",u"l'a. d. g.."),
                    (u"l'ADG?",u"l'a. d. g.?"),
                    (u"l'ADG!",u"l'a. d. g.!"),
                    (u"l'ADG;",u"l'a. d. g.;"),
                    (u"l'ADG:",u"l'a. d. g.:")]

        f = RegularExpressionFormula(None,
                RegexList.removeComments(TYPEREGEXLIST))

        for t, gt in TESTLIST:
            r = f.apply(t, 0)
            #assertEqual is used as assertEquals is a deprecated alias
            self.assertEqual(gt.encode('utf-8'), r.encode('utf-8'))

예제 #14

0

파일 보기

class LMPreparationFormula():
    """Main formula for language modeling text
       preparation.

       Typical use: 'setText', 'setLanguageId', then
       'prepareText' which returns the normalized text.
    """
    logger = logging.getLogger("Asrt.LMPreparationFormula")

    #Class level caches, filled on first use by '_getOrdDict'
    #and '_getAbbreviationsDict' and shared by all instances
    ordDict = {}
    #Language id the cached 'ordDict' was built for
    _ordDictLanguageId = None
    abbreviationsDict = {}

    #Regular expressions formulas
    dateFormula = RegularExpressionFormula(
        None, RegexList.removeComments(DATEREGEXLIST))
    apostropheFormula = RegularExpressionFormula(
        None, RegexList.removeComments(APOSTHROPHELIST))
    contractionPrefixFormula = RegularExpressionFormula(
        None, RegexList.removeComments(CONTRACTIONPREFIXELIST))
    acronymFormula = RegularExpressionFormula(
        None, RegexList.removeComments(ACRONYMREGEXLIST))

    PUNCTUATIONREGEX = re.compile(PUNCTUATIONPATTERN, flags=re.UNICODE)
    ALLPUNCTUATIONSYMBOLS = "".join(PUNCTUATIONEXCLUDE + DOTCOMMAEXCLUDE)

    def __init__(self):
        """Default constructor.

           Starts with an empty text, language id 0 and
           'keepNewWords' disabled.
        """
        self.strText = ""
        self.languageId = 0
        self.keepNewWords = False
        #Number formula to use for each supported language id
        self.numberFormula = {
            FRENCH: FrenchNumberFormula,
            GERMAN: GermanNumberFormula
        }

    #####################
    #Getters and setters
    #
    def getText(self):
        """Return the underlying text.
        """
        return self.strText

    def getLanguageId(self):
        """Return a number between 0 and 4:

           0:'unknown', 1:'French', 2:'German',
           3:'English', 4:'Italian'
        """
        return self.languageId

    def setText(self, strText):
        """Set the underlying text with 'strText'.

           param strText: the text to prepare, presumably a
                          unicode string since its characters
                          are encoded to utf-8 one by one
                          later on -- confirm with callers
        """
        self.strText = strText

    def setLanguageId(self, languageId):
        """Set the language id and rebuild the cached
           characters mapping for that language.

           param 'languageId': a value between 0 and 4:

           0:'unknown', 1:'French', 2:'German',
           3:'English', 4:'Italian'
        """
        self.languageId = languageId
        #Drop the cached mapping, '_getOrdDict' stores the new one
        LMPreparationFormula.ordDict = {}
        LMPreparationFormula._getOrdDict(self.languageId)

    def setKeepNewWords(self, keepNewWords):
        """Keep new words.

           When set, numbers in words and acronyms are
           not expanded by 'prepareText'.
        """
        self.keepNewWords = keepNewWords

    ##################
    #Public interface
    #
    def prepareText(self):
        """Prepare 'strText' for language modeling.

           Steps, in order:
                Noise words filtering
                Character based normalization
                Dates normalization
                Language based abbreviations expansion
                Numbers in words expansion (skipped with keepNewWords)
                Punctuation removal, dot and comma excepted
                Word based normalization
                Contraction prefixes separation
                Removal of the remaining punctuation
                Acronyms normalization (skipped with keepNewWords)
                Lowercase normalization

            return the normalized text
        """
        #Some preprocessing
        self._filterNoiseWords()
        self._normalizeUtf8()

        #Before punctuation removal, some rules
        #are applied
        self._normalizeDates()
        self._expandAbbreviations()

        if not self.keepNewWords:
            self._expandNumberInWords()

        #Removal of some of punctuation symbols
        self._normalizePunctuation(PUNCTUATIONEXCLUDE)

        #Dot and comma punctuation symbols are still needed
        self._normalizeWords()

        self._normalizeContractionPrefixes()

        #Make sure no punctuation is remaining
        self._normalizePunctuation(self.ALLPUNCTUATIONSYMBOLS)

        if not self.keepNewWords:
            self._expandAcronyms()

        self._normalizeCase()

        return self.strText

    ##################
    #Implementation
    #
    def _filterNoiseWords(self):
        """Do not keep some words considered as noise,
           i.e. words matching PUNCTUATIONREGEX.

           return the filtered text, words being joined
                  with a single space
        """
        wordsList = re.split(SPACEPATTERN, self.strText, flags=re.UNICODE)
        keptWords = [w for w in wordsList
                     if not LMPreparationFormula._isNoise(w)]

        self.strText = u" ".join(keptWords)
        return self.strText

    def _normalizeUtf8(self):
        """Some punctuation characters are normalized.

           Each character with an entry in the code point
           dictionary of the current language is replaced,
           the text is stripped, trailing punctuation that
           follows a digit is removed and spaces are
           normalized.
        """
        #Mapping dictionary
        ordDict = LMPreparationFormula._getOrdDict(self.getLanguageId())

        #Characters without mapping are kept as is
        mappedChars = [ordDict.get(ord(c), c) for c in self.strText]
        self.strText = u"".join(mappedChars).strip()

        #Text ending with a digit followed by punctuation:
        #remove that trailing punctuation
        if len(self.strText) > 1 and \
               self.strText[-1] in self.ALLPUNCTUATIONSYMBOLS and \
               self.strText[-2].isdigit():
            self.strText = self.strText.rstrip(self.ALLPUNCTUATIONSYMBOLS)

        self.strText = re.sub(SPACEPATTERN,
                              u" ",
                              self.strText,
                              flags=re.UNICODE)

    def _normalizeDates(self):
        """Normalize dates.
        """
        self.strText = self.dateFormula.apply(self.strText, self.languageId)

    def _expandAbbreviations(self):
        """Expand language abbreviations.

           Nothing is done when there is no abbreviation
           for the current language.
        """
        aDict = self._getAbbreviationsDict()
        if self.languageId not in aDict:
            return

        languageAbbreviations = aDict[self.languageId]
        wordsList = re.split(SPACEPATTERN, self.strText, flags=re.UNICODE)

        #Keys are byte strings, words without entry are kept
        newWordsList = [languageAbbreviations.get(w.encode('utf-8'), w)
                        for w in wordsList]

        self.strText = u" ".join(newWordsList)

    def _expandNumberInWords(self):
        """If there are numbers in words, split them.

           i.e. A1   --> A. 1
                P3B  --> P. 3 B.
                P5B4 --> P. 5 B. 4
                PPB5 --> PPB 5 (acronyms are expanded later on)
        """
        wordsList = re.split(SPACEPATTERN, self.strText, flags=re.UNICODE)

        newWordsList = []
        for w in wordsList:
            #Numbers need to contain a digit
            #Ordinal numbers are not expanded
            if not re.search(u"[0-9]", w) or w.endswith(EXPANDEXCEPTIONS):
                newWordsList.append(w)
                continue

            tokenList = re.split(CAPTURINGDIGITPATTERN, w, flags=re.UNICODE)
            #Nothing to split
            if len(tokenList) <= 1:
                newWordsList.append(w)
                continue

            #Single letter acronyms get a dot,
            #isupper returns false for digits
            tokenList = [t + u"." if len(t) == 1 and t.isupper() else t
                         for t in tokenList]
            newWord = u" ".join(tokenList).strip()
            #Group P . 5 into P. 5
            #The backslash is escaped, '\g' is not a string escape sequence
            newWord = re.sub(GROUPINGDOTCOMMAPATTERN, u"\\g<2> ", newWord)
            newWordsList.append(newWord)

        self.strText = u" ".join(newWordsList)

    def _expandAcronyms(self):
        """Acronyms are split.

           i.e. PDC --> p. d. c.

           Everything matching ACRONYMDELIMITER is removed
           once the acronym formula has been applied.
        """
        self.strText = self.acronymFormula.apply(self.strText, self.languageId)
        self.strText = re.sub(ACRONYMDELIMITER,
                              u"",
                              self.strText,
                              flags=re.UNICODE)

    def _normalizePunctuation(self, excludeList):
        """Some punctuation characters are
           normalized:
           - Removal
                    - Single, double quotes
                    - Exclamation, Interrogation marks
                    - Braces, round, square, curly
                    - Slashes, back, forward
                    - Sharp symbol
                    - Star, plus, minus
                    - Comma, colon, semicolon, dot (keep it for abbreviations)
                    - Lower, greater equal sign
                    - Alone diacritics marks (circumflex accent)
                    - Hyphen, underscore
                    - Back quote
                    - Pipe
                    - Tilde
                - Modification
                    - Percent % --> percent
                    - Ampersand & --> and
                    - At sign @ --> at
                    - Dollars symbol $ --> dollars

            The symbols actually removed are the ones found in
            'excludeList', the replacements come from
            PUNCTUATIONMAP and need a language id other than 0.

            param 'excludeList' : a list of exclude punctuation symbols
        """
        #The two previous characters, utf-8 encoded
        unicodeList, prevC, beforePrevC = [], u"", u""
        for c in self.strText:
            strC = c.encode('utf-8')
            if strC in excludeList:
                #Keep a dot following a single non digit character
                #which starts the text or comes after a space
                if beforePrevC in (""," ") and not prevC.isdigit() \
                    and strC == ".":
                    unicodeList.append(c)
                    unicodeList.append(u" ")
                #Keep some special characters if they appear after a non-space value
                elif self.keepNewWords and prevC not in (
                        "", " ") and strC in PUNCTUATIONKEEPINWORD:
                    unicodeList.append(c)
                #Any other excluded symbol is dropped
            elif self.languageId != 0 and strC in PUNCTUATIONMAP:
                #Symbol replaced by its word in the current language
                unicodeList.append(u" " +
                                   PUNCTUATIONMAP[strC][self.languageId] +
                                   u" ")
            else:
                unicodeList.append(c)
            beforePrevC = prevC
            prevC = strC

        self.strText = u"".join(unicodeList).strip()
        #Remove leading, trailing and space surrounded hyphens
        self.strText = re.sub(u"(^- *| - |-$)",
                              u"",
                              self.strText,
                              flags=re.UNICODE)
        #Hyphen followed by a space becomes a space
        self.strText = re.sub(u"(- )", u" ", self.strText, flags=re.UNICODE)
        self.strText = re.sub(SPACEPATTERN,
                              u" ",
                              self.strText,
                              flags=re.UNICODE)

    def _normalizeWords(self):
        """Word base normalization.

           This is language dependent: the number formula
           registered for the current language id is applied
           to the text. Nothing is done for a language
           without number formula.
        """
        languageId = self.getLanguageId()
        if languageId not in self.numberFormula:
            return

        self.strText = self.numberFormula[languageId].apply(self.strText)

    def _normalizeContractionPrefixes(self):
        """Apply the apostrophe formula, then the
           contraction prefixes formula.
        """
        self.strText = self.apostropheFormula.apply(self.strText,
                                                    self.languageId)
        self.strText = self.contractionPrefixFormula.apply(
            self.strText, self.languageId, False)

    def _normalizeCase(self):
        """Case normalization (change to lower case)
        """
        self.strText = self.strText.lower()

    @staticmethod
    def _getOrdDict(langId):
        """Utf-8 characters mapping in the form of a
           code point dictionary.

           The dictionary is cached at class level together
           with the language id it was built for, and is
           rebuilt when asked for another language.

           param langId: the language id, entries of UTF8MAP
                         with that id or with id 0 are kept

           return a dictionary code point --> substitution
           raise Exception when a code point is found twice
        """
        cls = LMPreparationFormula
        #A cache built for another language must not be reused
        if len(cls.ordDict) > 0 and cls._ordDictLanguageId == langId:
            return cls.ordDict

        #Substitution dictionary, assume one character only
        ordDict = {}
        for match, sub, comment, languageId in UTF8MAP:
            if ord(match) in ordDict:
                raise Exception("Already in dictionary '%s' '%s'!" %
                                (unichr(ord(match)), comment.encode('utf8')))
            if (langId == int(languageId) or int(languageId) == 0):
                ordDict[ord(match)] = sub

        cls.ordDict = ordDict
        cls._ordDictLanguageId = langId
        return cls.ordDict

    @staticmethod
    def _getAbbreviationsDict():
        """Get the abbreviations dictionary with keys
           encoded in byte string for comparison.

           The dictionary is built once from ABBREVIATIONS
           and cached at class level.

           return a dictionary language --> abbreviations
        """
        if len(LMPreparationFormula.abbreviationsDict) > 0:
            return LMPreparationFormula.abbreviationsDict

        aDict = {}
        for lang, abbreviations in ABBREVIATIONS.items():
            aDict[lang] = {k.encode('utf-8'): v
                           for k, v in abbreviations.items()}

        LMPreparationFormula.abbreviationsDict = aDict
        return LMPreparationFormula.abbreviationsDict

    @staticmethod
    def _isNoise(strWord):
        """Check if 'strWord' is a noise word, i.e.
           PUNCTUATIONREGEX is found in it.

           return True or False
        """
        return LMPreparationFormula.PUNCTUATIONREGEX.search(strWord) is not None

    @staticmethod
    def _applyRegexes(strText, regexList):
        """Apply in sequence the substitutions of 'regexList'.

           param strText: the text to process
           param regexList: a list of 3 values tuples, pattern
                            and replacement come first, the
                            third value is not used

           return the substituted text
        """
        for p, r, _t in regexList:
            strText = re.sub(p, r, strText, flags=re.UNICODE)
        return strText