Ejemplo n.º 1
0
 def trainClassifier(self):
     """Train the underlying classifier.

        Lazy initialisation: the WordClassifier is built and
        trained only on the first call; later calls are no-ops.
     """
     #'is None' identity test instead of '== None' (PEP 8)
     if self.wordClassifier is None:
         self.logger.info("Prepare the word classifier ...")
         self.wordClassifier = WordClassifier()
         self.wordClassifier.train()
Ejemplo n.º 2
0
    def classifySentences(self):
        """Classify sentences by language (FRENCH or
           GERMAN, ITALIAN or ENGLISH).

           A WordClassifier is created and trained on first
           use, then applied to every text cluster.
        """
        #'is None' identity test instead of '== None' (PEP 8)
        if self.classifier is None:
            self.classifier = WordClassifier()
            self.classifier.train()

        for textCluster in self.listContent:
            textCluster.classify(self.classifier)
Ejemplo n.º 3
0
 def trainClassifier(self):
     """Train the underlying classifier.

        Lazy initialisation: the WordClassifier is built and
        trained only on the first call; later calls are no-ops.
     """
     #'is None' identity test instead of '== None' (PEP 8)
     if self.wordClassifier is None:
         self.logger.info("Prepare the word classifier ...")
         self.wordClassifier = WordClassifier()
         self.wordClassifier.train()
Ejemplo n.º 4
0
    def classifySentences(self):
        """Classify sentences by language (FRENCH or
           GERMAN, ITALIAN or ENGLISH).

           A WordClassifier is created and trained on first
           use, then applied to every text cluster.
        """
        #'is None' identity test instead of '== None' (PEP 8)
        if self.classifier is None:
            self.classifier = WordClassifier()
            self.classifier.train()

        for textCluster in self.listContent:
            textCluster.classify(self.classifier)
Ejemplo n.º 5
0
class DataPreparationAPI():
    """Import sentences from one file, classifying
       sentences into languages.

       Acts as a facade over 'TextDocument': loads the user
       regexes once and drives segmentation, cleaning,
       normalization, filtering and LM preparation.
    """
    logger  = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.

           param inputFile: path of the document to import
                            (may be None when a text string is
                            set via setFormattedText)
           param outputDir: result directory, also used as the
                            default temporary directory
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        """Set the path of the document to import."""
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        """Set the result directory."""
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        """Set the directory for temporary files."""
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        """Set a text string to use instead of an input file."""
        self.formattedText = formattedText

    def getCleanedText(self):
        """Return the cleaned text, or '' when no document yet."""
        #'is not None' identity test instead of '!= None' (PEP 8)
        if self.doc is not None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        """Return the cleaned text grouped per language, or ''
           when no document yet."""
        if self.doc is not None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        """Enable or disable debug mode."""
        self.debug = debug

    def setRegexFile(self, regexFile):
        """Set the csv file holding user regexes."""
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set the user regexes to be used.

           param regexList: a list of the following form:

           ['matching pattern', 'substitution', 'type', 'language id']
        """
        substitutionList = []

        #Validation rows become (pattern, language) filters;
        #everything else is a substitution rule
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0],row[3]))
            else:
                substitutionList.append((row[0],row[1],row[2],row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(substitutionList)

    def setLMModeling(self, modelNgram):
        """Enable or disable language-model preparation."""
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        """Enable or disable sentence filtering."""
        self.filterSentences = filterSentences

    def setRemovePunctuation(self, removePunctuation):
        """Enable or disable punctuation removal."""
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        """Enable or disable punctuation verbalization."""
        self.verbalizePunctuation = verbalizePunctuation

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.

           Lazy initialisation: done only on the first call.
        """
        if self.wordClassifier is None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.

           No-op when no regex file was given or when regexes
           are already loaded.
        """
        #User did not specified rules
        if self.regexFile is None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def prepareDocument(self, language = 0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4

           return the underlying 'TextDocument'
           raise Exception on unknown language or import failure
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList,
                                    self.outputDir)

            if self.inputFile is not None:
                self.logger.info("Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText is not None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #Language id 0 means unknown: classify per sentence,
            #otherwise tag every sentence with the given language
            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info("Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception as e:
            errorMessage = "An error as occurred when importing sentences: %s\n%s" % (str(e), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)

            #Re-wrap to keep the public contract: callers catch Exception
            raise Exception(e)

        return self.doc
Ejemplo n.º 6
0
class TextDocument(Document):
    """A text document.

       Loads a pdf or text source, segments it into sentence
       clusters and offers cleaning, normalization, filtering
       and language classification operations on them.
    """
    logger              = logging.getLogger("Asrt.TextDocument")

    CONVERT_COMMAND     = ['pdftotext', '-raw', '-layout', '-enc', 'UTF-8', '-eol', 'unix', '-nopgbrk']

    MERGECLUSTERSEP     = u"\n"
    DIGITANDDOTREGEX    = u"( |^)([0-9]{1,2})[.]( |$)"
    DIGITANDDOTSUB      = u"\g<1>\g<2>.\g<3>"
    #Do not put a ; for character entity, otherwise
    #sentence segmentation is ocurring
    DIGITANDENTITYREGEX = u"( |^)([0-9]{1,2})&#46( |$)"
    DIGITANDENTITYSUB   = u"\g<1>\g<2>&#46\g<3>"

    ########################
    # Default constructor
    #
    def __init__(self, source, languageId,
                 regexSubstitutionFormula, regex_filter_list,
                 logDir, segmentWithNLTK, keepNewWords):
        Document.__init__(self, source)

        self.languageId = languageId
        self.regexSubstitutionFormula = regexSubstitutionFormula
        self.regex_filter_list = regex_filter_list
        self.logDir = logDir
        self.classifier = None
        self.segmentWithNLTK = segmentWithNLTK
        self.keepNewWords = keepNewWords

    ########################
    #Getter and setters
    #
    def setClassifier(self, classifier):
        """Set the language classifier.
           It assumes it has been trained.
        """
        self.classifier = classifier

    def setSentencesLanguage(self, languageId):
        """Language is known.

           param 'languageId': a value beetween 0-4
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        for textCluster in self.listContent:
            textCluster.setLanguage(languageId)

    ########################
    #Interface
    #
    def loadDocumentAsSentences(self, tempDir):
        """Convert to text, remove new lines and
           segment into sentences using NLTK
           toolkit.

           param tempDir: directory for the intermediate
                          text file
        """
        #Pdf to text
        tempFileName = self.convertToText(self.sourceFileName, tempDir, self.logDir)

        #Segment into sentences using NLTK toolkit
        self._loadTextDocumentAsSentences(tempFileName)

        #Delete temporary file
        MyFile(tempFileName).removeFile(tempFileName)

    def loadAsSentences(self, strText):
        """Load the given text string as sentences.

           param strText: an utf-8 encoded string
        """
        self._loadAsSentences(strText)

    def getCleanedText(self):
        """Get the cleaned text.

           return one utf-8 string, clusters joined with
                  MERGECLUSTERSEP
        """
        textList = []
        for textCluster in self.listContent:
            textList.append(textCluster.getTextSentence())

        return self.MERGECLUSTERSEP.join(textList)

    def getCleanedTextPerLanguage(self):
        """Get the classified text per language.

           return a dictionary of utf-8 text.
        """
        textDict = {}
        for textCluster in self.listContent:
            languageId = textCluster.getLanguageId()
            if languageId not in textDict:
                textDict[languageId] = []
            textDict[languageId].append(textCluster.getTextSentence())

        #One string per language
        resultDict = {}
        for k, textList in textDict.items():
            resultDict[k] = self.MERGECLUSTERSEP.join(textList)
        return resultDict

    def cleanTextSentences(self):
        """Use a set of regex rules to prepare
           the sentences.
        """
        self._applyAllClusters('clean')

    def normalizeTextSentences(self):
        """Use a set of regex rules to prepare
           the sentences.

           First group clusters per languages and then
           apply language based normalization.
        """
        #Get cluster per language
        lang2clusterDict = self._getLanguage2ClustersDict()

        bEmpty = True

        #Normalize text per language
        for languageId, clusterList in lang2clusterDict.items():
            #Read all cluster texts
            textList = []
            for textCluster in clusterList:
                textList.append(textCluster.getTextSentence())

            #Join all text
            allText = self.MERGECLUSTERSEP.join(textList)

            #Normalize text
            allText = self.regexSubstitutionFormula.apply(allText, languageId)
            sentencesList = allText.split(self.MERGECLUSTERSEP)

            #Add and set language id; bEmpty is True only for the
            #first language so existing content is replaced once
            self._addSentences(sentencesList, languageId, bEmpty)

            if bEmpty:
                bEmpty = False

    def prepareLM(self):
        """Prepare text sentences for N-Gram modeling.
        """
        self._applyAllClusters("prepareLM")

    def removeTextPunctuation(self):
        """Remove punctuation symbols.
        """
        self._applyAllClusters("removeTextPunctuation")

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.
        """
        self._applyAllClusters("verbalizeTextPunctuation")

    def filterTextSentences(self):
        """Filter sentences after cleaning.

           Uses:
            - sentence length
            - number of digit groups
            - user defined rules
        """
        filteredContentList = []
        for textCluster in self.listContent:
            if textCluster.isValid():
                filteredContentList.append(textCluster)

        self.listContent = filteredContentList
        filteredContentList = [ ]

    def filterTextSentences2ndStage(self):
        """Filter sentences before LM preparation.

           Remove web address and check German orthography https://en.wikipedia.org/wiki/German_orthography .
        """
        filteredContentList = []
        for textCluster in self.listContent:
            if textCluster.isValid2ndStage():
                filteredContentList.append(textCluster)

        self.listContent = filteredContentList
        filteredContentList = [ ]

    def classifySentences(self):
        """Classify sentences by language (FRENCH or
           GERMAN, ITALIAN or ENGLISH).
        """
        #'is None' identity test instead of '== None' (PEP 8)
        if self.classifier is None:
            self.classifier = WordClassifier()
            self.classifier.train()

        for textCluster in self.listContent:
            textCluster.classify(self.classifier)

    def display(self):
        """Display document content.
        """
        for textCluster in self.listContent:
            #Parenthesized form behaves identically in Python 2 and 3
            print(textCluster)

    ########################
    #Implementation
    #
    def _loadTextDocumentAsSentences(self, filePath):
        """Load a text document and segment
           it into sentences using NLTK.

           Initial new lines are first removed.
        """
        io = Ioread()

        #One string for the whole
        #text file as utf-8 string
        data = io.nltkRead(filePath)
        self._loadAsSentences(data)

    def _loadAsSentences(self, strText):
        """Load the given text as sentences.

           Algorithm is:
             - New lines removal
             - Problematic periods replacement
             - Sentences segmentation with nltk
             - Problematic periods restauration

           param strText: an utf-8 encoded string
        """
        tokenizer_path = FRENCH_PICKLE_FOLDER
        if self.languageId == 2:
            tokenizer_path = GERMAN_PICKLE_FOLDER

        sentences = []
        if self.segmentWithNLTK:
            TextDocument.logger.info("Segment with NLTK")
            #Trim new lines
            strText = self._replaceNewLines(strText)

            #Problematic periods replacement
            strText = self._replaceProblematicPeriods(strText)

            #Nltk segmentation
            sentences = self._segmentIntoSentences(strText, tokenizer_path)

            #Problematic periods restauration
            for i, s in enumerate(sentences):
                sentences[i] = self._replaceProblematicPeriods(s, forward=False)
        else:
            TextDocument.logger.info("Segment with new lines")
            sentences = strText.split(u"\n")

        #Make text clusters with unknown language id
        self._addSentences(sentences)

        TextDocument.logger.info("Loaded %d raw sentences!" % len(sentences))

    def _applyAllClusters(self, method):
        """Apply 'method' to all clusters.
        """
        for textCluster in self.listContent:
            getattr(textCluster, method)()

    def _replaceNewLines(self, data):
        """Replace new lines by spaces.

           New lines are not considered at the end
           of a sentence.

           param data: an utf-8 encoded string
        """
        #Last sentence word splited into two.
        #u"-\n" (literal newline) matches the same text the former
        #ur"-\n" raw pattern did, and is valid on Python 2 and 3
        data = re.sub(u"-\n", u"", data, flags=re.UNICODE)

        return re.sub(u"\n", u" ", data, flags=re.UNICODE)

    def _replaceProblematicPeriods(self, data, forward=True):
        """Convert dots preceded from a number and followed
           by a space into an html entity.

           If forward is set to False, it will convert from
           html entity to dots.

           This escaping is done in order to prevent
           segmenting sentences on numbers.
        """
        if not forward:
            return re.sub(self.DIGITANDENTITYREGEX, self.DIGITANDDOTSUB, data, 
                           flags=re.UNICODE)

        return re.sub(self.DIGITANDDOTREGEX, self.DIGITANDENTITYSUB, data, 
                        flags=re.UNICODE)

    def _segmentIntoSentences(self, data, tokenizer_path):
        """Replace current content by sentences.

           The sentences segmentation is done using
           the french pickle of the NLTK toolkit.

           param data: an utf-8 encoded string
           raise Exception on tokenizer failure
        """
        try:

            #Get the french tokenizer
            tokenizer = nltk.data.load(tokenizer_path)

            #The actual job
            sentences = tokenizer.tokenize(data)

        except Exception as e:
            TextDocument.logger.critical("Tokenizer error: " + str(e))
            #'tokenizer_path' is a local parameter, not an attribute:
            #the original 'self.tokenizer_path' raised AttributeError here
            raise Exception("Tokenizer error: " + tokenizer_path)

        return sentences
Ejemplo n.º 7
0
class TextDocument(Document):
    """A text document.

       Loads a pdf or text source, segments it into sentence
       clusters and offers cleaning, normalization, filtering
       and language classification operations on them.
    """
    logger = logging.getLogger("Asrt.TextDocument")

    CONVERT_COMMAND = [
        'pdftotext', '-raw', '-layout', '-enc', 'UTF-8', '-eol', 'unix',
        '-nopgbrk'
    ]

    MERGECLUSTERSEP = u"\n"
    DIGITANDDOTREGEX = u"( |^)([0-9]{1,2})[.]( |$)"
    DIGITANDDOTSUB = u"\g<1>\g<2>.\g<3>"
    #Do not put a ; for character entity, otherwise
    #sentence segmentation is ocurring
    DIGITANDENTITYREGEX = u"( |^)([0-9]{1,2})&#46( |$)"
    DIGITANDENTITYSUB = u"\g<1>\g<2>&#46\g<3>"

    ########################
    # Default constructor
    #
    def __init__(self, source, languageId, regexSubstitutionFormula,
                 regex_filter_list, logDir, segmentWithNLTK, keepNewWords):
        Document.__init__(self, source)

        self.languageId = languageId
        self.regexSubstitutionFormula = regexSubstitutionFormula
        self.regex_filter_list = regex_filter_list
        self.logDir = logDir
        self.classifier = None
        self.segmentWithNLTK = segmentWithNLTK
        self.keepNewWords = keepNewWords

    ########################
    #Getter and setters
    #
    def setClassifier(self, classifier):
        """Set the language classifier.
           It assumes it has been trained.
        """
        self.classifier = classifier

    def setSentencesLanguage(self, languageId):
        """Language is known.

           param 'languageId': a value beetween 0-4
             unknown : 0
             french  : 1
             german  : 2
             english : 3
             italian : 4
        """
        for textCluster in self.listContent:
            textCluster.setLanguage(languageId)

    ########################
    #Interface
    #
    def loadDocumentAsSentences(self, tempDir):
        """Convert to text, remove new lines and
           segment into sentences using NLTK
           toolkit.

           param tempDir: directory for the intermediate
                          text file
        """
        #Pdf to text
        tempFileName = self.convertToText(self.sourceFileName, tempDir,
                                          self.logDir)

        #Segment into sentences using NLTK toolkit
        self._loadTextDocumentAsSentences(tempFileName)

        #Delete temporary file
        MyFile(tempFileName).removeFile(tempFileName)

    def loadAsSentences(self, strText):
        """Load the given text string as sentences.

           param strText: an utf-8 encoded string
        """
        self._loadAsSentences(strText)

    def getCleanedText(self):
        """Get the cleaned text.

           return one utf-8 string, clusters joined with
                  MERGECLUSTERSEP
        """
        textList = []
        for textCluster in self.listContent:
            textList.append(textCluster.getTextSentence())

        return self.MERGECLUSTERSEP.join(textList)

    def getCleanedTextPerLanguage(self):
        """Get the classified text per language.

           return a dictionary of utf-8 text.
        """
        textDict = {}
        for textCluster in self.listContent:
            languageId = textCluster.getLanguageId()
            if languageId not in textDict:
                textDict[languageId] = []
            textDict[languageId].append(textCluster.getTextSentence())

        #One string per language
        resultDict = {}
        for k, textList in textDict.items():
            resultDict[k] = self.MERGECLUSTERSEP.join(textList)
        return resultDict

    def cleanTextSentences(self):
        """Use a set of regex rules to prepare
           the sentences.
        """
        self._applyAllClusters('clean')

    def normalizeTextSentences(self):
        """Use a set of regex rules to prepare
           the sentences.

           First group clusters per languages and then
           apply language based normalization.
        """
        #Get cluster per language
        lang2clusterDict = self._getLanguage2ClustersDict()

        bEmpty = True

        #Normalize text per language
        for languageId, clusterList in lang2clusterDict.items():
            #Read all cluster texts
            textList = []
            for textCluster in clusterList:
                textList.append(textCluster.getTextSentence())

            #Join all text
            allText = self.MERGECLUSTERSEP.join(textList)

            #Normalize text
            allText = self.regexSubstitutionFormula.apply(allText, languageId)
            sentencesList = allText.split(self.MERGECLUSTERSEP)

            #Add and set language id; bEmpty is True only for the
            #first language so existing content is replaced once
            self._addSentences(sentencesList, languageId, bEmpty)

            if bEmpty:
                bEmpty = False

    def prepareLM(self):
        """Prepare text sentences for N-Gram modeling.
        """
        self._applyAllClusters("prepareLM")

    def removeTextPunctuation(self):
        """Remove punctuation symbols.
        """
        self._applyAllClusters("removeTextPunctuation")

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.
           Currently only implemented for French.
        """
        self._applyAllClusters("verbalizeTextPunctuation")

    def filterTextSentences(self):
        """Filter sentences after cleaning.

           Uses:
            - sentence length
            - number of digit groups
            - user defined rules
        """
        filteredContentList = []
        for textCluster in self.listContent:
            if textCluster.isValid():
                filteredContentList.append(textCluster)

        self.listContent = filteredContentList

    def classifySentences(self):
        """Classify sentences by language (FRENCH or
           GERMAN, ITALIAN or ENGLISH).
        """
        #'is None' identity test instead of '== None' (PEP 8)
        if self.classifier is None:
            self.classifier = WordClassifier()
            self.classifier.train()

        for textCluster in self.listContent:
            textCluster.classify(self.classifier)

    def display(self):
        """Display document content.
        """
        for textCluster in self.listContent:
            #Parenthesized form behaves identically in Python 2 and 3
            print(textCluster)

    ########################
    #Implementation
    #
    def _loadTextDocumentAsSentences(self, filePath):
        """Load a text document and segment
           it into sentences using NLTK.

           Initial new lines are first removed.
        """
        io = Ioread()

        #One string for the whole
        #text file as utf-8 string
        data = io.nltkRead(filePath)
        self._loadAsSentences(data)

    def _loadAsSentences(self, strText):
        """Load the given text as sentences.

           Algorithm is:
             - New lines removal
             - Problematic periods replacement
             - Sentences segmentation with nltk
             - Problematic periods restauration

           param strText: an utf-8 encoded string
        """
        tokenizer_path = FRENCH_PICKLE_FOLDER
        if self.languageId == 2:
            tokenizer_path = GERMAN_PICKLE_FOLDER

        sentences = []
        if self.segmentWithNLTK:
            TextDocument.logger.info("Segment with NLTK")
            #Trim new lines
            strText = self._replaceNewLines(strText)

            #Problematic periods replacement
            strText = self._replaceProblematicPeriods(strText)

            #Nltk segmentation
            sentences = self._segmentIntoSentences(strText, tokenizer_path)

            #Problematic periods restauration
            for i, s in enumerate(sentences):
                sentences[i] = self._replaceProblematicPeriods(s,
                                                               forward=False)
        else:
            TextDocument.logger.info("Segment with new lines")
            sentences = strText.split(u"\n")

        #Make text clusters with unknown language id
        self._addSentences(sentences)

        TextDocument.logger.info("Loaded %d raw sentences!" % len(sentences))

    def _applyAllClusters(self, method):
        """Apply 'method' to all clusters.
        """
        for textCluster in self.listContent:
            getattr(textCluster, method)()

    def _replaceNewLines(self, data):
        """Replace new lines by spaces.

           New lines are not considered at the end
           of a sentence.

           param data: an utf-8 encoded string
        """
        #Last sentence word splited into two.
        #u"-\n" (literal newline) matches the same text the former
        #ur"-\n" raw pattern did, and is valid on Python 2 and 3
        data = re.sub(u"-\n", u"", data, flags=re.UNICODE)

        return re.sub(u"\n", u" ", data, flags=re.UNICODE)

    def _replaceProblematicPeriods(self, data, forward=True):
        """Convert dots preceded from a number and followed
           by a space into an html entity.

           If forward is set to False, it will convert from
           html entity to dots.

           This escaping is done in order to prevent
           segmenting sentences on numbers.
        """
        if not forward:
            return re.sub(self.DIGITANDENTITYREGEX,
                          self.DIGITANDDOTSUB,
                          data,
                          flags=re.UNICODE)

        return re.sub(self.DIGITANDDOTREGEX,
                      self.DIGITANDENTITYSUB,
                      data,
                      flags=re.UNICODE)

    def _segmentIntoSentences(self, data, tokenizer_path):
        """Replace current content by sentences.

           The sentences segmentation is done using
           the french pickle of the NLTK toolkit.

           param data: an utf-8 encoded string
           raise Exception on tokenizer failure
        """
        try:

            #Get the french tokenizer
            tokenizer = nltk.data.load(tokenizer_path)

            #The actual job
            sentences = tokenizer.tokenize(data)

        except Exception as e:
            TextDocument.logger.critical("Tokenizer error: " + str(e))
            #'tokenizer_path' is a local parameter, not an attribute:
            #the original 'self.tokenizer_path' raised AttributeError here
            raise Exception("Tokenizer error: " + tokenizer_path)

        return sentences
Ejemplo n.º 8
0
class DataPreparationAPI():
    """Import sentences from one file, classifying
       sentences into languages.
    """
    #Class-wide logger shared by all instances
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.

           param inputFile: path of the document to import (may be
                            None when a text string is supplied
                            via 'setFormattedText' instead)
           param outputDir: directory for the generated files
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        #Temporary files default to the output directory
        self.tempDir = outputDir
        #Raw text to import when no input file is given
        self.formattedText = None
        self.debug = False
        #Csv file containing the user validation/substitution regexes
        self.regexFile = None
        #Processing options, toggled via the setters below
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.segmentWithNLTK = True
        self.keepNewWords = False
        #The underlying 'TextDocument' once prepared
        self.doc = None
        #Shared word classifier, trained lazily by 'trainClassifier'
        self.wordClassifier = None
        #User defined rules, empty by default
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        """Set the path of the document to import."""
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        """Set the directory for the generated files."""
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        """Set the directory for temporary files."""
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        """Set a raw text string to import instead of a file."""
        self.formattedText = formattedText

    def getCleanedText(self):
        """Get the prepared text, or an empty string when no
           document has been prepared yet.
        """
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        """Get the prepared text grouped by language, or an empty
           string when no document has been prepared yet.
        """
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        """Enable or disable debug mode."""
        self.debug = debug

    def setRegexFile(self, regexFile):
        """Set the csv file containing the user regexes."""
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

           param regexList: a list of the following form:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        substitutionList = []

        #Dispatch each rule on its 'type' column
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getSubstitutionList(self):
        """Get the user defined substitution list.

           return a four columns list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        return self.substitutionRegexFormula.getSubstitutionPatterns()

    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

           param regexList: a four columns list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)

        substitutionList = []

        for row in regexList:
            substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getValidationList(self):
        """Get the user defined validation list.

           return a four columns list of lists:

           [u'matching pattern', u'', u'-1', u'0']
        """
        validationList = []
        for pattern, regexType in self.validationPatternList:
            #Bug fix: the original passed four positional arguments
            #to 'list.append' (a TypeError at runtime); append one
            #four columns list instead
            validationList.append([pattern, u"", regexType, u"0"])

        return validationList

    def setValidationList(self, regexList):
        """Set the user regexes validation list.

           Filter 'regexList' for validation rules only.

           param regexList: a four columns list of lists:

           ['matching pattern', 'substitution', 'type', 'language id']
        """
        self.validationPatternList = []

        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))

    def setLMModeling(self, modelNgram):
        """Enable or disable preparation for language modeling."""
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        """Enable or disable sentence filtering."""
        self.filterSentences = filterSentences

    def setRemovePunctuation(self, removePunctuation):
        """Enable or disable punctuation removal."""
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        """Enable or disable punctuation verbalization."""
        self.verbalizePunctuation = verbalizePunctuation

    def setSegmentWithNLTK(self, segmentWithNLTK):
        """Choose whether sentences are segmented with NLTK."""
        self.segmentWithNLTK = segmentWithNLTK

    def setKeepNewWords(self, keepNewWords):
        """Choose whether new words are kept."""
        self.keepNewWords = keepNewWords

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.

           The classifier is built and trained only once,
           then reused across documents.
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.
        """
        #User did not specify rules
        if self.regexFile == None:
            return

        #Regexes already loaded in the API
        if self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
           return the prepared 'TextDocument'
           raise Exception on an unknown language or any
                 preparation error
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList, self.outputDir,
                                    self.segmentWithNLTK, self.keepNewWords)

            #Either a file or a raw text string is imported
            if self.inputFile != None:
                self.logger.info(
                    "Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #Language id: classify when unknown, trust the caller otherwise
            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info(
                    "Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        #'as' syntax works on Python 2.6+ and Python 3
        #(the original 'except Exception, e' is Python 2 only)
        except Exception as e:
            #NOTE(review): 'e.message' is Python 2 specific -- confirm
            #before running under Python 3
            errorMessage = "An error has occurred when importing sentences: %s\n%s" % \
                             (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)

            raise Exception(e)

        return self.doc
# Ejemplo n.º 9
# 0
class DataPreparationAPI():
    """Import sentences from one file, classifying
       sentences into languages.
    """
    #Class-wide logger shared by all instances
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.

           param inputFile: path of the document to import (may be
                            None when a text string is supplied
                            via 'setFormattedText' instead)
           param outputDir: directory for the generated files
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        #Temporary files default to the output directory
        self.tempDir = outputDir
        #Raw text to import when no input file is given
        self.formattedText = None
        self.debug = False
        #Csv file containing the user validation/substitution regexes
        self.regexFile = None
        #Processing options, toggled via the setters below
        self.lmModeling = False
        self.filterSentences = False
        self.filterTextSentences2ndStage = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.segmentWithNLTK = True
        self.expandNumberInWords = True
        #The underlying 'TextDocument' once prepared
        self.doc = None
        #Shared word classifier, trained lazily by 'trainClassifier'
        self.wordClassifier = None
        #User defined rules, empty by default
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        """Set the path of the document to import."""
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        """Set the directory for the generated files."""
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        """Set the directory for temporary files."""
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        """Set a raw text string to import instead of a file."""
        self.formattedText = formattedText

    def getCleanedText(self):
        """Get the prepared text, or an empty string when no
           document has been prepared yet.
        """
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        """Get the prepared text grouped by language, or an empty
           string when no document has been prepared yet.
        """
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        """Enable or disable debug mode."""
        self.debug = debug

    def setRegexFile(self, regexFile):
        """Set the csv file containing the user regexes."""
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

           param regexList: a list of the following form:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Discard any previously loaded rules
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        #Validation rules keep (pattern, language id) only; every
        #other rule is a substitution and keeps all four columns
        substitutionRows = []
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionRows.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionRows)

    def getSubstitutionList(self):
        """Get the user defined substitution list.

           return a four columns list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        formula = self.substitutionRegexFormula
        return formula.getSubstitutionPatterns()

    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

           param regexList: a four columns list of lists:

           [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Every row is kept as one four columns tuple
        substitutionRows = [(row[0], row[1], row[2], row[3])
                            for row in regexList]

        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionRows)

    def getValidationList(self):
        """Get the user defined validation list.

           return a four columns list of lists:

           [u'matching pattern', u'', u'-1', u'0']
        """
        #Rebuild the four columns representation from the
        #stored (pattern, type) pairs
        return [[pattern, "", regexType, "0"]
                for pattern, regexType in self.validationPatternList]

    def setValidationList(self, regexList):
        """Set the user regexes validation list.

           Filter 'regexList' for validation rules only.

           param regexList: a four columns list of lists:

           ['matching pattern', 'substitution', 'type', 'language id']
        """
        #Keep (pattern, language id) for validation rows only
        self.validationPatternList = [
            (row[0], row[3]) for row in regexList
            if int(row[2]) == VALIDATION_TYPE]

    def setLMModeling(self, modelNgram):
        """Enable or disable preparation for language modeling."""
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        """Enable or disable sentence filtering."""
        self.filterSentences = filterSentences

    def setFilterSentences2ndStage(self, filterTextSentences2ndStage):
        """Enable or disable the second filtering stage (web address
           removal and German orthography check, German only).
        """
        self.filterTextSentences2ndStage = filterTextSentences2ndStage

    def setRemovePunctuation(self, removePunctuation):
        """Enable or disable punctuation removal."""
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        """Enable or disable punctuation verbalization."""
        self.verbalizePunctuation = verbalizePunctuation

    def setSegmentWithNLTK(self, segmentWithNLTK):
        """Choose whether sentences are segmented with NLTK."""
        self.segmentWithNLTK = segmentWithNLTK

    def setExpandNumberInWords(self, expandNumberInWords):
        """Choose whether numbers are expanded into words."""
        self.expandNumberInWords = expandNumberInWords

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.

           The classifier is built and trained only once;
           subsequent calls are no-ops.
        """
        if self.wordClassifier != None:
            return

        self.logger.info("Prepare the word classifier ...")
        self.wordClassifier = WordClassifier()
        self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes
           from csv file.
        """
        #Nothing to do when the user supplied no rule file
        if self.regexFile == None:
            return

        #Nothing to do when regexes are already loaded in the API
        alreadyLoaded = self.substitutionRegexFormula.hasPatterns() or \
            len(self.validationPatternList) > 0
        if alreadyLoaded:
            return

        self.setRegexList(RegexList().loadFromFile(self.regexFile))

    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        #Drop substitution rules by installing an empty formula
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        #Drop validation rules
        self.validationPatternList = []

    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
           return the prepared 'TextDocument'
           raise Exception on an unknown language or any
                 preparation error
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[:]))
            # str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList, self.outputDir,
                                    self.segmentWithNLTK,
                                    self.expandNumberInWords)

            #Either a file or a raw text string is imported
            if self.inputFile != None:
                self.logger.info(
                    "Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            #Language id: classify when unknown, trust the caller otherwise
            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info(
                    "Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

            #Second filtering stage applies to German only
            if self.filterTextSentences2ndStage:
                if language == GERMAN:
                    self.logger.info(
                        "Filtering data - 2nd stage (remove web address and check German orthograph)"
                    )
                    self.doc.filterTextSentences2ndStage()

        except Exception as e:
            #NOTE(review): 'e.message' is Python 2 specific -- confirm
            #before running under Python 3
            errorMessage = "An error has occurred when importing sentences: %s\n%s" % \
                             (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)

            raise Exception(e)

        return self.doc

    def outputSentencesToFiles(self, outputDir):
        """Output the original sentences with language
           information to the 'outputFile'
        """
        self.logger.info("Output results to language files.")

        #One empty bucket per supported language label
        sentencesDict = {}
        for label in (FRENCH_LABEL, GERMAN_LABEL, ITALIAN_LABEL,
                      ENGLISH_LABEL, UNKNOWN_LABEL):
            sentencesDict[label] = []

        self.appendDocumentSentences(self.doc, sentencesDict)
        self.outputPerLanguage(sentencesDict, outputDir)

    @staticmethod
    def appendDocumentSentences(textDocument, sentencesDict):
        """Update 'sentencesDict' with the 'textDocument'
           content.
        """
        #Route every sentence to the bucket of its language
        for textCluster in textDocument.getListContent():
            sentence = textCluster.getTextSentence()

            if textCluster.isFrench():
                label = FRENCH_LABEL
            elif textCluster.isGerman():
                label = GERMAN_LABEL
            elif textCluster.isItalian():
                label = ITALIAN_LABEL
            elif textCluster.isEnglish():
                label = ENGLISH_LABEL
            else:
                label = UNKNOWN_LABEL

            #strOut = u"<" + textDocument.sourceFileName + u">: " + sentence
            #Trailing white spaces are not part of the sentence
            sentencesDict[label].append(sentence.rstrip())

    @staticmethod
    def outputPerLanguage(sentencesDict, outputDir):
        """Output sentences in language files.
        """
        io = Ioread()

        #One file per language that has at least one sentence
        for resultLanguage, results in list(sentencesDict.items()):
            if len(results) == 0:
                DataPreparationAPI.logger.info("No sentences found for: %s" %
                                               resultLanguage)
                continue

            DataPreparationAPI.logger.info("%d sentences found for: %s" %
                                           (len(results), resultLanguage))

            #Content ends with exactly one final new line
            strContent = "\n".join(results).rstrip() + "\n"
            outputPath = "%s/sentences_%s.txt" % (outputDir,\
                                                  resultLanguage)
            DataPreparationAPI.logger.info("Writing content to: %s" %
                                           outputPath)
            io.writeFileContent(outputPath, strContent)