Example #1
    def execute(self):
        """Execute the task. A validation of the
           parameter is first done.
        """
        try:
            #Build from taskInfo
            self._buildParametersDictionary()

            #Call child implementation
            self.validateParameters()
            self.setParameters()

            #Copy necessary data into task folder
            self.gatherInputData()

            #Actual work
            self.doWork()

            #Output data
            self.prepareOutputData()

        except Exception as e:
            errorMessage = "An error has occured"
            self._log(logging.CRITICAL, getErrorMessage(e, errorMessage))
            self.setResult(True, errorMessage)
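
execute() above is a template method: it only orchestrates the run, while validateParameters(), setParameters(), gatherInputData(), doWork() and prepareOutputData() are supplied by a subclass (Example #2 shows one real doWork() implementation). Below is a minimal sketch of such a subclass; the base class name Task, the parametersDictionary attribute and the hook bodies are assumptions for illustration, and only the hook and helper names come from these examples.

import logging

#Minimal sketch of a concrete task, assuming a base class named Task that
#provides execute(), _log(), setResult() and _buildParametersDictionary().
#Hook names come from Example #1; attribute names and bodies are illustrative.
class SentenceImportTask(Task):
    def validateParameters(self):
        #Hypothetical parameter key; the real dictionary layout is not shown
        if "mapListFile" not in self.parametersDictionary:
            raise Exception("Missing 'mapListFile' parameter!")

    def setParameters(self):
        #Copy validated parameters onto the instance
        self.mapListFile = self.parametersDictionary["mapListFile"]

    def gatherInputData(self):
        #Copy the necessary input data into the task folder
        self._log(logging.INFO, "Gathering input data")

    def doWork(self):
        #Actual processing, see Example #2 for a real implementation
        self._log(logging.INFO, "Doing the work")

    def prepareOutputData(self):
        #setResult(False, ...) signals success, mirroring Example #2
        self.setResult(False, "Success")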
Example #2
    def doWork(self):
        """The actual upload of sentences.
        """
        self._log(logging.INFO, "Do work!")

        if len(self.mapLists) > 1:
            self._log(logging.CRITICAL,"Only one map list accepted!")

        documentUrl = None

        try:
            #All processed text documents
            textDocumentsList = []
            dictMap = self.mapLists[0].getDictionaryMap()

            totalCount = len(dictMap.keys())
            count = 0

            self._log(logging.INFO, "Temp dir is: %s" % self.getTempDirectory())
            self._log(logging.INFO, "Output dir is: %s" % self.getOutputDirectory())
            self._log(logging.INFO, "%d files to process!" % totalCount)

            #Setup once for all documents
            api = DataPreparationAPI(None, self.getOutputDirectory())
            if self.regexFile is not None and len(self.regexFile) > 0:
                api.setRegexFile(self.regexFile)

            api.setFilterSentences(self.textFiltering)
            api.setDebugMode(self.debug)
            api.setRemovePunctuation(self.removePunctuation)
            api.setVerbalizePunctuation(self.verbalizePunctuation)
            api.setSegmentWithNLTK(self.segmentWithNLTK)
            api.setLMModeling(self.lmModeling)
            api.trainClassifier()

            #Loop through the map file
            for documentName in dictMap.keys():
                for language in dictMap[documentName]:
                    documentUrl = self.inputList.getPath(documentName)

                    #Set the current document information
                    api.setInputFile(documentUrl)
                   
                    #Main processing
                    api.prepareDocument(LANGUAGE2ID[language])
                    textDocumentsList.append(api.getDocument())

                count += 1
                self._log(logging.INFO, "%d remaining files to process!" % (totalCount-count))

            self._log(logging.INFO, "Output results to language files.")
            self.outputSentencesToFiles(textDocumentsList)

            #Outcome of the work to be saved
            self.setResult(False, "Success importing sentences from %s" % self.mapLists[0].getDataMapFile())

        except Exception as e:
            errorMessage = "An error has occurred when importing sentences from %s" % documentUrl
            self._log(logging.CRITICAL, getErrorMessage(e, errorMessage))
            raise e
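
The loop above indexes LANGUAGE2ID with a language name taken from the map file before calling prepareDocument(). The constant itself is not shown in these examples; a plausible definition, inferred from the prepareDocument() docstring in Examples #3 and #4, would be:

#Assumed shape of the LANGUAGE2ID constant used in Example #2, inferred from
#the prepareDocument() docstring (0 = unknown ... 4 = italian). The actual
#keys in the real module may differ.
LANGUAGE2ID = {
    'unknown': 0,
    'french': 1,
    'german': 2,
    'english': 3,
    'italian': 4,
}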
Example #3
    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList,
                                    self.outputDir)
            
            if self.inputFile is not None:
                self.logger.info("Load file, converting to text when it is a pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText is not None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User-supplied regular expressions
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info("Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()
            
            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #Done after the language id has been set, as LM preparation
            #depends on the language (e.g. number expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception as e:
            errorMessage = "An error has occurred when importing sentences: %s\n%s" % (str(e), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)
            
            self.logger.critical(errorMessage)

            raise Exception(e)
Example #4
    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

           param language: an int between 0-4
                - unknown : 0
                - french  : 1
                - german  : 2
                - english : 3
                - italian : 4
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList, self.outputDir,
                                    self.segmentWithNLTK, self.keepNewWords)

            if self.inputFile is not None:
                self.logger.info(
                    "Load file, converting to text when it is a pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText is not None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User-supplied regular expressions
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info(
                    "Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #Done after the language id has been set, as LM preparation
            #depends on the language (e.g. number expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception as e:
            errorMessage = "An error has occurred when importing sentences: %s\n%s" % \
                             (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)

            self.logger.critical(errorMessage)

            raise Exception(e)
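
Putting Examples #2 and #4 together, a caller configures a DataPreparationAPI instance once and then feeds it one document at a time. The sketch below only uses calls that appear in Example #2; the paths, option values and language code are illustrative, and it assumes DataPreparationAPI has been imported from its module (the import path is not shown in these examples).

#Minimal usage sketch; every call below appears in Example #2, the values are
#made up and DataPreparationAPI is assumed to be importable from its module.
outputDir = "/tmp/output"
documentUrl = "/tmp/input/report.pdf"

#Configure once for all documents
api = DataPreparationAPI(None, outputDir)
api.setFilterSentences(True)
api.setDebugMode(False)
api.setRemovePunctuation(False)
api.setVerbalizePunctuation(False)
api.setSegmentWithNLTK(True)
api.setLMModeling(True)
api.trainClassifier()

#Per-document processing
api.setInputFile(documentUrl)
api.prepareDocument(1)   #1 = french, per the docstring in Examples #3 and #4
doc = api.getDocument()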