def execute(self):
    """Execute the task.

    Runs the full pipeline: parameter validation, input data
    gathering, the actual work and output preparation.  Any
    failure is logged and recorded in the task result instead of
    being propagated to the caller.
    """
    try:
        # Build from taskInfo
        self._buildParametersDictionary()
        # Call child implementation
        self.validateParameters()
        self.setParameters()
        # Copy necessary data into task folder
        self.gatherInputData()
        # Actual work
        self.doWork()
        # Output data
        self.prepareOutputData()
    except Exception as e:
        # Fixed typo in user-facing message: 'occured' -> 'occurred'
        errorMessage = "An error has occurred"
        self._log(logging.CRITICAL, getErrorMessage(e, errorMessage))
        # First argument flags the task result as failed
        # (doWork passes False on success).
        self.setResult(True, errorMessage)
def doWork(self):
    """The actual upload of sentences.

    Prepares every document referenced by the first map list with a
    shared DataPreparationAPI instance and writes the resulting
    sentences to per-language output files.

    Raises: re-raises any exception after logging it as CRITICAL.
    """
    self._log(logging.INFO, "Do work!")

    if len(self.mapLists) > 1:
        # NOTE(review): processing continues with mapLists[0] only;
        # confirm whether this should abort instead of just logging.
        self._log(logging.CRITICAL, "Only one map list accepted!")

    documentUrl = None
    try:
        # All pdf documents
        textDocumentsList = []
        dictMap = self.mapLists[0].getDictionaryMap()
        totalCount = len(dictMap)
        count = 0

        self._log(logging.INFO, "Temp dir is: %s" % self.getTempDirectory())
        self._log(logging.INFO, "Output dir is: %s" % self.getOutputDirectory())
        self._log(logging.INFO, "%d files to process!" % totalCount)

        # Setup once for all documents
        api = DataPreparationAPI(None, self.getOutputDirectory())
        # Truthiness covers both None and empty string
        if self.regexFile:
            api.setRegexFile(self.regexFile)
        api.setFilterSentences(self.textFiltering)
        api.setDebugMode(self.debug)
        api.setRemovePunctuation(self.removePunctuation)
        api.setVerbalizePunctuation(self.verbalizePunctuation)
        api.setSegmentWithNLTK(self.segmentWithNLTK)
        api.setLMModeling(self.lmModeling)
        api.trainClassifier()

        # Loop through map file
        for documentName in dictMap:
            for language in dictMap[documentName]:
                documentUrl = self.inputList.getPath(documentName)
                # Set the current document information
                api.setInputFile(documentUrl)
                # Main processing
                api.prepareDocument(LANGUAGE2ID[language])
                textDocumentsList.append(api.getDocument())
            count += 1
            self._log(logging.INFO,
                      "%d remaining files to process!" % (totalCount - count))

        self._log(logging.INFO, "Output results to language files.")
        self.outputSentencesToFiles(textDocumentsList)

        # Outcome of the work to be saved
        self.setResult(False, "Success importing sentences from %s"
                              % self.mapLists[0].getDataMapFile())
    except Exception as e:
        # Fixed grammar in message: 'as occurred' -> 'has occurred'
        errorMessage = ("An error has occurred when importing sentences "
                        "from %s" % documentUrl)
        self._log(logging.CRITICAL, getErrorMessage(e, errorMessage))
        # Bare raise preserves the original traceback
        raise
def prepareDocument(self, language=0):
    """Segment the document into sentences and prepare them.

    param language: an int between 0-4
        - unknown : 0
        - french  : 1
        - german  : 2
        - english : 3
        - italian : 4

    Raises Exception for an out-of-range language, for missing
    input, or (wrapped) for any failure during preparation.
    """
    if language > 4 or language < 0:
        raise Exception("Unknown language")

    # Done at the API level to share resources between documents
    self.logger.info("Getting regexes")
    self.getRegexes()

    if self.substitutionRegexFormula.hasPatterns():
        self.logger.info("Using following regexes substitution:\n" +
                         str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))
    if len(self.validationPatternList) > 0:
        self.logger.info("Using following regexes for sentence validation:\n" +
                         str(self.validationPatternList[0:3]))

    try:
        self.logger.info("Document file: %s" % self.inputFile)

        # The main document
        self.doc = TextDocument(self.inputFile, language,
                                self.substitutionRegexFormula,
                                self.validationPatternList,
                                self.outputDir)

        if self.inputFile is not None:
            self.logger.info("Load file, convert to text when pdf document")
            self.doc.loadDocumentAsSentences(self.tempDir)
        elif self.formattedText is not None:
            self.logger.info("Load text string as sentences")
            self.doc.loadAsSentences(self.formattedText)
        else:
            raise Exception("No input file or text string provided!")

        # Control character and strip
        self.logger.info("Cleaning control characters")
        self.doc.cleanTextSentences()

        if language == 0:
            self.logger.info("Classifying sentences")
            self.doc.setClassifier(self.wordClassifier)
            self.doc.classifySentences()
        else:
            self.doc.setSentencesLanguage(language)

        # User's supplied regular expression
        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Applying user regular expressions per language")
            self.doc.normalizeTextSentences()

        if self.filterSentences:
            self.logger.info("Filtering data")
            self.doc.filterTextSentences()

        # If LM option is selected, punctuation removal is done at
        # the prepareLM stage
        if self.removePunctuation and not self.lmModeling:
            self.doc.removeTextPunctuation()
        if self.verbalizePunctuation and not self.removePunctuation:
            self.doc.verbalizeTextPunctuation()

        # After language id has been set as it depends of
        # languages (i.e. numbers expansion)
        if self.lmModeling:
            self.logger.info("Preparing for language modeling")
            self.doc.prepareLM()
    except Exception as e:
        # Fixed grammar in message: 'as occurred' -> 'has occurred'
        errorMessage = "An error has occurred when importing sentences: %s\n%s" % (
            str(e), self.inputFile)
        errorMessage = getErrorMessage(e, errorMessage)
        self.logger.critical(errorMessage)
        # NOTE(review): wrapping in Exception(e) discards the original
        # traceback and type; kept for caller compatibility.
        raise Exception(e)
def prepareDocument(self, language=0):
    """Segment the document into sentences and prepare them.

    param language: an int between 0-4
        - unknown : 0
        - french  : 1
        - german  : 2
        - english : 3
        - italian : 4

    Raises Exception for an out-of-range language, for missing
    input, or (wrapped) for any failure during preparation.
    """
    if language > 4 or language < 0:
        raise Exception("Unknown language")

    # Done at the API level to share resources between documents
    self.logger.info("Getting regexes")
    self.getRegexes()

    if self.substitutionRegexFormula.hasPatterns():
        self.logger.info("Using following regexes substitution:\n" +
                         str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))
    if len(self.validationPatternList) > 0:
        self.logger.info("Using following regexes for sentence validation:\n" +
                         str(self.validationPatternList[0:3]))

    try:
        self.logger.info("Document file: %s" % self.inputFile)

        # The main document
        self.doc = TextDocument(self.inputFile, language,
                                self.substitutionRegexFormula,
                                self.validationPatternList,
                                self.outputDir,
                                self.segmentWithNLTK,
                                self.keepNewWords)

        if self.inputFile is not None:
            self.logger.info("Load file, convert to text when pdf document")
            self.doc.loadDocumentAsSentences(self.tempDir)
        elif self.formattedText is not None:
            self.logger.info("Load text string as sentences")
            self.doc.loadAsSentences(self.formattedText)
        else:
            raise Exception("No input file or text string provided!")

        # Control character and strip
        self.logger.info("Cleaning control characters")
        self.doc.cleanTextSentences()

        if language == 0:
            self.logger.info("Classifying sentences")
            self.doc.setClassifier(self.wordClassifier)
            self.doc.classifySentences()
        else:
            self.doc.setSentencesLanguage(language)

        # User's supplied regular expression
        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Applying user regular expressions per language")
            self.doc.normalizeTextSentences()

        if self.filterSentences:
            self.logger.info("Filtering data")
            self.doc.filterTextSentences()

        # If LM option is selected, punctuation removal is done at
        # the prepareLM stage
        if self.removePunctuation and not self.lmModeling:
            self.doc.removeTextPunctuation()
        if self.verbalizePunctuation and not self.removePunctuation:
            self.doc.verbalizeTextPunctuation()

        # After language id has been set as it depends of
        # languages (i.e. numbers expansion)
        if self.lmModeling:
            self.logger.info("Preparing for language modeling")
            self.doc.prepareLM()
    except Exception as e:
        # Fixed grammar in message: 'as occurred' -> 'has occurred'
        # NOTE(review): e.message is Python 2 only and deprecated;
        # kept because getByteString apparently expects it — confirm.
        errorMessage = "An error has occurred when importing sentences: %s\n%s" % \
            (getByteString(e.message), self.inputFile)
        errorMessage = getErrorMessage(e, errorMessage)
        self.logger.critical(errorMessage)
        # NOTE(review): wrapping in Exception(e) discards the original
        # traceback and type; kept for caller compatibility.
        raise Exception(e)