def testLoadRegexes(self):
    api = DataPreparationAPI(None, None)
    api.setRegexFile(self.regexFile)
    api.getRegexes()

    self.assertTrue(api.substitutionRegexFormula.hasPatterns())
    self.assertTrue(len(api.validationPatternList) > 0)
    self.assertTrue(len(api.substitutionRegexFormula.substitutionPatternList[0]) > 1)
def testPrepareDocumentSimple(self):
    api = DataPreparationAPI(None, None)
    api.setRegexFile(self.regexFile)
    api.setLMModeling(True)

    try:
        api.setFormattedText(u'"')
        api.prepareDocument(1)
    except Exception:
        self.fail("Should not raise an exception")
def outputSentencesToFiles(self, textDocumentsList):
    """Output the original sentences, grouped by language,
       to files in the temporary directory.
    """
    sentencesDict = {FRENCH_LABEL: [], GERMAN_LABEL: [], ITALIAN_LABEL: [],
                     ENGLISH_LABEL: [], UNKNOWN_LABEL: []}

    for textDocument in textDocumentsList:
        DataPreparationAPI.appendDocumentSentences(textDocument, sentencesDict)

    DataPreparationAPI.outputPerLanguage(sentencesDict, self.getTempDirectory())
def testPrepareDocument(self):
    api = DataPreparationAPI(None, None)
    api.setRegexFile(self.regexFile)
    api.setLMModeling(True)

    for languageId, strFileName in self.testFileList:
        self.logger.info("Testing %s" % strFileName)
        testList = self.getTestList(strFileName)
        for test, gt, bDiscard in testList:
            if int(bDiscard):
                continue
            # Main call
            api.setFormattedText(test)
            api.prepareDocument(languageId)
            formattedText = api.getCleanedText()
            self.assertEquals(formattedText.encode('utf-8'), gt.encode('utf-8'),
                              "'%s' is not '%s':%s" % (formattedText.encode('utf-8'),
                                                       gt.encode('utf-8'), strFileName))
# Parse arguments
args = parser.parse_args()
inputFile = args.inputFile[0]
outputDir = args.outputDir[0]
language = int(args.language[0])
regexFile = args.regexFile[0]

# Flags
debug = args.debug
filterSentences = args.filter
removePunctuation = args.rmpunct
verbalizePunctuation = args.vbpunct
lmModeling = args.lm

setupLogging(logging.INFO, outputDir + "/task_log.txt")

# Api setup
api = DataPreparationAPI(inputFile, outputDir)
api.setRegexFile(regexFile)
api.setFilterSentences(filterSentences)
api.setLMModeling(lmModeling)
api.setRemovePunctuation(removePunctuation)
api.setVerbalizePunctuation(verbalizePunctuation)

if language == 0:
    api.trainClassifier()

# Main processing
api.prepareDocument(language)
api.outputSentencesToFiles(outputDir)
def testPrepareDocumentBasic(self):
    testString = ur"/ HES-SO und AdG/LA - auch im Winter / Sommer -"
    gtString = ur"hes-so und adg/la auch im winter sommer"

    api = DataPreparationAPI(None, None)
    api.setRegexFile(self.regexFile)
    api.setLMModeling(True)
    api.setKeepNewWords(True)
    api.setFormattedText(testString)
    api.prepareDocument(2)

    formattedText = api.getCleanedText()
    self.assertEquals(gtString.encode('utf-8'), formattedText.encode('utf-8'))
def doWork(self):
    """The actual upload of sentences.
    """
    self._log(logging.INFO, "Do work!")

    if len(self.mapLists) > 1:
        self._log(logging.CRITICAL, "Only one map list accepted!")

    documentUrl = None
    try:
        # All pdf documents
        textDocumentsList = []
        dictMap = self.mapLists[0].getDictionaryMap()
        totalCount = len(dictMap.keys())
        count = 0

        self._log(logging.INFO, "Temp dir is: %s" % self.getTempDirectory())
        self._log(logging.INFO, "Output dir is: %s" % self.getOutputDirectory())
        self._log(logging.INFO, "%d files to process!" % totalCount)

        # Setup once for all documents
        api = DataPreparationAPI(None, self.getOutputDirectory())
        if self.regexFile is not None and len(self.regexFile) > 0:
            api.setRegexFile(self.regexFile)
        api.setFilterSentences(self.textFiltering)
        api.setDebugMode(self.debug)
        api.setRemovePunctuation(self.removePunctuation)
        api.setVerbalizePunctuation(self.verbalizePunctuation)
        api.setLMModeling(self.lmModeling)

        # Loop through map file
        for documentName in dictMap.keys():
            for language in dictMap[documentName]:
                documentUrl = self.inputList.getPath(documentName)

                # Set the current document information
                api.setInputFile(documentUrl)

                # Main processing
                api.prepareDocument(LANGUAGE2ID[language])
                textDocumentsList.append(api.getDocument())
                count += 1
                self._log(logging.INFO, "%d remaining files to process!" % (totalCount - count))

        self._log(logging.INFO, "Output results to language files.")
        self.outputSentencesToFiles(textDocumentsList)

        # Outcome of the work to be saved
        self.setResult(False, "Success importing sentences from %s" %
                       self.mapLists[0].getDataMapFile())
    except Exception, e:
        errorMessage = "An error has occurred when importing sentences from %s" % documentUrl
        self._log(logging.CRITICAL, getErrorMessage(e, errorMessage))
        raise e
def doWork(self):
    """The actual upload of sentences.
    """
    self._log(logging.INFO, "Do work!")

    if len(self.mapLists) > 1:
        self._log(logging.CRITICAL, "Only one map list accepted!")

    documentUrl = None
    try:
        # All pdf documents
        textDocumentsList = []
        dictMap = self.mapLists[0].getDictionaryMap()
        totalCount = len(dictMap.keys())
        count = 0

        self._log(logging.INFO, "Temp dir is: %s" % self.getTempDirectory())
        self._log(logging.INFO, "Output dir is: %s" % self.getOutputDirectory())
        self._log(logging.INFO, "%d files to process!" % totalCount)

        # Setup once for all documents
        api = DataPreparationAPI(None, self.getOutputDirectory())
        if self.regexFile is not None and len(self.regexFile) > 0:
            api.setRegexFile(self.regexFile)
        api.setFilterSentences(self.textFiltering)
        api.setDebugMode(self.debug)
        api.setRemovePunctuation(self.removePunctuation)
        api.setVerbalizePunctuation(self.verbalizePunctuation)
        api.setSegmentWithNLTK(self.segmentWithNLTK)
        api.setLMModeling(self.lmModeling)
        api.trainClassifier()

        # Loop through map file
        for documentName in dictMap.keys():
            for language in dictMap[documentName]:
                documentUrl = self.inputList.getPath(documentName)

                # Set the current document information
                api.setInputFile(documentUrl)

                # Main processing
                api.prepareDocument(LANGUAGE2ID[language])
                textDocumentsList.append(api.getDocument())
                count += 1
                self._log(logging.INFO, "%d remaining files to process!" % (totalCount - count))

        self._log(logging.INFO, "Output results to language files.")
        self.outputSentencesToFiles(textDocumentsList)

        # Outcome of the work to be saved
        self.setResult(False, "Success importing sentences from %s" %
                       self.mapLists[0].getDataMapFile())
    except Exception, e:
        errorMessage = "An error has occurred when importing sentences from %s" % documentUrl
        self._log(logging.CRITICAL, getErrorMessage(e, errorMessage))
        raise e
regexFile = args.regexFile[0]

# Flags
debug = bool(args.debug)
filterSentences = bool(args.filter)
filterSentences2ndStage = bool(args.filter2ndStage)
removePunctuation = bool(args.rmpunct)
verbalizePunctuation = bool(args.vbpunct)
rawSeg = bool(args.rawseg)
lmModeling = bool(args.lm)
expandNumberInWords = bool(not args.trim)

setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

# Api setup
api = DataPreparationAPI(None, outputDir)
api.setRegexFile(regexFile)
api.setFilterSentences(filterSentences)
api.setFilterSentences2ndStage(filterSentences2ndStage)
api.setLMModeling(lmModeling)
api.setRemovePunctuation(removePunctuation)
api.setVerbalizePunctuation(verbalizePunctuation)
api.setSegmentWithNLTK(not rawSeg)
api.setExpandNumberInWords(expandNumberInWords)

if language == 0:
    api.trainClassifier()

# Main processing
MyFile.checkDirExists(outputDir)
regexFile = args.regexFile[0]

# Flags
debug = bool(args.debug)
filterSentences = bool(args.filter)
filterSentences2ndStage = bool(args.filter2ndStage)
removePunctuation = bool(args.rmpunct)
verbalizePunctuation = bool(args.vbpunct)
rawSeg = bool(args.rawseg)
lmModeling = bool(args.lm)
keepNewWords = bool(not args.trim)

setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

# Api setup
api = DataPreparationAPI(None, outputDir)
api.setRegexFile(regexFile)
api.setFilterSentences(filterSentences)
api.setFilterSentences2ndStage(filterSentences2ndStage)
api.setLMModeling(lmModeling)
api.setRemovePunctuation(removePunctuation)
api.setVerbalizePunctuation(verbalizePunctuation)
api.setSegmentWithNLTK(not rawSeg)
api.setKeepNewWords(keepNewWords)

if language == 0:
    api.trainClassifier()

# Main processing
MyFile.checkDirExists(outputDir)
regexFile = args.regexFile[0]

# Flags
debug = bool(args.debug)
filterSentences = bool(args.filter)
filterSentences2ndStage = bool(args.filter2ndStage)
removePunctuation = bool(args.rmpunct)
verbalizePunctuation = bool(args.vbpunct)
rawSeg = bool(args.rawseg)
lmModeling = bool(args.lm)
keepNewWords = bool(not args.trim)

setupLogging(logging.INFO, outputDir + "/task_log.txt")

# Api setup
api = DataPreparationAPI(inputFile, outputDir)
api.setRegexFile(regexFile)
api.setFilterSentences(filterSentences)
api.setFilterSentences2ndStage(filterSentences2ndStage)
api.setLMModeling(lmModeling)
api.setRemovePunctuation(removePunctuation)
api.setVerbalizePunctuation(verbalizePunctuation)
api.setSegmentWithNLTK(not rawSeg)
api.setKeepNewWords(keepNewWords)

if language == 0:
    api.trainClassifier()

# Main processing
api.prepareDocument(language)
api.outputSentencesToFiles(outputDir)
def testPrepareDocumentBasic(self):
    testString = r"/ HES-SO und AdG/LA - auch im Winter / Sommer -"
    gtString = r"hes-so und adg/la auch im winter sommer"

    api = DataPreparationAPI(None, None)
    api.setRegexFile(self.regexFile)
    api.setLMModeling(True)
    api.setExpandNumberInWords(False)
    api.setFormattedText(testString)
    api.prepareDocument(2)

    formattedText = api.getCleanedText()
    self.assertEqual(gtString, formattedText)
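# Minimal standalone usage sketch, outside the test suite. It only uses calls
# already shown above; the path "regexes.csv" is a hypothetical regex rules file
# and language id 2 is assumed to match the German example in the tests.
if __name__ == "__main__":
    api = DataPreparationAPI(None, None)
    api.setRegexFile("regexes.csv")
    api.setLMModeling(True)
    api.setFormattedText(u"HES-SO und AdG/LA - auch im Winter")
    api.prepareDocument(2)
    print api.getCleanedText()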