def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       Every line of 'inputFile' is stripped of its surrounding
       whitespace and passed to the regex formula; the results are
       joined with newlines and written to 'outputFile'.

       params:
          - inputFile   : a text file in 'utf-8' encoding
          - outputFile  : the result text file in 'utf-8' encoding
          - regularFile : the file containing the regular
                          expressions to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)
    io = Ioread()

    linesList = []
    fd = io.openFile(inputFile)
    try:
        #readline() returns "" at end of file, which stops the iteration
        for count, line in enumerate(iter(fd.readline, ""), 1):
            #Apply the rules to the stripped line
            linesList.append(regexFormula.apply(line.strip(), FRENCH))

            #Progress report for large inputs
            if count % 50000 == 0:
                print("Processed %d values" % count)
    finally:
        #Release the descriptor even when a rule raises
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
def text2text(sourcePath, destinationPath, logDir):
    """Copy the text file 'sourcePath' into 'destinationPath'.

       The whole content is read through Ioread and written back
       unchanged. 'logDir' is accepted but not used by this function.
    """
    TextRepresentation.logger.info(
        "Copying txt file: " + sourcePath + " into text.")

    fileHandler = Ioread()

    #Read everything first, then write it to the destination
    fileHandler.writeFileContent(destinationPath,
                                 fileHandler.readFileContent(sourcePath))
def execute(commandList, logPath, outFileName=None, errFileName=None):
    """Wrapper to execute a sub process.

       params:
          - commandList : the command and its arguments, passed
                          as is to 'subprocess.Popen'
          - logPath     : directory receiving the log files
          - outFileName : optional file name, under 'logPath', for
                          the captured standard output
          - errFileName : optional file name, under 'logPath', for
                          the captured standard error. When None,
                          stderr is merged into stdout.

       return a tuple (retCode, stdout, stderr). 'stdout' and 'stderr'
       are the values returned by 'communicate()'. If launching or
       running the process raises, 'retCode' is 1 and 'stderr' is a
       text message holding the command and the stack trace.
    """
    #Make sure the directory exists
    MyFile.checkDirExists(logPath)

    stdout, stderr, retCode = None, None, 0
    try:
        #Default to one log, a separate stderr pipe is only
        #requested when an error file name is given
        stderrTarget = subprocess.STDOUT
        if errFileName is not None:
            stderrTarget = subprocess.PIPE

        #Start the process exactly once
        p = subprocess.Popen(commandList, stdout=subprocess.PIPE,
                             stderr=stderrTarget)

        #Run the subprocess
        stdout, stderr = p.communicate()
        retCode = p.poll()
    except Exception as e:
        AsrtSubprocess.logger.critical("Subprocess error: %s" % str(e))
        errorMessage = str(commandList) + "\n" + \
                       "------------ Begin stack ------------\n" + \
                       traceback.format_exc().rstrip() + "\n" + \
                       "------------ End stack --------------"
        print(errorMessage)

        #Make sure the trace is logged
        if stderr is None:
            stderr = errorMessage
        else:
            #Captured output is bytes, decode before appending text
            if isinstance(stderr, bytes):
                stderr = stderr.decode('utf-8')
            stderr += errorMessage
        retCode = 1

    #Now log results
    #It is important to be outside exception management as we
    #still want to log what happened
    io = Ioread()

    if stdout and outFileName is not None:
        outText = stdout
        if isinstance(outText, bytes):
            outText = outText.decode('utf-8')
        io.writeFileContent("%s/%s" % (logPath, outFileName), outText)

    if stderr and errFileName is not None:
        #stderr is already text when it holds the stack trace
        errText = stderr
        if isinstance(errText, bytes):
            errText = errText.decode('utf-8')
        io.writeFileContent("%s/%s" % (logPath, errFileName), errText)

    return retCode, stdout, stderr
def dumpAttributeContent(self, attributeName, outputFileName):
    """Write to disk the content of 'attributeName'.

       params:
          - attributeName  : name given to 'self.getAttribute'
          - outputFileName : path of the file to write

       return False when the attribute value is None (nothing is
       written), True once the content has been handed to
       'Ioread.writeFileContent'.
    """
    attributeContent = self.getAttribute(attributeName)

    #Identity test, so a value with a custom '__eq__' cannot
    #be mistaken for a missing attribute
    if attributeContent is None:
        return False

    io = Ioread()
    io.writeFileContent(outputFileName, attributeContent)

    return True
def outputPerLanguage(sentencesDict, outputDir):
    """Write the sentences of each language into its own file.

       For every (language, sentences) pair of 'sentencesDict' that
       has at least one sentence, the sentences are joined with
       newlines, trailing whitespace is removed, a final newline is
       added and the text is written to
       '<outputDir>/sentences_<language>.txt'. Languages without
       sentences are only reported in the log.
    """
    writer = Ioread()

    for language, sentences in sentencesDict.items():
        #Nothing to write for this language, just report it
        if len(sentences) == 0:
            DataPreparationAPI.logger.info("No sentences found for: %s" % language)
            continue

        DataPreparationAPI.logger.info("%d sentences found for: %s" % (len(sentences), language))

        #Exactly one newline at the end of the content
        content = "\n".join(sentences).rstrip() + "\n"

        targetPath = "%s/sentences_%s.txt" % (outputDir, language)
        DataPreparationAPI.logger.info("Writing content to: %s" % targetPath)
        writer.writeFileContent(targetPath, content)
class TestIoread(unittest.TestCase):
    """Unit tests for the 'Ioread' helper.

       The read tests run against the fixture files found under
       'scriptsDir'/resources, the write test uses a file created
       under 'TEMPDIRUNITTEST'.
    """
    logger = logging.getLogger("Asrt.TestIoread")

    #Fixture files
    testFile = scriptsDir + "/resources/ioread_utf8.txt"
    testFileCSV = scriptsDir + "/resources/ioread_utf8.csv"

    #Expected full content of 'testFile' and expected third line
    testsString = [
        """Utf-8 test\nLatin characters é à ä\nNon latin characters 镕\n""",
        """Non latin characters 镕"""
    ]

    #Expected rows of 'testFileCSV'
    testList = [[
        'Utf-8 test', 'Latin characters é à ä', 'Non latin characters 镕'
    ]]

    def setUp(self):
        self.ioread = Ioread()

    ############
    # Tests
    #
    def testOpenFile(self):
        """Opening and closing the fixture file must not raise."""
        try:
            fd = self.ioread.openFile(self.testFile)
            self.ioread.closeFile(fd)
        except Exception as e:
            self.fail("testOpenFile raised an exception unexpectedly: %s" % e)

    def testReadFileContent(self):
        """The whole fixture file is returned as one string."""
        strContent = self.ioread.readFileContent(self.testFile)
        self.assertEqual(self.testsString[0], strContent)

    def testReadFileContentList(self):
        """The fixture file is returned as a list of three lines."""
        strContentList = self.ioread.readFileContentList(self.testFile)
        self.assertEqual(3, len(strContentList))
        self.assertEqual(self.testsString[1], strContentList[2])

    def testReadCSV(self):
        """The CSV fixture is returned as a single row of values."""
        strContentList = self.ioread.readCSV(self.testFileCSV)
        self.assertEqual(1, len(strContentList))
        self.assertEqual(strContentList, self.testList)

    def testWriteFileContent(self):
        """Content written to disk is read back unchanged."""
        strContent = self.testsString[0]
        writtenFile = TEMPDIRUNITTEST + "/test.txt"
        self.ioread.writeFileContent(writtenFile, strContent)

        #Read back the file that was just written, not the fixture,
        #otherwise the write is never verified
        readStrContent = self.ioread.readFileContent(writtenFile)
        self.assertEqual(strContent, readStrContent)
def outputPerLanguage(sentencesDict, outputDir):
    """Dump the sentences found for each language to disk.

       'sentencesDict' maps a language to its sentences. A language
       with sentences gets a file named 'sentences_<language>.txt'
       under 'outputDir', containing the sentences one per line and
       ending with a single newline. A language without sentences
       only produces a log entry.
    """
    fileWriter = Ioread()

    for lang, lines in sentencesDict.items():
        nbLines = len(lines)

        if nbLines > 0:
            DataPreparationAPI.logger.info("%d sentences found for: %s" % (nbLines, lang))

            #Join, then normalise the end of the text to one newline
            text = "\n".join(lines)
            text = text.rstrip() + "\n"

            destination = "%s/sentences_%s.txt" % (outputDir, lang)
            DataPreparationAPI.logger.info("Writing content to: %s" % destination)
            fileWriter.writeFileContent(destination, text)
        else:
            DataPreparationAPI.logger.info("No sentences found for: %s" % lang)
setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt") #Api setup api = DataPreparationAPI(None, outputDir) api.setRegexFile(regexFile) api.setFilterSentences(filterSentences) api.setFilterSentences2ndStage(filterSentences2ndStage) api.setLMModeling(lmModeling) api.setRemovePunctuation(removePunctuation) api.setVerbalizePunctuation(verbalizePunctuation) api.setSegmentWithNLTK(not rawSeg) api.setKeepNewWords(keepNewWords) if language == 0: api.trainClassifier() #Main processing MyFile.checkDirExists(outputDir) io = Ioread() inputList = io.readFileContentList(inputList) for i, f in enumerate(inputList): api.setInputFile(f) api.prepareDocument(language) strUnformatted = api.getCleanedText() outputFile = "%s/%s.lab" % (outputDir, os.path.splitext(os.path.basename(f))[0]) io.writeFileContent(outputFile, strUnformatted + u"\n")
setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt") # Api setup api = DataPreparationAPI(None, outputDir) api.setRegexFile(regexFile) api.setFilterSentences(filterSentences) api.setFilterSentences2ndStage(filterSentences2ndStage) api.setLMModeling(lmModeling) api.setRemovePunctuation(removePunctuation) api.setVerbalizePunctuation(verbalizePunctuation) api.setSegmentWithNLTK(not rawSeg) api.setExpandNumberInWords(expandNumberInWords) if language == 0: api.trainClassifier() # Main processing MyFile.checkDirExists(outputDir) io = Ioread() inputList = io.readFileContentList(inputList) for i, f in enumerate(inputList): api.setInputFile(f) api.prepareDocument(language) strUnformatted = api.getCleanedText() outputFile = "%s/%s.lab" % (outputDir, os.path.splitext(os.path.basename(f))[0]) io.writeFileContent(outputFile, strUnformatted + "\n")