def loadTextFile(self):
    """Read the converted temporary text file into ``self.sentencesList``.

    The path is taken from ``self.tempFilePath``; the value stored in
    ``self.sentencesList`` is whatever ``Ioread.readFileContentList``
    returns for that path.

    Raises:
        Exception: if ``self.tempFilePath`` is ``None`` or
            ``MyFile.checkFileExists`` reports the file as missing.
    """
    tempPath = self.tempFilePath

    # The None test must come first so the existence check is never
    # invoked without a path.
    isReadable = tempPath is not None and MyFile.checkFileExists(tempPath)
    if not isReadable:
        raise Exception("Temporary text file does not exist!")

    reader = Ioread()
    self.sentencesList = reader.readFileContentList(tempPath)
def loadTextFile(self):
    """Load the converted text file referenced by ``self.tempFilePath``.

    On success the result of ``Ioread.readFileContentList`` for that path
    is stored in ``self.sentencesList``.

    Raises:
        Exception: when no temporary path has been set, or when
            ``MyFile.checkFileExists`` returns a falsy value for it.
    """
    missingMessage = "Temporary text file does not exist!"

    # Guard 1: no path was ever recorded.
    if self.tempFilePath is None:
        raise Exception(missingMessage)

    # Guard 2: a path is recorded but the file is not there.
    if not MyFile.checkFileExists(self.tempFilePath):
        raise Exception(missingMessage)

    self.sentencesList = Ioread().readFileContentList(self.tempFilePath)
class TestIoread(unittest.TestCase):
    """Unit tests for the ``Ioread`` file reading and writing helper.

    The fixture files under ``scriptsDir + "/resources"`` are expected to
    contain the UTF-8 text mirrored by ``testsString`` and ``testList``.
    """

    logger = logging.getLogger("Asrt.TestIoread")
    testFile = scriptsDir + "/resources/ioread_utf8.txt"
    testFileCSV = scriptsDir + "/resources/ioread_utf8.csv"

    # [0] is the full expected content of ``testFile``;
    # [1] is the expected third entry returned by ``readFileContentList``.
    testsString = [
        """Utf-8 test\nLatin characters é à ä\nNon latin characters 镕\n""",
        """Non latin characters 镕"""
    ]

    # Expected rows returned by ``readCSV`` for ``testFileCSV``.
    testList = [[
        'Utf-8 test', 'Latin characters é à ä', 'Non latin characters 镕'
    ]]

    def setUp(self):
        """Create a fresh ``Ioread`` instance for every test."""
        self.ioread = Ioread()

    ############
    # Tests
    #
    def testOpenFile(self):
        """Opening and closing the fixture file must not raise."""
        try:
            fd = self.ioread.openFile(self.testFile)
            self.ioread.closeFile(fd)
        except Exception as error:
            # Include the caught exception so a failure is diagnosable.
            self.fail("testOpenFile raised an exception unexpectedly: %r"
                      % (error,))

    def testReadFileContent(self):
        """``readFileContent`` returns the whole file as one string."""
        strContent = self.ioread.readFileContent(self.testFile)
        self.assertEqual(self.testsString[0], strContent)

    def testReadFileContentList(self):
        """``readFileContentList`` returns one entry per line of the file."""
        strContentList = self.ioread.readFileContentList(self.testFile)
        self.assertEqual(3, len(strContentList))
        self.assertEqual(self.testsString[1], strContentList[2])

    def testReadCSV(self):
        """``readCSV`` returns the single expected row of the CSV fixture."""
        strContentList = self.ioread.readCSV(self.testFileCSV)
        self.assertEqual(1, len(strContentList))
        self.assertEqual(self.testList, strContentList)

    def testWriteFileContent(self):
        """Content written with ``writeFileContent`` reads back unchanged."""
        strContent = self.testsString[0]
        outputPath = TEMPDIRUNITTEST + "/test.txt"
        self.ioread.writeFileContent(outputPath, strContent)

        # Read back the file that was just written (not the read-only
        # fixture), otherwise the write itself is never verified.
        readStrContent = self.ioread.readFileContent(outputPath)
        self.assertEqual(strContent, readStrContent)
# Log at INFO level into a file placed in the output directory.
setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

# Api setup: forward every option to the data preparation API.
api = DataPreparationAPI(None, outputDir)
api.setRegexFile(regexFile)
api.setFilterSentences(filterSentences)
api.setFilterSentences2ndStage(filterSentences2ndStage)
api.setLMModeling(lmModeling)
api.setRemovePunctuation(removePunctuation)
api.setVerbalizePunctuation(verbalizePunctuation)
# The API option is the inverse of the raw-segmentation flag.
api.setSegmentWithNLTK(not rawSeg)
api.setKeepNewWords(keepNewWords)

# The classifier is only trained when the language code is 0.
if language == 0:
    api.trainClassifier()

# Main processing: 'inputList' initially names a file; it is replaced by
# the entries read from that file, and each entry is used as an input path.
MyFile.checkDirExists(outputDir)
io = Ioread()
inputList = io.readFileContentList(inputList)

for documentPath in inputList:
    api.setInputFile(documentPath)
    api.prepareDocument(language)
    cleanedText = api.getCleanedText()

    # Output is named after the input file, with its extension replaced
    # by '.lab', and written into the output directory.
    documentStem = os.path.splitext(os.path.basename(documentPath))[0]
    labelFile = "%s/%s.lab" % (outputDir, documentStem)
    io.writeFileContent(labelFile, cleanedText + u"\n")
# Route INFO-level log output to a file inside the output directory.
setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

# Api setup: configure the data preparation API from the script options.
api = DataPreparationAPI(None, outputDir)
api.setRegexFile(regexFile)
api.setFilterSentences(filterSentences)
api.setFilterSentences2ndStage(filterSentences2ndStage)
api.setLMModeling(lmModeling)
api.setRemovePunctuation(removePunctuation)
api.setVerbalizePunctuation(verbalizePunctuation)
# NLTK segmentation is enabled exactly when raw segmentation is not.
api.setSegmentWithNLTK(not rawSeg)
api.setExpandNumberInWords(expandNumberInWords)

# Classifier training happens only for language code 0.
if language == 0:
    api.trainClassifier()

# Main processing: read the list of input paths from the file named by
# 'inputList', then prepare each document and write its cleaned text.
MyFile.checkDirExists(outputDir)
io = Ioread()
inputList = io.readFileContentList(inputList)

for sourcePath in inputList:
    api.setInputFile(sourcePath)
    api.prepareDocument(language)
    preparedText = api.getCleanedText()

    # '<outputDir>/<input base name without extension>.lab'
    sourceName = os.path.basename(sourcePath)
    sourceStem = os.path.splitext(sourceName)[0]
    io.writeFileContent("%s/%s.lab" % (outputDir, sourceStem),
                        preparedText + "\n")