def loadTextFile(self):
    """Read the converted text file into ``self.sentencesList``.

    The file at ``self.tempFilePath`` is read with
    ``Ioread.readFileContentList`` and the result is stored on the
    instance.

    Raises:
        Exception: if ``self.tempFilePath`` is ``None`` or
            ``MyFile.checkFileExists`` reports that it does not exist.
    """
    textPath = self.tempFilePath

    # Guard clause: the None check short-circuits before the existence check.
    if textPath is None or not MyFile.checkFileExists(textPath):
        raise Exception("Temporary text file does not exist!")

    self.sentencesList = Ioread().readFileContentList(textPath)
Example #2
0
    def loadTextFile(self):
        """Populate ``self.sentencesList`` from the converted text file.

        Reads ``self.tempFilePath`` through ``Ioread.readFileContentList``.

        Raises:
            Exception: when ``self.tempFilePath`` is ``None`` or
                ``MyFile.checkFileExists`` returns a falsy value for it.
        """
        sourcePath = self.tempFilePath

        # Evaluated left to right, so a None path never reaches the file check.
        fileIsMissing = (sourcePath is None
                         or not MyFile.checkFileExists(sourcePath))
        if fileIsMissing:
            raise Exception("Temporary text file does not exist!")

        reader = Ioread()
        self.sentencesList = reader.readFileContentList(sourcePath)
Example #3
0
class TestIoread(unittest.TestCase):
    """Unit tests for the ``Ioread`` file helpers.

    The fixtures are a UTF-8 text file and a UTF-8 CSV file located under
    ``scriptsDir``; the expected values below are what the tests assert
    about their content.
    """

    logger = logging.getLogger("Asrt.TestIoread")
    testFile = scriptsDir + "/resources/ioread_utf8.txt"
    testFileCSV = scriptsDir + "/resources/ioread_utf8.csv"

    # [0]: expected full content of testFile.
    # [1]: expected third entry of testFile read as a list.
    testsString = [
        """Utf-8 test\nLatin characters é à ä\nNon latin characters 镕\n""",
        """Non latin characters 镕"""
    ]

    # Expected result of reading testFileCSV: one row with three fields.
    testList = [[
        'Utf-8 test', 'Latin characters é à ä', 'Non latin characters 镕'
    ]]

    def setUp(self):
        """Create a fresh ``Ioread`` instance for every test."""
        self.ioread = Ioread()

    ############
    # Tests
    #
    def testOpenFile(self):
        """Opening and closing the text fixture must not raise."""
        try:
            fd = self.ioread.openFile(self.testFile)
            self.ioread.closeFile(fd)
        except Exception as e:
            # Include the exception so the failure report says what went wrong.
            self.fail("testOpenFile raised an exception unexpectedly: %r" % (e,))

    def testReadFileContent(self):
        """The whole text fixture is returned as a single string."""
        strContent = self.ioread.readFileContent(self.testFile)
        self.assertEqual(self.testsString[0], strContent)

    def testReadFileContentList(self):
        """The text fixture is returned as a list of three entries."""
        strContentList = self.ioread.readFileContentList(self.testFile)
        self.assertEqual(3, len(strContentList))
        self.assertEqual(self.testsString[1], strContentList[2])

    def testReadCSV(self):
        """The CSV fixture is returned as a single row of three fields."""
        strContentList = self.ioread.readCSV(self.testFileCSV)
        self.assertEqual(1, len(strContentList))
        self.assertEqual(strContentList, self.testList)

    def testWriteFileContent(self):
        """Content written to a temporary file reads back unchanged."""
        strContent = self.testsString[0]
        outputPath = TEMPDIRUNITTEST + "/test.txt"
        self.ioread.writeFileContent(outputPath, strContent)

        # Read back the file that was just written; reading the fixture
        # instead would pass even if writeFileContent wrote nothing.
        readStrContent = self.ioread.readFileContent(outputPath)
        self.assertEqual(strContent, readStrContent)
    # NOTE(review): the enclosing function header is not visible in this
    # excerpt. The free names used below (outputDir, regexFile, language,
    # inputList, ...) are presumably set earlier, e.g. from parsed
    # command-line options -- confirm against the full file.
    # Configure logging at INFO level; the second argument is presumably the
    # log file path inside the output directory -- confirm.
    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    # API setup: forward each option to the DataPreparationAPI instance.
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    # The flag is inverted: NLTK segmentation is requested unless rawSeg is set.
    api.setSegmentWithNLTK(not rawSeg)
    api.setKeepNewWords(keepNewWords)

    # The classifier is trained only for language code 0 (presumably the
    # "unknown language" code -- TODO confirm).
    if language == 0:
        api.trainClassifier()

    # Main processing
    # NOTE(review): the return value is discarded, so this presumably creates
    # the directory or raises -- confirm. It also runs after setupLogging was
    # already handed a path inside outputDir.
    MyFile.checkDirExists(outputDir)

    io = Ioread()
    # inputList is rebound here: it goes in as the argument to
    # readFileContentList and comes out as the entries that were read
    # (presumably one input file path per entry -- confirm).
    inputList = io.readFileContentList(inputList)

    # NOTE(review): the index i is not used in the loop body.
    for i, f in enumerate(inputList):
        api.setInputFile(f)
        api.prepareDocument(language)
        strUnformatted = api.getCleanedText()

        # Output path: <outputDir>/<input base name without extension>.lab
        outputFile = "%s/%s.lab" % (outputDir, os.path.splitext(os.path.basename(f))[0])
        # The cleaned text is written followed by one newline.
        io.writeFileContent(outputFile, strUnformatted + u"\n")
    # NOTE(review): the enclosing function header is not visible in this
    # excerpt. The free names used below (outputDir, regexFile, language,
    # inputList, ...) are presumably set earlier, e.g. from parsed
    # command-line options -- confirm against the full file.
    # Configure logging at INFO level; the second argument is presumably the
    # log file path inside the output directory -- confirm.
    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    # API setup: forward each option to the DataPreparationAPI instance.
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    # The flag is inverted: NLTK segmentation is requested unless rawSeg is set.
    api.setSegmentWithNLTK(not rawSeg)
    api.setExpandNumberInWords(expandNumberInWords)

    # The classifier is trained only for language code 0 (presumably the
    # "unknown language" code -- TODO confirm).
    if language == 0:
        api.trainClassifier()

    # Main processing
    # NOTE(review): the return value is discarded, so this presumably creates
    # the directory or raises -- confirm. It also runs after setupLogging was
    # already handed a path inside outputDir.
    MyFile.checkDirExists(outputDir)

    io = Ioread()
    # inputList is rebound here: it goes in as the argument to
    # readFileContentList and comes out as the entries that were read
    # (presumably one input file path per entry -- confirm).
    inputList = io.readFileContentList(inputList)

    # NOTE(review): the index i is not used in the loop body.
    for i, f in enumerate(inputList):
        api.setInputFile(f)
        api.prepareDocument(language)
        strUnformatted = api.getCleanedText()

        # Output path: <outputDir>/<input base name without extension>.lab
        outputFile = "%s/%s.lab" % (outputDir,
                                    os.path.splitext(os.path.basename(f))[0])
        # The cleaned text is written followed by one newline.
        io.writeFileContent(outputFile, strUnformatted + "\n")