Exemple #1
0
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       Every line of 'inputFile' is stripped of its surrounding
       whitespace and passed to the rules together with the FRENCH
       constant. The results are joined with newlines and written
       to 'outputFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    linesList = []

    try:
        #readline() yields an empty string once the end of file is reached
        for count, line in enumerate(iter(fd.readline, ""), 1):
            #Remove punctuation using regular expressions
            linesList.append(regexFormula.apply(line.strip(), FRENCH))

            #Progress report; this call form is valid in Python 2 and 3
            if count % 50000 == 0:
                print("Processed %d values" % count)
    finally:
        #Release the input file even when applying a rule fails
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       The input file is read line by line. Each line is stripped of
       leading and trailing whitespace, transformed by the rules (called
       with the FRENCH constant) and collected. The collected lines are
       joined with newlines and written to 'outputFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    count, linesList = 0, []

    try:
        #Read first line
        l = fd.readline()

        #An empty string marks the end of the file
        while l != "":
            #Remove punctuation using regular expressions
            linesList.append(regexFormula.apply(l.strip(), FRENCH))

            count += 1
            if count % 50000 == 0:
                #Parenthesised form works with Python 2 and Python 3
                print("Processed %d values" % count)

            #Read next line
            l = fd.readline()
    finally:
        #Always close the input file, including on error
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
    def text2text(sourcePath, destinationPath, logDir):
        """Copy the content of 'sourcePath' into 'destinationPath'.

           The content is read and written back through 'Ioread'.
           'logDir' is accepted but not used by this function.
        """
        logMessage = "Copying txt file: " + sourcePath + " into text."
        TextRepresentation.logger.info(logMessage)

        #Read the whole source, then hand it straight to the writer
        fileHelper = Ioread()
        fileHelper.writeFileContent(destinationPath,
                                    fileHelper.readFileContent(sourcePath))
    def text2text(sourcePath, destinationPath, logDir):
        """Duplicate the text file 'sourcePath' as 'destinationPath'.

           params: - sourcePath      : path of the file to read
                   - destinationPath : path of the file to write
                   - logDir          : not used here
        """
        TextRepresentation.logger.info(
            "Copying txt file: " + sourcePath + " into text.")

        textIo = Ioread()

        #Load the full content of the source file
        content = textIo.readFileContent(sourcePath)

        #Store it unchanged at the destination
        textIo.writeFileContent(destinationPath, content)
Exemple #5
0
    def execute(commandList, logPath, outFileName=None, errFileName=None):
        """Wrapper to execute a sub process.

           The command is started exactly once. Its standard error is
           merged into its standard output unless 'errFileName' is
           given, in which case both streams are captured separately.

           params: - commandList : the command and its arguments, handed
                                   to 'subprocess.Popen'
                   - logPath     : directory where the log files are written
                   - outFileName : optional file name, inside 'logPath',
                                   receiving the captured standard output
                   - errFileName : optional file name, inside 'logPath',
                                   receiving the captured standard error

           return a tuple (retCode, stdout, stderr). 'retCode' is the
                  return code of the process, or 1 when running it raised
                  an exception; in that case 'stderr' contains the stack
                  trace as text.
        """
        def toText(data):
            #Pipes deliver bytes whereas the stack trace is already text
            if isinstance(data, bytes):
                return data.decode('utf-8')
            return data

        #Make sure the directory exists
        MyFile.checkDirExists(logPath)

        stdout, stderr, retCode = None, None, 0

        #Default to one log: stderr joins stdout unless a separate
        #error file was requested
        stderrTarget = subprocess.STDOUT
        if errFileName is not None:
            stderrTarget = subprocess.PIPE

        try:
            #Launch the command a single time; starting it twice would
            #run it twice and leave the first process unattended
            p = subprocess.Popen(commandList,
                                 stdout=subprocess.PIPE,
                                 stderr=stderrTarget)

            #Run the subprocess
            stdout, stderr = p.communicate()
            retCode = p.poll()
        except Exception as e:
            AsrtSubprocess.logger.critical("Subprocess error: %s" % str(e))
            errorMessage = str(commandList) + "\n" + \
                           "------------ Begin stack ------------\n" + \
                           traceback.format_exc().rstrip() + "\n" + \
                           "------------ End stack --------------"
            print(errorMessage)

            #Make sure the trace is logged
            if stderr is None:
                stderr = errorMessage
            else:
                #Captured output is bytes, convert before appending text
                stderr = toText(stderr) + errorMessage

            retCode = 1

        #Now log results
        #It is important to be outside exception management as we
        #still want to log what happened
        io = Ioread()

        if stdout and outFileName is not None:
            io.writeFileContent("%s/%s" % (logPath, outFileName),
                                toText(stdout))

        if stderr and errFileName is not None:
            io.writeFileContent("%s/%s" % (logPath, errFileName),
                                toText(stderr))

        return retCode, stdout, stderr
Exemple #6
0
    def dumpAttributeContent(self, attributeName, outputFileName):
        """Write to disk the content of 'attributeName'.

           params: - attributeName  : name handed to 'self.getAttribute'
                   - outputFileName : path of the file to write

           return True when the content was written, False when
                  'self.getAttribute' returned None
        """
        attributeContent = self.getAttribute(attributeName)

        #Identity test: only a real None means there is nothing to write
        if attributeContent is None:
            return False

        io = Ioread()
        io.writeFileContent(outputFileName, attributeContent)
        return True
 def outputPerLanguage(sentencesDict, outputDir):
     """Write the sentences of each language into its own file.

        For every language of 'sentencesDict' holding at least one
        sentence, the sentences are joined with newlines and written to
        '<outputDir>/sentences_<language>.txt'. Languages without any
        sentence are only reported in the log.
     """
     writer = Ioread()

     for language, sentences in sentencesDict.items():
         nbSentences = len(sentences)

         #Nothing to write for this language, just report it
         if nbSentences == 0:
             DataPreparationAPI.logger.info("No sentences found for: %s" % language)
             continue

         DataPreparationAPI.logger.info("%d sentences found for: %s" % (nbSentences, language))

         #Trailing whitespace is dropped and a single newline appended
         text = "\n".join(sentences).rstrip() + "\n"
         targetPath = "%s/sentences_%s.txt" % (outputDir, language)

         #Finally output to disk
         DataPreparationAPI.logger.info("Writing content to: %s" % targetPath)
         writer.writeFileContent(targetPath, text)
Exemple #8
0
class TestIoread(unittest.TestCase):
    """Unit tests for the 'Ioread' file helper."""
    logger = logging.getLogger("Asrt.TestIoread")

    # Fixture files read by the tests
    testFile = scriptsDir + "/resources/ioread_utf8.txt"
    testFileCSV = scriptsDir + "/resources/ioread_utf8.csv"

    # Expected content of 'testFile': the whole file, then its third line
    testsString = [
        """Utf-8 test\nLatin characters é à ä\nNon latin characters 镕\n""",
        """Non latin characters 镕"""
    ]

    # Expected rows of 'testFileCSV'
    testList = [[
        'Utf-8 test', 'Latin characters é à ä', 'Non latin characters 镕'
    ]]

    def setUp(self):
        self.ioread = Ioread()

    ############
    # Tests
    #
    def testOpenFile(self):
        """Opening then closing the fixture must not raise."""
        try:
            fd = self.ioread.openFile(self.testFile)
            self.ioread.closeFile(fd)
        except Exception as e:
            self.fail("testOpenFile raised an exception unexpectedly: %s" % e)

    def testReadFileContent(self):
        """The whole fixture is returned as one string."""
        strContent = self.ioread.readFileContent(self.testFile)
        self.assertEqual(self.testsString[0], strContent)

    def testReadFileContentList(self):
        """The fixture is returned as a list of three lines."""
        strContentList = self.ioread.readFileContentList(self.testFile)
        self.assertEqual(3, len(strContentList))
        self.assertEqual(self.testsString[1], strContentList[2])

    def testReadCSV(self):
        """The CSV fixture is returned as a single row of three fields."""
        strContentList = self.ioread.readCSV(self.testFileCSV)
        self.assertEqual(1, len(strContentList))
        self.assertEqual(strContentList, self.testList)

    def testWriteFileContent(self):
        """Content written to disk can be read back unchanged."""
        strContent = self.testsString[0]
        outputPath = TEMPDIRUNITTEST + "/test.txt"
        self.ioread.writeFileContent(outputPath, strContent)

        # Read back the file that was just written, not the fixture,
        # otherwise the write is never actually verified
        readStrContent = self.ioread.readFileContent(outputPath)
        self.assertEqual(strContent, readStrContent)
 def outputPerLanguage(sentencesDict, outputDir):
     """Dump the sentences found for each language to disk.

        'sentencesDict' is iterated as (language, sentences) pairs. A
        language with sentences gets them written, newline separated, to
        'sentences_<language>.txt' inside 'outputDir'; a language with
        none only produces a log entry.
     """
     fileWriter = Ioread()

     for lang, sentenceList in sentencesDict.items():
         if not len(sentenceList) > 0:
             emptyMessage = "No sentences found for: %s" % lang
             DataPreparationAPI.logger.info(emptyMessage)
         else:
             foundMessage = "%d sentences found for: %s" % (
                 len(sentenceList), lang)
             DataPreparationAPI.logger.info(foundMessage)

             #Join, trim trailing whitespace, end with one newline
             joined = "\n".join(sentenceList)
             content = joined.rstrip() + "\n"

             destination = "%s/sentences_%s.txt" % (outputDir, lang)
             writeMessage = "Writing content to: %s" % destination
             DataPreparationAPI.logger.info(writeMessage)

             #Finally output to disk
             fileWriter.writeFileContent(destination, content)
    #Configure logging with the INFO level and a log path located
    #inside 'outputDir'
    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    #Api setup: forward every processing option to the API object
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    #The flag is inverted: 'rawSeg' set means no NLTK segmentation
    api.setSegmentWithNLTK(not rawSeg)
    api.setKeepNewWords(keepNewWords)

    #NOTE(review): language 0 presumably stands for 'unknown language',
    #hence the classifier training -- confirm against DataPreparationAPI
    if language == 0:
        api.trainClassifier()

    #Main processing
    MyFile.checkDirExists(outputDir)

    io = Ioread()
    #'inputList' is rebound to what readFileContentList returns for it;
    #presumably one input file path per entry -- TODO confirm
    inputList = io.readFileContentList(inputList)

    #Prepare each entry in turn and write its cleaned text
    for i, f in enumerate(inputList):
        api.setInputFile(f)
        api.prepareDocument(language)
        strUnformatted = api.getCleanedText()

        #Output goes to 'outputDir', named after the input file with
        #its extension replaced by '.lab'
        outputFile = "%s/%s.lab" % (outputDir, os.path.splitext(os.path.basename(f))[0])
        io.writeFileContent(outputFile, strUnformatted + u"\n")
    # Configure logging with the INFO level and a log path located
    # inside 'outputDir'
    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    # Api setup: forward every processing option to the API object
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    # The flag is inverted: 'rawSeg' set means no NLTK segmentation
    api.setSegmentWithNLTK(not rawSeg)
    api.setExpandNumberInWords(expandNumberInWords)

    # NOTE(review): language 0 presumably stands for 'unknown language',
    # hence the classifier training -- confirm against DataPreparationAPI
    if language == 0:
        api.trainClassifier()

    # Main processing
    MyFile.checkDirExists(outputDir)

    io = Ioread()
    # 'inputList' is rebound to what readFileContentList returns for it;
    # presumably one input file path per entry -- TODO confirm
    inputList = io.readFileContentList(inputList)

    # Prepare each entry in turn and write its cleaned text
    for i, f in enumerate(inputList):
        api.setInputFile(f)
        api.prepareDocument(language)
        strUnformatted = api.getCleanedText()

        # Output goes to 'outputDir', named after the input file with
        # its extension replaced by '.lab'
        outputFile = "%s/%s.lab" % (outputDir,
                                    os.path.splitext(os.path.basename(f))[0])
        io.writeFileContent(outputFile, strUnformatted + "\n")