def loadTextFile(self):
        """Read the converted text file into 'self.sentencesList'.

           Raises an Exception when 'self.tempFilePath' is not set or
           when 'MyFile.checkFileExists' does not find the file.
        """
        textPath = self.tempFilePath
        fileMissing = textPath is None or \
                      not MyFile.checkFileExists(textPath)

        if fileMissing:
            raise Exception("Temporary text file does not exist!")

        self.sentencesList = Ioread().readFileContentList(textPath)
Esempio n. 2
0
    def loadTextFile(self):
        """Fill 'self.sentencesList' with the content of the
           temporary text file.

           An Exception is raised when 'self.tempFilePath' is None
           or when 'MyFile.checkFileExists' reports it as missing.
        """
        sourcePath = self.tempFilePath

        #The existence check is only done when a path is set
        available = sourcePath is not None and \
                    MyFile.checkFileExists(sourcePath)

        if not available:
            raise Exception("Temporary text file does not exist!")

        reader = Ioread()
        self.sentencesList = reader.readFileContentList(sourcePath)
    def text2text(sourcePath, destinationPath, logDir):
        """Copy the text file 'sourcePath' into 'destinationPath'.

           The content is read and written back through Ioread.
           'logDir' is accepted but not used here.
        """
        message = "Copying txt file: " + sourcePath + " into text."
        TextRepresentation.logger.info(message)

        ioHandler = Ioread()

        #Presumably written as utf-8 by Ioread -- TODO confirm
        ioHandler.writeFileContent(destinationPath,
                                   ioHandler.readFileContent(sourcePath))
Esempio n. 4
0
    def text2text(sourcePath, destinationPath, logDir):
        """Duplicate the text file 'sourcePath' as 'destinationPath'.

           'logDir' is part of the signature but is not used.
        """
        logger = TextRepresentation.logger
        logger.info("Copying txt file: " + sourcePath + " into text.")

        textIo = Ioread()
        copiedContent = textIo.readFileContent(sourcePath)

        #Encoding is left to Ioread (presumably utf-8 -- TODO confirm)
        textIo.writeFileContent(destinationPath, copiedContent)
Esempio n. 5
0
    def _loadTextDocumentAsSentences(self, filePath):
        """Read the text document 'filePath' and pass its
           content to 'self._loadAsSentences'.

           NOTE(review): sentence segmentation with NLTK and the
           removal of initial new lines are presumably done by the
           called helpers, they are not visible here -- confirm.
        """
        #Whole document as returned by 'Ioread.nltkRead'
        #(presumably a single utf-8 string -- TODO confirm)
        documentText = Ioread().nltkRead(filePath)

        self._loadAsSentences(documentText)
Esempio n. 6
0
    def _loadTextDocumentAsSentences(self, filePath):
        """Load the text document 'filePath' and forward it
           to 'self._loadAsSentences'.

           NOTE(review): the NLTK segmentation and the removal of
           initial new lines presumably happen inside the helpers
           used below -- confirm.
        """
        reader = Ioread()

        #The complete file content is read in one go
        wholeText = reader.nltkRead(filePath)
        self._loadAsSentences(wholeText)
Esempio n. 7
0
 def countPresentFile(self, input_file):
     """Count the presence of all the punctuation in the punctuation model
         present in the file named [input_file]
             input_file              Input file to be used as source
             return                  dictionary containing key and count,
                                     an empty dictionary when reading
                                     or counting raised an exception
     """
     try:
         io = Ioread()
         file_content = io.readFileContent(input_file)
         return self.countPresenceText(file_content)
     except Exception as e:
         #'as' is accepted by Python 2.6+ and Python 3, the former
         #'except Exception, e' is a syntax error on Python 3

         #Best effort: report the problem on stderr, return no count
         print("exception<" + input_file + "> : " + str(e), file=sys.stderr)
         return {}
Esempio n. 8
0
 def countPresentFile(self, input_file):
     """Count the punctuation found in the file named [input_file]
         using 'self.countPresenceText'
             input_file              Input file to be used as source
             return                  what 'countPresenceText' returns,
                                     an empty dictionary when an
                                     exception was raised
     """
     counts = {}

     try:
         reader = Ioread()
         text = reader.readFileContent(input_file)
         counts = self.countPresenceText(text)
     except Exception as error:
         #Failure is only reported, an empty result is returned
         print("exception<" + input_file + "> : " + str(error))

     return counts
Esempio n. 9
0
    def dumpAttributeContent(self, attributeName, outputFileName):
        """Write to disk the content of 'attributeName'

           param attributeName : name given to 'self.getAttribute'
           param outputFileName: path of the file to write
           return True when the content was written, False when
                  the attribute content is None
        """
        attributeContent = self.getAttribute(attributeName)

        #Identity test, '== None' would go through a custom '__eq__'
        if attributeContent is None:
            return False

        #Named 'io' as 'file' shadows a Python 2 builtin
        io = Ioread()
        io.writeFileContent(outputFileName, attributeContent)
        return True
Esempio n. 10
0
 def outputPerLanguage(sentencesDict, outputDir):
     """Write one 'sentences_<language>.txt' file under
        'outputDir' for each language of 'sentencesDict'
        having at least one sentence.

        Languages without sentences are only logged.
     """
     writer = Ioread()
     log = DataPreparationAPI.logger

     for language, sentences in sentencesDict.items():
         sentenceCount = len(sentences)

         #Nothing to write for this language
         if sentenceCount <= 0:
             log.info("No sentences found for: %s" % language)
             continue

         log.info("%d sentences found for: %s" % (sentenceCount, language))

         #One sentence per line, ending with a single new line
         content = "\n".join(sentences).rstrip() + "\n"
         outputPath = "%s/sentences_%s.txt" % (outputDir, language)

         log.info("Writing content to: %s" % outputPath)
         writer.writeFileContent(outputPath, content)
Esempio n. 11
0
class AsrtSubprocess():
    """An utility class to group methods running sub processes.
    """
    logger = logging.getLogger("Asrt.AsrtSubprocess")

    @staticmethod
    def _toText(data):
        """Return 'data' as text, byte strings are decoded as utf-8.
        """
        if isinstance(data, bytes):
            return data.decode('utf-8')
        return data

    @staticmethod
    def execute(commandList, logPath, outFileName=None, errFileName=None):
        """Wrapper to execute a sub process and log its outputs.

           param commandList: the command and its arguments
           param logPath    : directory receiving the log files
           param outFileName: name, under 'logPath', of the file
                              receiving the standard output
           param errFileName: name, under 'logPath', of the file
                              receiving the standard error. When
                              None, standard error is merged into
                              standard output
           return a tuple (return code, stdout, stderr). When an
                  exception is raised while running the command,
                  the return code is 1 and stderr contains the
                  command and the stack trace
        """
        #Make sure the directory exists
        MyFile.checkDirExists(logPath)

        stdout, stderr, retCode = None, None, 0

        #Default to one log, a separate pipe is only used when
        #an error file is requested
        stderrTarget = subprocess.STDOUT
        if errFileName is not None:
            stderrTarget = subprocess.PIPE

        try:
            #The command is spawned exactly once. It was formerly
            #started a second time when 'errFileName' was given,
            #leaving the first process running and never waited for
            p = subprocess.Popen(commandList,
                                 stdout=subprocess.PIPE,
                                 stderr=stderrTarget)

            #Run the subprocess
            stdout, stderr = p.communicate()
            retCode = p.poll()
        except Exception as e:
            AsrtSubprocess.logger.critical("Subprocess error: %s" % str(e))
            errorMessage = str(commandList) + "\n" + \
                           "------------ Begin stack ------------\n" + \
                           traceback.format_exc().rstrip() + "\n" + \
                           "------------ End stack --------------"
            print(errorMessage)

            #Make sure the trace is logged
            if stderr is None:
                stderr = errorMessage
            else:
                stderr += errorMessage

            retCode = 1

        #Now log results
        #It is important to be outside exception management as we
        #still want to log what happened
        io = Ioread()

        if stdout is not None and len(stdout) > 0 and outFileName is not None:
            io.writeFileContent("%s/%s" % (logPath, outFileName),
                                AsrtSubprocess._toText(stdout))

        if stderr is not None and len(stderr) > 0 and errFileName is not None:
            io.writeFileContent("%s/%s" % (logPath, errFileName),
                                AsrtSubprocess._toText(stderr))

        return retCode, stdout, stderr
Esempio n. 12
0
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       Each line of 'inputFile' is stripped, processed with the
       regular expressions and the results are written, one per
       line, into 'outputFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    count, linesList = 0, []

    try:
        #Read first line
        line = fd.readline()

        #An empty string means end of file
        while line != "":
            #'strip' alone removes leading and trailing whitespaces
            line = line.strip()

            #Apply the regular expressions to the stripped line
            linesList.append(regexFormula.apply(line, FRENCH))

            count += 1
            if count % 50000 == 0:
                #Function call form is valid for Python 2 and 3
                print("Processed %d values" % count)

            #Read next line
            line = fd.readline()
    finally:
        #Release the input file even when a line fails
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
Esempio n. 13
0
 def outputPerLanguage(sentencesDict, outputDir):
     """Output the sentences of each language of 'sentencesDict'
        into a 'sentences_<language>.txt' file of 'outputDir'.

        A language without sentences only gets a log entry.
     """
     io = Ioread()
     info = DataPreparationAPI.logger.info

     for language, sentences in sentencesDict.items():
         found = len(sentences)

         if not found > 0:
             info("No sentences found for: %s" % language)
             continue

         info("%d sentences found for: %s" % (found, language))

         #Trailing whitespaces are replaced by one new line
         joined = "\n".join(sentences)
         strContent = joined.rstrip() + "\n"

         outputPath = "%s/sentences_%s.txt" % (outputDir, language)
         info("Writing content to: %s" % outputPath)
         io.writeFileContent(outputPath, strContent)
Esempio n. 14
0
class TestIoread(unittest.TestCase):
    """Unit tests for the Ioread reading and writing methods.
    """
    logger = logging.getLogger("Asrt.TestIoread")

    # Reference resources read by the tests
    testFile = scriptsDir + "/resources/ioread_utf8.txt"
    testFileCSV = scriptsDir + "/resources/ioread_utf8.csv"

    # Expected whole content and expected last line of 'testFile'
    testsString = [
        """Utf-8 test\nLatin characters é à ä\nNon latin characters 镕\n""",
        """Non latin characters 镕"""
    ]

    # Expected rows of 'testFileCSV'
    testList = [[
        'Utf-8 test', 'Latin characters é à ä', 'Non latin characters 镕'
    ]]

    def setUp(self):
        self.ioread = Ioread()

    ############
    # Tests
    #
    def testOpenFile(self):
        try:
            fd = self.ioread.openFile(self.testFile)
            self.ioread.closeFile(fd)
        except Exception:
            self.fail("testOpenFile raised ExceptionType unexpectedled")

    def testReadFileContent(self):
        strContent = self.ioread.readFileContent(self.testFile)
        self.assertEqual(self.testsString[0], strContent)

    def testReadFileContentList(self):
        strContentList = self.ioread.readFileContentList(self.testFile)
        self.assertEqual(3, len(strContentList))
        self.assertEqual(self.testsString[1], strContentList[2])

    def testReadCSV(self):
        strContentList = self.ioread.readCSV(self.testFileCSV)
        self.assertEqual(1, len(strContentList))
        self.assertEqual(strContentList, self.testList)

    def testWriteFileContent(self):
        strContent = self.testsString[0]
        writtenFile = TEMPDIRUNITTEST + "/test.txt"
        self.ioread.writeFileContent(writtenFile, strContent)

        # Read back the file just written: reading the reference
        # resource instead would not verify the write at all
        readStrContent = self.ioread.readFileContent(writtenFile)
        self.assertEqual(strContent, readStrContent)
Esempio n. 15
0
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       Every line of 'inputFile' is stripped and processed, the
       resulting lines are joined with new lines and written
       into 'outputFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)

    io = Ioread()
    fd = io.openFile(inputFile)

    count, linesList = 0, []

    try:
        #Read first line
        currentLine = fd.readline()

        #'readline' returns an empty string at end of file
        while currentLine != "":
            #A single 'strip' covers both ends of the line
            currentLine = currentLine.strip()

            #Apply the regular expressions to the stripped line
            linesList.append(regexFormula.apply(currentLine, FRENCH))

            count += 1
            if count % 50000 == 0:
                #Parenthesized form works on Python 2 and 3
                print("Processed %d values" % count)

            #Read next line
            currentLine = fd.readline()
    finally:
        #Always close the input file, also when processing fails
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
Esempio n. 16
0
    def loadFromFile(regexFile):
        """Load the regular expressions of a csv file.

           The file is read with tabs as fields separators.
           Expected file format, as documented by the authors:
                matching pattern, substitution pattern,
                    regex type (substitution = 0, deletion = 1), comments

           param regexFile: a csv file
           return the result of 'RegexList.removeComments' applied
                  to all rows but the first one
        """
        log = RegexList.logger
        log.info("Load regular expression from %s" % regexFile)

        rows = Ioread().readCSV(regexFile, '\t')

        #First row is skipped (presumably a header -- TODO confirm)
        patterns = RegexList.removeComments(rows[1:])
        log.info("Done loading regular expressions")

        return patterns
    def loadFromFile(regexFile):
        """Read regular expressions from the csv file 'regexFile'.

           Fields are separated by tabs. File format, as documented
           by the authors:
                matching pattern, substitution pattern,
                    regex type (substitution = 0, deletion = 1), comments

           param regexFile: a csv file
           return what 'RegexList.removeComments' gives for the
                  rows following the first one
        """
        RegexList.logger.info("Load regular expression from %s" % regexFile)

        csvReader = Ioread()
        allRows = csvReader.readCSV(regexFile, '\t')

        #Row 0 is left out (presumably column titles -- TODO confirm)
        dataRows = allRows[1:]
        loadedPatterns = RegexList.removeComments(dataRows)

        RegexList.logger.info("Done loading regular expressions")
        return loadedPatterns
Esempio n. 18
0
 def replaceFile(self, input_file, output_file=None):
     """Replace the punctuation of the file named [input_file] using
         'self.replaceText'
             input_file              Input file to be used as source
             output_file             Output file, when not given the
                                         replaced text is returned
             return                  With output_file: True on success,
                                         False on failure
                                     Without output_file: the replaced
                                         text, None on failure
     """
     try:
         io = Ioread()
         input_text = io.readFileContent(input_file)

         #Compute the result first, a failure then does not leave
         #an empty output file behind
         output_text = self.replaceText(input_text)

         if not output_file:
             return output_text

         #The context manager closes the file, it was left open before
         with open(output_file, "w") as f:
             f.write(output_text)
         return True
     except Exception as e:
         #'as' is accepted by Python 2.6+ and Python 3, the former
         #'except Exception, e' is a syntax error on Python 3
         print("exception<" + input_file + "> : " + str(e), file=sys.stderr)
         if output_file:
             return False
         else:
             return None
Esempio n. 19
0
 def replaceFile(self, input_file, output_file=None):
     """Replace the punctuation of the file named [input_file] using
         'self.replaceText'
             input_file              Input file to be used as source
             output_file             Output file, when not given the
                                         replaced text is returned
             return                  With output_file: True on success,
                                         False on failure
                                     Without output_file: the replaced
                                         text, None on failure
     """
     try:
         io = Ioread()
         input_text = io.readFileContent(input_file)

         #Replace before opening the output, so that a failure
         #does not truncate an existing output file
         output_text = self.replaceText(input_text)

         if not output_file:
             return output_text

         #'with' closes the file, it was formerly never closed
         with open(output_file, "w") as f:
             f.write(output_text)
         return True
     except Exception as e:
         #Best effort: the failure is printed, not propagated
         print(("exception<" + input_file + "> : " + str(e)))
         if output_file:
             return False
         else:
             return None
    # NOTE(review): body of a function whose header is not visible here

    # Log into a file located in the output directory
    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    # Api setup: every processing option is forwarded to the API
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    api.setSegmentWithNLTK(not rawSeg)
    api.setExpandNumberInWords(expandNumberInWords)

    # The classifier is only trained for language 0
    # NOTE(review): presumably 0 means 'unknown language' -- confirm
    if language == 0:
        api.trainClassifier()

    # Main processing
    MyFile.checkDirExists(outputDir)

    io = Ioread()
    # 'inputList' is rebound to what 'readFileContentList' returns for it
    # (presumably a path replaced by the listed file names -- confirm)
    inputList = io.readFileContentList(inputList)

    # One '.lab' file is written per input document, named after the
    # document base name without its extension
    for i, f in enumerate(inputList):
        api.setInputFile(f)
        api.prepareDocument(language)
        strUnformatted = api.getCleanedText()

        outputFile = "%s/%s.lab" % (outputDir,
                                    os.path.splitext(os.path.basename(f))[0])
        io.writeFileContent(outputFile, strUnformatted + "\n")
Esempio n. 21
0
 def setUp(self):
     """Create the Ioread instance stored in 'self.ioread'.
     """
     self.ioread = Ioread()
    #NOTE(review): body of a function whose header is not visible here

    #Log into a file located in the output directory
    setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt")

    #Api setup: every processing option is forwarded to the API
    api = DataPreparationAPI(None, outputDir)
    api.setRegexFile(regexFile)
    api.setFilterSentences(filterSentences)
    api.setFilterSentences2ndStage(filterSentences2ndStage)
    api.setLMModeling(lmModeling)
    api.setRemovePunctuation(removePunctuation)
    api.setVerbalizePunctuation(verbalizePunctuation)
    api.setSegmentWithNLTK(not rawSeg)
    api.setKeepNewWords(keepNewWords)

    #The classifier is only trained for language 0
    #NOTE(review): presumably 0 means 'unknown language' -- confirm
    if language == 0:
        api.trainClassifier()

    #Main processing
    MyFile.checkDirExists(outputDir)

    io = Ioread()
    #'inputList' is rebound to what 'readFileContentList' returns for it
    #(presumably a path replaced by the listed file names -- confirm)
    inputList = io.readFileContentList(inputList)

    #One '.lab' file is written per input document, named after the
    #document base name without its extension
    for i, f in enumerate(inputList):
        api.setInputFile(f)
        api.prepareDocument(language)
        strUnformatted = api.getCleanedText()

        outputFile = "%s/%s.lab" % (outputDir, os.path.splitext(os.path.basename(f))[0])
        io.writeFileContent(outputFile, strUnformatted + u"\n")
 def getTestList(self, strFileName):
     """Return the content of the tab separated file
        'strFileName' as read by 'Ioread.readCSV'.
     """
     return Ioread().readCSV(strFileName, delim='\t')
 def getTestList(self, strFileName):
     """Read 'strFileName' with 'Ioread.readCSV', using
        tabs as delimiter, and return the result.
     """
     csvReader = Ioread()
     rows = csvReader.readCSV(strFileName, delim='\t')
     return rows