def loadTextFile(self):
    """Read the converted text file into 'self.sentencesList'.

       The file is located through 'self.tempFilePath'.

       raise Exception when no temporary file path is set or when
             MyFile.checkFileExists() does not find the file
    """
    tempPath = self.tempFilePath
    tempFileFound = tempPath is not None and MyFile.checkFileExists(tempPath)

    if not tempFileFound:
        raise Exception("Temporary text file does not exist!")

    self.sentencesList = Ioread().readFileContentList(tempPath)
def loadTextFile(self):
    """Read the converted text file into 'self.sentencesList'.

       The file is located through 'self.tempFilePath'.

       raise Exception when no temporary file path is set or when
             MyFile.checkFileExists() does not find the file
    """
    tempPath = self.tempFilePath
    tempFileFound = tempPath is not None and MyFile.checkFileExists(tempPath)

    if not tempFileFound:
        raise Exception("Temporary text file does not exist!")

    self.sentencesList = Ioread().readFileContentList(tempPath)
def text2text(sourcePath, destinationPath, logDir):
    """Copy the text file 'sourcePath' into 'destinationPath'.

       param sourcePath: the text file to read
       param destinationPath: the file receiving the content
       param logDir: not used by this function
    """
    TextRepresentation.logger.info("Copying txt file: " + sourcePath + " into text.")

    textIo = Ioread()
    #Content is written back as utf8
    textIo.writeFileContent(destinationPath, textIo.readFileContent(sourcePath))
def _loadTextDocumentAsSentences(self, filePath):
    """Read 'filePath' with Ioread.nltkRead() and hand the resulting
       text to 'self._loadAsSentences' for segmentation.

       param filePath: the text document to load
    """
    #The whole text file as a single string
    wholeText = Ioread().nltkRead(filePath)
    self._loadAsSentences(wholeText)
def countPresentFile(self, input_file):
    """Count the presence of all the punctuation in the punctuation
       model present in the file named [input_file]

       input_file      Input file to be used as source
       return          dictionary containing key and count, or an
                       empty dictionary when the file could not be
                       processed
    """
    try:
        io = Ioread()
        file_content = io.readFileContent(input_file)
        return self.countPresenceText(file_content)
    except Exception as e:
        #Best effort: report the failure on stderr and return no counts
        print("exception<" + input_file + "> : " + str(e), file=sys.stderr)
        return {}
def countPresentFile(self, input_file):
    """Count, for the content of the file [input_file], the presence
       of the punctuation handled by 'self.countPresenceText'.

       input_file      Input file to be used as source
       return          the result of 'self.countPresenceText', or an
                       empty dictionary when anything goes wrong
    """
    try:
        text = Ioread().readFileContent(input_file)
        counts = self.countPresenceText(text)
    except Exception as error:
        #Failures are only reported, never propagated
        print("exception<" + input_file + "> : " + str(error))
        return {}
    return counts
def dumpAttributeContent(self, attributeName, outputFileName):
    """Write to disk the content of 'attributeName'.

       param attributeName: name passed to 'self.getAttribute'
       param outputFileName: the file receiving the attribute content
       return True when the content was written, False when the
              attribute value is None
    """
    attributeContent = self.getAttribute(attributeName)

    #Identity test: only a missing value (None) skips the write
    if attributeContent is None:
        return False

    writer = Ioread()
    writer.writeFileContent(outputFileName, attributeContent)
    return True
def outputPerLanguage(sentencesDict, outputDir):
    """Write the sentences of each language into its own file.

       Each non empty list of sentences is written, one sentence per
       line, to '<outputDir>/sentences_<language>.txt'. Languages
       without sentences are only logged.

       param sentencesDict: maps a language to its list of sentences
       param outputDir: the directory receiving the files
    """
    writer = Ioread()

    for language, sentences in sentencesDict.items():
        if len(sentences) == 0:
            DataPreparationAPI.logger.info("No sentences found for: %s" % language)
            continue

        DataPreparationAPI.logger.info("%d sentences found for: %s" % (len(sentences), language))

        #Trailing white spaces are replaced by a single new line
        content = "\n".join(sentences).rstrip() + "\n"
        languagePath = "%s/sentences_%s.txt" % (outputDir, language)

        DataPreparationAPI.logger.info("Writing content to: %s" % languagePath)
        writer.writeFileContent(languagePath, content)
class AsrtSubprocess():
    """An utility class to group methods.
    """
    logger = logging.getLogger("Asrt.AsrtSubprocess")

    @staticmethod
    def execute(commandList, logPath, outFileName=None, errFileName=None):
        """Wrapper to execute a sub process.

           The command is started exactly once. Its standard error is
           merged into its standard output, unless 'errFileName' is
           given, in which case both streams are captured separately.

           param commandList: the command and its arguments
           param logPath: the directory receiving the log files
           param outFileName: file name for the captured standard
                              output, None to skip writing it
           param errFileName: file name for the captured standard
                              error, None to skip writing it
           return a tuple (return code, stdout, stderr), the return
                  code being 1 when running the command raised
        """
        #Make sure the directory exists
        MyFile.checkDirExists(logPath)

        stdout, stderr, retCode = None, None, 0
        try:
            #Default to one log, a separate error stream is only
            #needed when an error file is requested
            stderrTarget = subprocess.STDOUT
            if errFileName is not None:
                stderrTarget = subprocess.PIPE

            p = subprocess.Popen(commandList, stdout=subprocess.PIPE,
                                 stderr=stderrTarget)

            #Run the subprocess
            stdout, stderr = p.communicate()
            retCode = p.poll()
        except Exception as e:
            AsrtSubprocess.logger.critical("Subprocess error: %s" % str(e))
            errorMessage = str(commandList) + "\n" + \
                           "------------ Begin stack ------------\n" + \
                           traceback.format_exc().rstrip() + "\n" + \
                           "------------ End stack --------------"
            print(errorMessage)

            #Make sure the trace is logged
            if stderr is None:
                stderr = errorMessage
            else:
                stderr += errorMessage
            retCode = 1

        #Now log results
        #It is important to be ouside exception management as we
        #still want to log what happened
        io = Ioread()

        if stdout is not None and len(stdout) > 0 and outFileName is not None:
            io.writeFileContent("%s/%s" % (logPath, outFileName),
                                AsrtSubprocess._toUnicode(stdout))

        if stderr is not None and len(stderr) > 0 and errFileName is not None:
            io.writeFileContent("%s/%s" % (logPath, errFileName),
                                AsrtSubprocess._toUnicode(stderr))

        return retCode, stdout, stderr

    @staticmethod
    def _toUnicode(data):
        """Decode 'data' from utf-8 when it is a byte string, return
           it unchanged otherwise.
        """
        if isinstance(data, bytes):
            return data.decode('utf-8')
        return data
def applyRegexes(inputFile, outputFile, regularFile):
    """Apply the regular expressions contained in 'regularFile'.

       Each line of 'inputFile' is stripped, processed with the
       regular expressions and written to 'outputFile'.

       params: - inputFile   : a text file in 'utf-8' encoding
               - outputFile  : the result text file in 'utf-8' encoding
               - regularFile : the file containing the regular expressions
                               to apply.
    """
    regexFormula = RegularExpressionFormula(rulesFile=regularFile)
    io = Ioread()
    fd = io.openFile(inputFile)

    count, linesList = 0, []

    try:
        #readline() returns an empty string at the end of the file
        for l in iter(fd.readline, ""):
            l = l.strip()

            #Remove punctuation using regular expressions
            linesList.append(regexFormula.apply(l, FRENCH))

            count += 1
            if count % 50000 == 0:
                print("Processed %d values" % count)
    finally:
        #Release the input file even when a regular expression fails
        io.closeFile(fd)

    strContent = u"\n".join(linesList)
    io.writeFileContent(outputFile, strContent)
def outputPerLanguage(sentencesDict, outputDir):
    """Write the sentences of each language into its own file.

       Each non empty list of sentences is written, one sentence per
       line, to '<outputDir>/sentences_<language>.txt'. Languages
       without sentences are only logged.

       param sentencesDict: maps a language to its list of sentences
       param outputDir: the directory receiving the files
    """
    writer = Ioread()

    for language, sentences in sentencesDict.items():
        if len(sentences) == 0:
            DataPreparationAPI.logger.info("No sentences found for: %s" % language)
            continue

        DataPreparationAPI.logger.info("%d sentences found for: %s" % (len(sentences), language))

        #Trailing white spaces are replaced by a single new line
        content = "\n".join(sentences).rstrip() + "\n"
        languagePath = "%s/sentences_%s.txt" % (outputDir, language)

        DataPreparationAPI.logger.info("Writing content to: %s" % languagePath)
        writer.writeFileContent(languagePath, content)
class TestIoread(unittest.TestCase):
    """Unit tests for the reading and writing methods of Ioread,
       run against utf-8 resource files.
    """
    logger = logging.getLogger("Asrt.TestIoread")
    testFile = scriptsDir + "/resources/ioread_utf8.txt"
    testFileCSV = scriptsDir + "/resources/ioread_utf8.csv"

    #Expected content of 'testFile': whole text and last line
    testsString = [
        """Utf-8 test\nLatin characters é à ä\nNon latin characters 镕\n""",
        """Non latin characters 镕"""
    ]

    #Expected content of 'testFileCSV'
    testList = [[
        'Utf-8 test', 'Latin characters é à ä', 'Non latin characters 镕'
    ]]

    def setUp(self):
        self.ioread = Ioread()

    ############
    # Tests
    #
    def testOpenFile(self):
        try:
            fd = self.ioread.openFile(self.testFile)
            self.ioread.closeFile(fd)
        except Exception:
            self.fail("testOpenFile raised ExceptionType unexpectedled")

    def testReadFileContent(self):
        strContent = self.ioread.readFileContent(self.testFile)
        self.assertEqual(self.testsString[0], strContent)

    def testReadFileContentList(self):
        strContentList = self.ioread.readFileContentList(self.testFile)
        self.assertEqual(3, len(strContentList))
        self.assertEqual(self.testsString[1], strContentList[2])

    def testReadCSV(self):
        strContentList = self.ioread.readCSV(self.testFileCSV)
        self.assertEqual(1, len(strContentList))
        self.assertEqual(strContentList, self.testList)

    def testWriteFileContent(self):
        strContent = self.testsString[0]
        writtenFile = TEMPDIRUNITTEST + "/test.txt"
        self.ioread.writeFileContent(writtenFile, strContent)

        #Read back the file just written, not the reference file,
        #otherwise the write is never verified
        readStrContent = self.ioread.readFileContent(writtenFile)
        self.assertEqual(strContent, readStrContent)
def loadFromFile(regexFile):
    """Load regular expressions from a csv file.

       The file is assumed to be in CSV file format with tabs as
       fields separators and no quotes around fields.

       File format is:
            matching pattern, substitution pattern,
            regex type (substitution = 0, deletion = 1), comments

       param regexFile: a csv file
       return the rows of the file, first row excluded, as filtered
              by RegexList.removeComments()
    """
    RegexList.logger.info("Load regular expression from %s" % regexFile)

    csvRows = Ioread().readCSV(regexFile, '\t')

    #The first row is skipped
    patternList = RegexList.removeComments(csvRows[1:])

    RegexList.logger.info("Done loading regular expressions")
    return patternList
def loadFromFile(regexFile):
    """Load regular expressions from a csv file.

       The file is assumed to be in CSV file format with tabs as
       fields separators and no quotes around fields.

       File format is:
            matching pattern, substitution pattern,
            regex type (substitution = 0, deletion = 1), comments

       param regexFile: a csv file
       return the rows of the file, first row excluded, as filtered
              by RegexList.removeComments()
    """
    RegexList.logger.info("Load regular expression from %s" % regexFile)

    csvRows = Ioread().readCSV(regexFile, '\t')

    #The first row is skipped
    patternList = RegexList.removeComments(csvRows[1:])

    RegexList.logger.info("Done loading regular expressions")
    return patternList
def replaceFile(self, input_file, output_file = None):
    """Replace the punctuation in the file named [input_file]
       and return the result in the file named [output_file]

       input_file      Input file to be used as source
       output_file     Output file (if None return Text
       return          True if success False is failure
                        in case output_file is present
                       output_text if not present, None on failure
    """
    try:
        io = Ioread()
        input_text = io.readFileContent(input_file)

        if output_file:
            #The context manager closes the file even if writing fails
            with open(output_file, "w") as f:
                f.write(self.replaceText(input_text))
            return True
        else:
            return self.replaceText(input_text)
    except Exception as e:
        #Best effort: report the failure on stderr instead of raising
        print("exception<" + input_file + "> : " + str(e), file=sys.stderr)
        if output_file:
            return False
        else:
            return None
def replaceFile(self, input_file, output_file=None):
    """Replace the punctuation in the file named [input_file]
       and return the result in the file named [output_file]

       input_file      Input file to be used as source
       output_file     Output file (if None return Text
       return          True if success False is failure
                        in case output_file is present
                       output_text if not present, None on failure
    """
    try:
        io = Ioread()
        input_text = io.readFileContent(input_file)

        if output_file:
            #The context manager closes the file even if writing fails
            with open(output_file, "w") as f:
                f.write(self.replaceText(input_text))
            return True
        else:
            return self.replaceText(input_text)
    except Exception as e:
        #Best effort: report the failure instead of raising
        print("exception<" + input_file + "> : " + str(e))
        if output_file:
            return False
        else:
            return None
setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt") # Api setup api = DataPreparationAPI(None, outputDir) api.setRegexFile(regexFile) api.setFilterSentences(filterSentences) api.setFilterSentences2ndStage(filterSentences2ndStage) api.setLMModeling(lmModeling) api.setRemovePunctuation(removePunctuation) api.setVerbalizePunctuation(verbalizePunctuation) api.setSegmentWithNLTK(not rawSeg) api.setExpandNumberInWords(expandNumberInWords) if language == 0: api.trainClassifier() # Main processing MyFile.checkDirExists(outputDir) io = Ioread() inputList = io.readFileContentList(inputList) for i, f in enumerate(inputList): api.setInputFile(f) api.prepareDocument(language) strUnformatted = api.getCleanedText() outputFile = "%s/%s.lab" % (outputDir, os.path.splitext(os.path.basename(f))[0]) io.writeFileContent(outputFile, strUnformatted + "\n")
def setUp(self):
    """Bind a new Ioread instance to 'self.ioread'.
    """
    self.ioread = Ioread()
setupLogging(logging.INFO, outputDir + "/data_preparation_log.txt") #Api setup api = DataPreparationAPI(None, outputDir) api.setRegexFile(regexFile) api.setFilterSentences(filterSentences) api.setFilterSentences2ndStage(filterSentences2ndStage) api.setLMModeling(lmModeling) api.setRemovePunctuation(removePunctuation) api.setVerbalizePunctuation(verbalizePunctuation) api.setSegmentWithNLTK(not rawSeg) api.setKeepNewWords(keepNewWords) if language == 0: api.trainClassifier() #Main processing MyFile.checkDirExists(outputDir) io = Ioread() inputList = io.readFileContentList(inputList) for i, f in enumerate(inputList): api.setInputFile(f) api.prepareDocument(language) strUnformatted = api.getCleanedText() outputFile = "%s/%s.lab" % (outputDir, os.path.splitext(os.path.basename(f))[0]) io.writeFileContent(outputFile, strUnformatted + u"\n")
def getTestList(self, strFileName):
    """Read 'strFileName' as a tab delimited CSV file.

       param strFileName: the CSV file to read
       return the value returned by Ioread.readCSV()
    """
    return Ioread().readCSV(strFileName, delim='\t')