def trainClassifier(self):
    """Train the underlying classifier.

    The word classifier is created and trained on first use only;
    subsequent calls return immediately.
    """
    if self.wordClassifier != None:
        return
    self.logger.info("Prepare the word classifier ...")
    self.wordClassifier = WordClassifier()
    self.wordClassifier.train()
def classifySentences(self):
    """Classify sentences by language (FRENCH or GERMAN, ITALIAN or ENGLISH).

    A word classifier is built and trained lazily, then applied to
    every text cluster of the document.
    """
    if self.classifier == None:
        #Lazy initialisation: training is done only once
        self.classifier = WordClassifier()
        self.classifier.train()
    classifier = self.classifier
    for cluster in self.listContent:
        cluster.classify(classifier)
class DataPreparationAPI():
    """Import sentences from one file, classifying sentences into languages.

    Drives the whole preparation pipeline of a 'TextDocument':
    loading, control-character cleaning, language classification,
    user regex normalization, filtering and LM preparation.
    """
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.

        param inputFile: path of the document to import (may be None
                         when a text string is supplied instead)
        param outputDir: output directory, also used as the default
                         temporary directory
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        #Empty string when no document has been prepared yet
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        #Empty string when no document has been prepared yet
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set the user regexes (validation and substitution) to be used.

        param regexList: a list of rows of the following form:
            ['matching pattern', 'substitution', 'type', 'language id']
        """
        substitutionList = []
        #Validation rules keep (pattern, language id) pairs; every
        #other row is a substitution rule
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0],row[3]))
            else:
                substitutionList.append((row[0],row[1],row[2],row[3]))
        self.substitutionRegexFormula.setSubstitutionPatternList(substitutionList)

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.
        """
        #Lazy initialisation: the classifier is trained only once
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes from csv file.
        """
        #User did not specify rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
                len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def prepareDocument(self, language = 0):
        """Segment the document into sentences and prepare them.

        param language: an int between 0-4
            - unknown : 0
            - french  : 1
            - german  : 2
            - english : 3
            - italian : 4

        return the prepared 'TextDocument'
        """
        if language> 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            #NOTE(review): five constructor arguments here, while the
            #TextDocument definitions elsewhere in this file take
            #seven -- confirm which revision this call matches
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList,
                                    self.outputDir)

            if self.inputFile != None:
                self.logger.info("Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info("Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, punctuation removal will be
            #done at the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception, e:
            errorMessage = "An error as occurred when importing sentences: %s\n%s" % (str(e), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)
            self.logger.critical(errorMessage)
            raise Exception(e)

        return self.doc
class TextDocument(Document):
    """A text document handled as a list of sentence clusters.

    The document is converted to raw text (pdf input), segmented into
    sentences and stored as text clusters carrying a language id.
    """
    logger = logging.getLogger("Asrt.TextDocument")

    #pdftotext invocation used to extract raw text from a pdf file
    CONVERT_COMMAND = ['pdftotext', '-raw', '-layout', '-enc', 'UTF-8',
                       '-eol', 'unix', '-nopgbrk']

    #Separator used when joining / splitting cluster texts
    MERGECLUSTERSEP = u"\n"

    #'short digit group + period' is escaped before NLTK segmentation
    #and restored afterwards, so list numbering does not split sentences
    DIGITANDDOTREGEX = u"( |^)([0-9]{1,2})[.]( |$)"
    DIGITANDDOTSUB = u"\\g<1>\\g<2>.\\g<3>"
    #Do not put a ; for the character entity, otherwise
    #sentence segmentation is occurring
    #NOTE(review): the character entity (e.g. '&#46') appears to have
    #been lost in extraction -- as written, both substitutions insert a
    #plain dot and DIGITANDENTITYREGEX's unescaped '.' matches any
    #character, making the escape/restore round trip a near no-op.
    #Confirm against the original source.
    DIGITANDENTITYREGEX = u"( |^)([0-9]{1,2}).( |$)"
    DIGITANDENTITYSUB = u"\\g<1>\\g<2>.\\g<3>"

    ########################
    # Default constructor
    #
    def __init__(self, source, languageId, regexSubstitutionFormula,
                 regex_filter_list, logDir, segmentWithNLTK, keepNewWords):
        Document.__init__(self, source)

        self.languageId = languageId
        self.regexSubstitutionFormula = regexSubstitutionFormula
        self.regex_filter_list = regex_filter_list
        self.logDir = logDir
        self.classifier = None
        self.segmentWithNLTK = segmentWithNLTK
        self.keepNewWords = keepNewWords

    ########################
    #Getter and setters
    #
    def setClassifier(self, classifier):
        """Set the language classifier.

        It assumes it has been trained.
        """
        self.classifier = classifier

    def setSentencesLanguage(self, languageId):
        """Language is known.

        param 'languageId': a value between 0-4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4
        """
        for textCluster in self.listContent:
            textCluster.setLanguage(languageId)

    ########################
    #Interface
    #
    def loadDocumentAsSentences(self, tempDir):
        """Convert to text, remove new lines and segment into
        sentences using the NLTK toolkit.
        """
        #Pdf to text
        tempFileName = self.convertToText(self.sourceFileName, tempDir,
                                          self.logDir)
        #Segment into sentences using NLTK toolkit
        self._loadTextDocumentAsSentences(tempFileName)
        #Delete temporary file
        MyFile(tempFileName).removeFile(tempFileName)

    def loadAsSentences(self, strText):
        """Load the given text string as sentences.

        param strText: an utf-8 encoded string
        """
        self._loadAsSentences(strText)

    def getCleanedText(self):
        """Get the cleaned text, one sentence per line."""
        textList = []
        for textCluster in self.listContent:
            textList.append(textCluster.getTextSentence())
        return self.MERGECLUSTERSEP.join(textList)

    def getCleanedTextPerLanguage(self):
        """Get the classified text per language.

        return a dictionary mapping language id to utf-8 text.
        """
        textDict = {}
        for textCluster in self.listContent:
            languageId = textCluster.getLanguageId()
            if languageId not in textDict:
                textDict[languageId] = []
            textDict[languageId].append(textCluster.getTextSentence())
        #One string per language
        resultDict = {}
        for k, textList in textDict.items():
            resultDict[k] = self.MERGECLUSTERSEP.join(textList)
        return resultDict

    def cleanTextSentences(self):
        """Use a set of regex rules to prepare the sentences."""
        self._applyAllClusters('clean')

    def normalizeTextSentences(self):
        """Use a set of regex rules to prepare the sentences.

        First group clusters per language and then apply language
        based normalization.
        """
        #Get cluster per language
        lang2clusterDict = self._getLanguage2ClustersDict()

        bEmpty = True
        #Normalize text per language
        for languageId, clusterList in lang2clusterDict.items():
            #Read all cluster texts
            textList = []
            for textCluster in clusterList:
                textList.append(textCluster.getTextSentence())
            #Join all text
            allText = self.MERGECLUSTERSEP.join(textList)
            #Normalize text
            allText = self.regexSubstitutionFormula.apply(allText, languageId)
            sentencesList = allText.split(self.MERGECLUSTERSEP)
            #Add and set language id; the first pass replaces the
            #previous document content
            self._addSentences(sentencesList, languageId, bEmpty)
            if bEmpty:
                bEmpty = False

    def prepareLM(self):
        """Prepare text sentences for N-Gram modeling."""
        self._applyAllClusters("prepareLM")

    def removeTextPunctuation(self):
        """Remove punctuation symbols."""
        self._applyAllClusters("removeTextPunctuation")

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.

        Currently only implemented for French.
        """
        self._applyAllClusters("verbalizeTextPunctuation")

    def filterTextSentences(self):
        """Filter sentences after cleaning.

        Uses:
            - sentence length
            - number of digit groups
            - user defined rules
        """
        filteredContentList = []
        for textCluster in self.listContent:
            if textCluster.isValid():
                filteredContentList.append(textCluster)
        self.listContent = filteredContentList

    def filterTextSentences2ndStage(self):
        """Filter sentences before LM preparation.

        Remove web addresses and check German orthography,
        https://en.wikipedia.org/wiki/German_orthography .
        """
        filteredContentList = []
        for textCluster in self.listContent:
            if textCluster.isValid2ndStage():
                filteredContentList.append(textCluster)
        self.listContent = filteredContentList

    def classifySentences(self):
        """Classify sentences by language (FRENCH or GERMAN, ITALIAN
        or ENGLISH).
        """
        if self.classifier == None:
            self.classifier = WordClassifier()
            self.classifier.train()
        for textCluster in self.listContent:
            textCluster.classify(self.classifier)

    def display(self):
        """Display document content."""
        for textCluster in self.listContent:
            print(textCluster)

    ########################
    #Implementation
    #
    def _loadTextDocumentAsSentences(self, filePath):
        """Load a text document and segment it into sentences using NLTK.

        Initial new lines are first removed.
        """
        io = Ioread()
        #One string for the whole text file as utf-8 string
        data = io.nltkRead(filePath)
        self._loadAsSentences(data)

    def _loadAsSentences(self, strText):
        """Load the given text as sentences.

        Algorithm is:
            - New lines removal
            - Problematic periods replacement
            - Sentences segmentation with nltk
            - Problematic periods restoration

        param strText: an utf-8 encoded string
        """
        tokenizer_path = FRENCH_PICKLE_FOLDER
        if self.languageId == 2:
            tokenizer_path = GERMAN_PICKLE_FOLDER

        sentences = []
        if self.segmentWithNLTK:
            TextDocument.logger.info("Segment with NLTK")
            #Trim new lines
            strText = self._replaceNewLines(strText)
            #Problematic periods replacement
            strText = self._replaceProblematicPeriods(strText)
            #Nltk segmentation
            sentences = self._segmentIntoSentences(strText, tokenizer_path)
            #Problematic periods restoration
            for i, s in enumerate(sentences):
                sentences[i] = self._replaceProblematicPeriods(s, forward=False)
        else:
            TextDocument.logger.info("Segment with new lines")
            sentences = strText.split(u"\n")

        #Make text clusters with unknown language id
        self._addSentences(sentences)
        TextDocument.logger.info("Loaded %d raw sentences!" % len(sentences))

    def _applyAllClusters(self, method):
        """Apply 'method' to all clusters."""
        for textCluster in self.listContent:
            getattr(textCluster, method)()

    def _replaceNewLines(self, data):
        """Replace new lines by spaces.

        New lines are not considered at the end of a sentence.

        param data: an utf-8 encoded string
        """
        #Last sentence word split into two by a hyphenated line break
        data = re.sub(u"-\\n", u"", data, flags=re.UNICODE)
        return re.sub(u"\\n", u" ", data, flags=re.UNICODE)

    def _replaceProblematicPeriods(self, data, forward=True):
        """Convert dots preceded by a number and followed by a space
        into an html entity.

        If forward is set to False, it will convert from html entity
        to dots.

        This escaping is done in order to prevent segmenting
        sentences on numbers.
        """
        if not forward:
            return re.sub(self.DIGITANDENTITYREGEX, self.DIGITANDDOTSUB,
                          data, flags=re.UNICODE)
        return re.sub(self.DIGITANDDOTREGEX, self.DIGITANDENTITYSUB,
                      data, flags=re.UNICODE)

    def _segmentIntoSentences(self, data, tokenizer_path):
        """Replace current content by sentences.

        The sentences segmentation is done using a pickled
        tokenizer of the NLTK toolkit.

        param data: an utf-8 encoded string
        """
        try:
            #Get the language specific tokenizer
            tokenizer = nltk.data.load(tokenizer_path)
            #The actual job
            sentences = tokenizer.tokenize(data)
        except Exception as e:
            TextDocument.logger.critical("Tokenizer error: " + str(e))
            #Bug fix: previously raised with the non-existent attribute
            #'self.tokenizer_path', which triggered an AttributeError
            #inside the handler and masked the real error
            raise Exception("Tokenizer error: " + tokenizer_path)
        return sentences
class TextDocument(Document):
    """A text document handled as a list of sentence clusters.

    The document is converted to raw text (pdf input), segmented into
    sentences and stored as text clusters carrying a language id.
    """
    logger = logging.getLogger("Asrt.TextDocument")

    #pdftotext invocation used to extract raw text from a pdf file
    CONVERT_COMMAND = ['pdftotext', '-raw', '-layout', '-enc', 'UTF-8',
                       '-eol', 'unix', '-nopgbrk']

    #Separator used when joining / splitting cluster texts
    MERGECLUSTERSEP = u"\n"

    #'short digit group + period' is escaped before NLTK segmentation
    #and restored afterwards, so list numbering does not split sentences
    DIGITANDDOTREGEX = u"( |^)([0-9]{1,2})[.]( |$)"
    DIGITANDDOTSUB = u"\\g<1>\\g<2>.\\g<3>"
    #Do not put a ; for the character entity, otherwise
    #sentence segmentation is occurring
    #NOTE(review): the character entity (e.g. '&#46') appears to have
    #been lost in extraction -- as written, both substitutions insert a
    #plain dot and DIGITANDENTITYREGEX's unescaped '.' matches any
    #character, making the escape/restore round trip a near no-op.
    #Confirm against the original source.
    DIGITANDENTITYREGEX = u"( |^)([0-9]{1,2}).( |$)"
    DIGITANDENTITYSUB = u"\\g<1>\\g<2>.\\g<3>"

    ########################
    # Default constructor
    #
    def __init__(self, source, languageId, regexSubstitutionFormula,
                 regex_filter_list, logDir, segmentWithNLTK, keepNewWords):
        Document.__init__(self, source)

        self.languageId = languageId
        self.regexSubstitutionFormula = regexSubstitutionFormula
        self.regex_filter_list = regex_filter_list
        self.logDir = logDir
        self.classifier = None
        self.segmentWithNLTK = segmentWithNLTK
        self.keepNewWords = keepNewWords

    ########################
    #Getter and setters
    #
    def setClassifier(self, classifier):
        """Set the language classifier.

        It assumes it has been trained.
        """
        self.classifier = classifier

    def setSentencesLanguage(self, languageId):
        """Language is known.

        param 'languageId': a value between 0-4
            unknown : 0
            french  : 1
            german  : 2
            english : 3
            italian : 4
        """
        for textCluster in self.listContent:
            textCluster.setLanguage(languageId)

    ########################
    #Interface
    #
    def loadDocumentAsSentences(self, tempDir):
        """Convert to text, remove new lines and segment into
        sentences using the NLTK toolkit.
        """
        #Pdf to text
        tempFileName = self.convertToText(self.sourceFileName, tempDir,
                                          self.logDir)
        #Segment into sentences using NLTK toolkit
        self._loadTextDocumentAsSentences(tempFileName)
        #Delete temporary file
        MyFile(tempFileName).removeFile(tempFileName)

    def loadAsSentences(self, strText):
        """Load the given text string as sentences.

        param strText: an utf-8 encoded string
        """
        self._loadAsSentences(strText)

    def getCleanedText(self):
        """Get the cleaned text, one sentence per line."""
        textList = []
        for textCluster in self.listContent:
            textList.append(textCluster.getTextSentence())
        return self.MERGECLUSTERSEP.join(textList)

    def getCleanedTextPerLanguage(self):
        """Get the classified text per language.

        return a dictionary mapping language id to utf-8 text.
        """
        textDict = {}
        for textCluster in self.listContent:
            languageId = textCluster.getLanguageId()
            if languageId not in textDict:
                textDict[languageId] = []
            textDict[languageId].append(textCluster.getTextSentence())
        #One string per language
        resultDict = {}
        for k, textList in textDict.items():
            resultDict[k] = self.MERGECLUSTERSEP.join(textList)
        return resultDict

    def cleanTextSentences(self):
        """Use a set of regex rules to prepare the sentences."""
        self._applyAllClusters('clean')

    def normalizeTextSentences(self):
        """Use a set of regex rules to prepare the sentences.

        First group clusters per language and then apply language
        based normalization.
        """
        #Get cluster per language
        lang2clusterDict = self._getLanguage2ClustersDict()

        bEmpty = True
        #Normalize text per language
        for languageId, clusterList in lang2clusterDict.items():
            #Read all cluster texts
            textList = []
            for textCluster in clusterList:
                textList.append(textCluster.getTextSentence())
            #Join all text
            allText = self.MERGECLUSTERSEP.join(textList)
            #Normalize text
            allText = self.regexSubstitutionFormula.apply(allText, languageId)
            sentencesList = allText.split(self.MERGECLUSTERSEP)
            #Add and set language id; the first pass replaces the
            #previous document content
            self._addSentences(sentencesList, languageId, bEmpty)
            if bEmpty:
                bEmpty = False

    def prepareLM(self):
        """Prepare text sentences for N-Gram modeling."""
        self._applyAllClusters("prepareLM")

    def removeTextPunctuation(self):
        """Remove punctuation symbols."""
        self._applyAllClusters("removeTextPunctuation")

    def verbalizeTextPunctuation(self):
        """Transform punctuation symbols to words.

        Currently only implemented for French.
        """
        self._applyAllClusters("verbalizeTextPunctuation")

    def filterTextSentences(self):
        """Filter sentences after cleaning.

        Uses:
            - sentence length
            - number of digit groups
            - user defined rules
        """
        filteredContentList = []
        for textCluster in self.listContent:
            if textCluster.isValid():
                filteredContentList.append(textCluster)
        self.listContent = filteredContentList

    def classifySentences(self):
        """Classify sentences by language (FRENCH or GERMAN, ITALIAN
        or ENGLISH).
        """
        if self.classifier == None:
            self.classifier = WordClassifier()
            self.classifier.train()
        for textCluster in self.listContent:
            textCluster.classify(self.classifier)

    def display(self):
        """Display document content."""
        for textCluster in self.listContent:
            print(textCluster)

    ########################
    #Implementation
    #
    def _loadTextDocumentAsSentences(self, filePath):
        """Load a text document and segment it into sentences using NLTK.

        Initial new lines are first removed.
        """
        io = Ioread()
        #One string for the whole text file as utf-8 string
        data = io.nltkRead(filePath)
        self._loadAsSentences(data)

    def _loadAsSentences(self, strText):
        """Load the given text as sentences.

        Algorithm is:
            - New lines removal
            - Problematic periods replacement
            - Sentences segmentation with nltk
            - Problematic periods restoration

        param strText: an utf-8 encoded string
        """
        tokenizer_path = FRENCH_PICKLE_FOLDER
        if self.languageId == 2:
            tokenizer_path = GERMAN_PICKLE_FOLDER

        sentences = []
        if self.segmentWithNLTK:
            TextDocument.logger.info("Segment with NLTK")
            #Trim new lines
            strText = self._replaceNewLines(strText)
            #Problematic periods replacement
            strText = self._replaceProblematicPeriods(strText)
            #Nltk segmentation
            sentences = self._segmentIntoSentences(strText, tokenizer_path)
            #Problematic periods restoration
            for i, s in enumerate(sentences):
                sentences[i] = self._replaceProblematicPeriods(s, forward=False)
        else:
            TextDocument.logger.info("Segment with new lines")
            sentences = strText.split(u"\n")

        #Make text clusters with unknown language id
        self._addSentences(sentences)
        TextDocument.logger.info("Loaded %d raw sentences!" % len(sentences))

    def _applyAllClusters(self, method):
        """Apply 'method' to all clusters."""
        for textCluster in self.listContent:
            getattr(textCluster, method)()

    def _replaceNewLines(self, data):
        """Replace new lines by spaces.

        New lines are not considered at the end of a sentence.

        param data: an utf-8 encoded string
        """
        #Last sentence word split into two by a hyphenated line break
        data = re.sub(u"-\\n", u"", data, flags=re.UNICODE)
        return re.sub(u"\\n", u" ", data, flags=re.UNICODE)

    def _replaceProblematicPeriods(self, data, forward=True):
        """Convert dots preceded by a number and followed by a space
        into an html entity.

        If forward is set to False, it will convert from html entity
        to dots.

        This escaping is done in order to prevent segmenting
        sentences on numbers.
        """
        if not forward:
            return re.sub(self.DIGITANDENTITYREGEX, self.DIGITANDDOTSUB,
                          data, flags=re.UNICODE)
        return re.sub(self.DIGITANDDOTREGEX, self.DIGITANDENTITYSUB,
                      data, flags=re.UNICODE)

    def _segmentIntoSentences(self, data, tokenizer_path):
        """Replace current content by sentences.

        The sentences segmentation is done using a pickled
        tokenizer of the NLTK toolkit.

        param data: an utf-8 encoded string
        """
        try:
            #Get the language specific tokenizer
            tokenizer = nltk.data.load(tokenizer_path)
            #The actual job
            sentences = tokenizer.tokenize(data)
        except Exception as e:
            TextDocument.logger.critical("Tokenizer error: " + str(e))
            #Bug fix: previously raised with the non-existent attribute
            #'self.tokenizer_path', which triggered an AttributeError
            #inside the handler and masked the real error
            raise Exception("Tokenizer error: " + tokenizer_path)
        return sentences
class DataPreparationAPI():
    """Import sentences from one file, classifying sentences into languages.

    Drives the whole preparation pipeline of a 'TextDocument':
    loading, control-character cleaning, language classification,
    user regex normalization, filtering and LM preparation.
    """
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.

        param inputFile: path of the document to import (may be None
                         when a text string is supplied instead)
        param outputDir: output directory, also used as the default
                         temporary directory
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.segmentWithNLTK = True
        self.keepNewWords = False
        self.doc = None
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        #Empty string when no document has been prepared yet
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        #Empty string when no document has been prepared yet
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

        param regexList: a list of the following form:
            [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        substitutionList = []
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getSubstitutionList(self):
        """Get the user defined substitution list.

        return a four columns list of lists:
            [u'matching pattern', u'substitution', u'type', u'language id']
        """
        return self.substitutionRegexFormula.getSubstitutionPatterns()

    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

        param regexList: a four columns list of lists:
            [u'matching pattern', u'substitution', u'type', u'language id']
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)

        substitutionList = []
        for row in regexList:
            substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getValidationList(self):
        """Get the user defined validation list.

        return a four columns list of lists:
            [u'matching pattern', u'', u'-1', u'0']
        """
        validationList = []
        for pattern, regexType in self.validationPatternList:
            #Bug fix: list.append() takes exactly one argument; the
            #four values form one row, as documented above
            validationList.append([pattern, u"", regexType, u"0"])
        return validationList

    def setValidationList(self, regexList):
        """Set the user regexes validation list.

        Filter 'regexList' for validation rules only.

        param regexList: a four columns list of lists:
            ['matching pattern', 'substitution', 'type', 'language id']
        """
        self.validationPatternList = []
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def setSegmentWithNLTK(self, segmentWithNLTK):
        self.segmentWithNLTK = segmentWithNLTK

    def setKeepNewWords(self, keepNewWords):
        self.keepNewWords = keepNewWords

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.
        """
        #Lazy initialisation: the classifier is trained only once
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes from csv file.
        """
        #User did not specify rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
                len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

        param language: an int between 0-4
            - unknown : 0
            - french  : 1
            - german  : 2
            - english : 3
            - italian : 4

        return the prepared 'TextDocument'
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                    str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                    str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList,
                                    self.outputDir,
                                    self.segmentWithNLTK,
                                    self.keepNewWords)

            if self.inputFile != None:
                self.logger.info(
                    "Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #print self.doc.getCleanedText()

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            #print self.doc.getCleanedText()

            if language == 0:
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #print self.doc.getCleanedText()

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info(
                    "Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            #print self.doc.getCleanedText()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, punctuation removal will be
            #done at the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #print self.doc.getCleanedText()

            #After language id has been set as it depends of
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

        except Exception as e:
            #NOTE(review): 'e.message' is a Python 2 only attribute;
            #under Python 3 this handler would raise AttributeError --
            #confirm the project's target interpreter
            errorMessage = "An error as occurred when importing sentences: %s\n%s" % \
                (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)
            self.logger.critical(errorMessage)
            raise Exception(e)

        return self.doc
class DataPreparationAPI():
    """Import sentences from one file, classifying sentences into languages.

    Pipeline wrapper around 'TextDocument': loads a file or a text string,
    segments it into sentences, classifies each sentence by language
    (French, German, Italian, English or unknown) and optionally
    normalizes / filters the sentences for language modeling.
    """
    logger = logging.getLogger("Asrt.DataPreparationAPI")

    def __init__(self, inputFile, outputDir):
        """Default constructor.

        param inputFile: path of the document to import; may be None when
                         the text is supplied later via 'setFormattedText'
        param outputDir: directory for result files; also used as the
                         default temporary directory
        """
        self.inputFile = inputFile
        self.outputDir = outputDir
        #Temporary files default to the output directory
        self.tempDir = outputDir
        self.formattedText = None
        self.debug = False
        self.regexFile = None
        self.lmModeling = False
        self.filterSentences = False
        self.filterTextSentences2ndStage = False
        self.removePunctuation = False
        self.verbalizePunctuation = False
        self.segmentWithNLTK = True
        self.expandNumberInWords = True
        #Set by prepareDocument
        self.doc = None
        #Set by trainClassifier
        self.wordClassifier = None
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        #List of (pattern, language id) tuples
        self.validationPatternList = []

    #####################
    #Getters and setters
    #
    def setInputFile(self, inputFile):
        self.inputFile = inputFile

    def setOutputDir(self, outputDir):
        self.outputDir = outputDir

    def setTempDir(self, tempDir):
        self.tempDir = tempDir

    def setFormattedText(self, formattedText):
        self.formattedText = formattedText

    def getCleanedText(self):
        """Return the cleaned text of the prepared document,
           or an empty string when no document was prepared yet.
        """
        if self.doc != None:
            return self.doc.getCleanedText()
        return ""

    def getCleanedTextPerLanguage(self):
        """Return the cleaned text grouped per language,
           or an empty string when no document was prepared yet.
        """
        if self.doc != None:
            return self.doc.getCleanedTextPerLanguage()
        return ""

    def setDebugMode(self, debug):
        self.debug = debug

    def setRegexFile(self, regexFile):
        self.regexFile = regexFile

    def setRegexList(self, regexList):
        """Set both validation and substitution user regexes.

        param regexList: a list of the following form:
            [u'matching pattern', u'substitution', u'type', u'language id']
        """
        #Reset current lists
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

        substitutionList = []
        #Validation rows go to the validation list, all others
        #become substitution patterns
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))
            else:
                substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getSubstitutionList(self):
        """Get the user defined substitution list.

        return a four columns list of lists:
            [u'matching pattern', u'substitution', u'type', u'language id']
        """
        return self.substitutionRegexFormula.getSubstitutionPatterns()

    def setSubstitutionList(self, regexList):
        """Set the user regexes substitution list.

        param regexList: a four columns list of lists:
            [u'matching pattern', u'substitution', u'type', u'language id']
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)

        substitutionList = []
        for row in regexList:
            substitutionList.append((row[0], row[1], row[2], row[3]))

        self.substitutionRegexFormula.setSubstitutionPatternList(
            substitutionList)

    def getValidationList(self):
        """Get the user defined validation list.

        return a four columns list of lists:
            [u'matching pattern', u'', u'-1', u'0']
        """
        validationList = []
        for pattern, regexType in self.validationPatternList:
            validationList.append([pattern, "", regexType, "0"])
        return validationList

    def setValidationList(self, regexList):
        """Set the user regexes validation list.
           Filter 'regexList' for validation rules only.

        param regexList: a four columns list of lists:
            ['matching pattern', 'substitution', 'type', 'language id']
        """
        self.validationPatternList = []
        for row in regexList:
            if int(row[2]) == VALIDATION_TYPE:
                self.validationPatternList.append((row[0], row[3]))

    def setLMModeling(self, modelNgram):
        self.lmModeling = modelNgram

    def setFilterSentences(self, filterSentences):
        self.filterSentences = filterSentences

    def setFilterSentences2ndStage(self, filterTextSentences2ndStage):
        self.filterTextSentences2ndStage = filterTextSentences2ndStage

    def setRemovePunctuation(self, removePunctuation):
        self.removePunctuation = removePunctuation

    def setVerbalizePunctuation(self, verbalizePunctuation):
        self.verbalizePunctuation = verbalizePunctuation

    def setSegmentWithNLTK(self, segmentWithNLTK):
        self.segmentWithNLTK = segmentWithNLTK

    def setExpandNumberInWords(self, expandNumberInWords):
        self.expandNumberInWords = expandNumberInWords

    def getDocument(self):
        """Get the underlying 'TextDocument'.
        """
        return self.doc

    #####################
    #Public interface
    #
    def trainClassifier(self):
        """Train the underlying classifier.

        Lazy: only trains on the first call, then keeps the
        trained instance.
        """
        if self.wordClassifier == None:
            self.logger.info("Prepare the word classifier ...")
            self.wordClassifier = WordClassifier()
            self.wordClassifier.train()

    def getRegexes(self):
        """Fetch validation and substitution regexes from csv file.

        No-op when no regex file was supplied or when regexes were
        already loaded into the API.
        """
        #User did not specify rules
        if self.regexFile == None:
            return

        #Are regexes already loaded in API
        if self.substitutionRegexFormula.hasPatterns() or \
                len(self.validationPatternList) > 0:
            return

        regexList = RegexList().loadFromFile(self.regexFile)
        self.setRegexList(regexList)

    def resetAllPatterns(self):
        """Empty all validation and substitution regexes.
        """
        self.substitutionRegexFormula = RegularExpressionFormula(None)
        self.validationPatternList = []

    def prepareDocument(self, language=0):
        """Segment the document into sentences and prepare them.

        param language: an int between 0-4
            - unknown : 0
            - french  : 1
            - german  : 2
            - english : 3
            - italian : 4
        return the prepared 'TextDocument'
        raise Exception on an unknown language id, when neither an input
              file nor a text string was provided, or when sentence
              preparation fails
        """
        if language > 4 or language < 0:
            raise Exception("Unknown language")

        #Done at the API level to share resources between
        #documents
        self.logger.info("Getting regexes")
        self.getRegexes()

        if self.substitutionRegexFormula.hasPatterns():
            self.logger.info("Using following regexes substitution:\n" +\
                str(self.substitutionRegexFormula.getSubstitutionPatterns()[:]))
                # str(self.substitutionRegexFormula.getSubstitutionPatterns()[0:3]))

        if len(self.validationPatternList) > 0:
            self.logger.info("Using following regexes for sentence validation:\n" +\
                str(self.validationPatternList[0:3]))

        try:
            self.logger.info("Document file: %s" % self.inputFile)

            #The main document
            self.doc = TextDocument(self.inputFile, language,
                                    self.substitutionRegexFormula,
                                    self.validationPatternList,
                                    self.outputDir,
                                    self.segmentWithNLTK,
                                    self.expandNumberInWords)

            if self.inputFile != None:
                self.logger.info("Load file, convert to text when pdf document")
                self.doc.loadDocumentAsSentences(self.tempDir)
            elif self.formattedText != None:
                self.logger.info("Load text string as sentences")
                self.doc.loadAsSentences(self.formattedText)
            else:
                raise Exception("No input file or text string provided!")

            #Control character and strip
            self.logger.info("Cleaning control characters")
            self.doc.cleanTextSentences()

            if language == 0:
                #Language unknown: classify sentence by sentence
                self.logger.info("Classifying sentences")
                self.doc.setClassifier(self.wordClassifier)
                self.doc.classifySentences()
            else:
                self.doc.setSentencesLanguage(language)

            #User's supplied regular expression
            if self.substitutionRegexFormula.hasPatterns():
                self.logger.info("Applying user regular expressions per language")
                self.doc.normalizeTextSentences()

            if self.filterSentences:
                self.logger.info("Filtering data")
                self.doc.filterTextSentences()

            #If LM option is selected, it will be done at
            #the prepareLM stage
            if self.removePunctuation and not self.lmModeling:
                self.doc.removeTextPunctuation()

            if self.verbalizePunctuation and not self.removePunctuation:
                self.doc.verbalizeTextPunctuation()

            #After language id has been set as it depends on
            #languages (i.e. numbers expansion)
            if self.lmModeling:
                self.logger.info("Preparing for language modeling")
                self.doc.prepareLM()

            #Second stage filtering only runs for German
            if self.filterTextSentences2ndStage:
                if language == GERMAN:
                    self.logger.info(
                        "Filtering data - 2nd stage (remove web address and check German orthograph)"
                    )
                    self.doc.filterTextSentences2ndStage()

        except Exception as e:
            #NOTE(review): e.message is Python 2 only -- confirm the target
            #interpreter, since other parts of this class use Python 3 idioms
            errorMessage = "An error has occurred when importing sentences: %s\n%s" % \
                (getByteString(e.message), self.inputFile)
            errorMessage = getErrorMessage(e, errorMessage)
            self.logger.critical(errorMessage)
            raise Exception(e)

        return self.doc

    def outputSentencesToFiles(self, outputDir):
        """Output the original sentences with language
           information to the 'outputFile'

        param outputDir: directory receiving one
                         'sentences_<language>.txt' file per language
        """
        self.logger.info("Output results to language files.")

        sentencesDict = {FRENCH_LABEL: [], GERMAN_LABEL: [],
                         ITALIAN_LABEL: [], ENGLISH_LABEL: [],
                         UNKNOWN_LABEL: []}

        self.appendDocumentSentences(self.doc, sentencesDict)
        self.outputPerLanguage(sentencesDict, outputDir)

    @staticmethod
    def appendDocumentSentences(textDocument, sentencesDict):
        """Update 'sentencesDict' with the 'textDocument' content.

        param textDocument: a prepared 'TextDocument'
        param sentencesDict: dict mapping language labels to sentence lists,
                             mutated in place
        """
        #Save all sentences
        for textCluster in textDocument.getListContent():
            strSentence = textCluster.getTextSentence()
            currentLanguage = UNKNOWN_LABEL

            if textCluster.isFrench():
                currentLanguage = FRENCH_LABEL
            elif textCluster.isGerman():
                currentLanguage = GERMAN_LABEL
            elif textCluster.isItalian():
                currentLanguage = ITALIAN_LABEL
            elif textCluster.isEnglish():
                currentLanguage = ENGLISH_LABEL

            #strOut = u"<" + textDocument.sourceFileName + u">: " + strSentence
            strOut = strSentence.rstrip()
            sentencesDict[currentLanguage].append(strOut)

    @staticmethod
    def outputPerLanguage(sentencesDict, outputDir):
        """Output sentences in language files.

        param sentencesDict: dict mapping language labels to sentence lists
        param outputDir: directory receiving the per-language text files
        """
        io = Ioread()
        #Finally output to disk
        for resultLanguage, results in list(sentencesDict.items()):
            if len(results) > 0:
                DataPreparationAPI.logger.info("%d sentences found for: %s" %
                                               (len(results), resultLanguage))
                strContent = "\n".join(results)
                #Single trailing newline at end of file
                strContent = strContent.rstrip() + "\n"
                outputPath = "%s/sentences_%s.txt" % (outputDir,\
                                                      resultLanguage)
                DataPreparationAPI.logger.info("Writing content to: %s" % outputPath)
                io.writeFileContent(outputPath, strContent)
            else:
                DataPreparationAPI.logger.info("No sentences found for: %s" % resultLanguage)