def _addClusterText(self, utterance, languageId):
    """Register a piece of text as a new text cluster.

       Leading and trailing whitespace is stripped first; when
       nothing remains after stripping, the call does nothing.

       param utterance : the text to add (described by the original
                         author as an utf-8 encoded string)
       param languageId: the language id set on the created cluster
    """
    text = utterance.strip()

    # Whitespace-only or empty input produces no cluster
    if not text:
        return

    cluster = TextCluster(self, text)
    cluster.setLanguage(languageId)
    self.addDocumentLine(cluster)
param 'sentencesList': a list of text sentences param 'languageId' : the language id for the sentences list param 'bEmpty' : empty current document is set otherwise add to existing clusters """ if bEmpty: self.reset() #Add sentences as clusters for line in sentencesList: #Further sentence split to avoid long paragraphes for utterance in re.split(ur"\t|;|:|!|\?", line, flags=re.UNICODE): utterance = utterance.strip() if len(utterance) > 0: c = TextCluster(self, utterance) c.setLanguage(languageId) self.addDocumentLine(c) def _getLanguage2ClustersDict(self): """Map languages with a list of clusters. return a dictionary with one entry per language. """ languageDict = {} for textCluster in self.listContent: clusterLanguageId = textCluster.getLanguageId() #First cluster if clusterLanguageId not in languageDict: languageDict[clusterLanguageId] = []