예제 #1
0
파일: TextDocument.py 프로젝트: idiap/asrt
    def _addClusterText(self, utterance, languageId):
        """Add 'utterance' as a text cluster.

           param utterance: an utf-8 encoded string
        """
        utterance = utterance.strip()
        if len(utterance) > 0:
            c = TextCluster(self, utterance)
            c.setLanguage(languageId)
            self.addDocumentLine(c)
예제 #2
0
    def _addClusterText(self, utterance, languageId):
        """Add 'utterance' as a text cluster.

           param utterance: an utf-8 encoded string
        """
        utterance = utterance.strip()
        if len(utterance) > 0:
            c = TextCluster(self, utterance)
            c.setLanguage(languageId)
            self.addDocumentLine(c)
예제 #3
0
           param 'sentencesList': a list of text sentences
           param 'languageId'   : the language id for the sentences list
           param 'bEmpty'       : empty current document is set otherwise
                                  add to existing clusters 
        """
        if bEmpty:
            self.reset()

        #Add sentences as clusters
        for line in sentencesList:
            #Further sentence split to avoid long paragraphes
            for utterance in re.split(ur"\t|;|:|!|\?", line, flags=re.UNICODE):
                utterance = utterance.strip()
                if len(utterance) > 0:
                    c = TextCluster(self, utterance)
                    c.setLanguage(languageId)
                    self.addDocumentLine(c)

    def _getLanguage2ClustersDict(self):
        """Map languages with a list of clusters.

           return a dictionary with one entry per
                  language.
        """
        languageDict = {}
        for textCluster in self.listContent:
            clusterLanguageId = textCluster.getLanguageId()
            #First cluster
            if clusterLanguageId not in languageDict:
                languageDict[clusterLanguageId] = []
예제 #4
0
           param 'sentencesList': a list of text sentences
           param 'languageId'   : the language id for the sentences list
           param 'bEmpty'       : empty current document is set otherwise
                                  add to existing clusters 
        """
        if bEmpty:
            self.reset()

        #Add sentences as clusters
        for line in sentencesList:
            #Further sentence split to avoid long paragraphes
            for utterance in re.split(ur"\t|;|:|!|\?", line, flags=re.UNICODE):
                utterance = utterance.strip()
                if len(utterance) > 0:
                    c = TextCluster(self, utterance)
                    c.setLanguage(languageId)
                    self.addDocumentLine(c)

    def _getLanguage2ClustersDict(self):
        """Map languages with a list of clusters.

           return a dictionary with one entry per
                  language.
        """
        languageDict = {}
        for textCluster in self.listContent:
            clusterLanguageId = textCluster.getLanguageId()
            #First cluster
            if clusterLanguageId not in languageDict:
                languageDict[clusterLanguageId] = []