Ejemplo n.º 1
0
    def ScorePhonemes(self, source=None, target=None):
        """Align source phonemes to target phonemes and flag the target hits.

        For every sentence, the target phoneme sequence is globally aligned
        against the source phoneme sequence. Each target phoneme is then
        marked True when the aligned source phoneme is identical, and False
        otherwise. Insertions (source phonemes with no target counterpart)
        are ignored.

        Args:
            source (list, optional): One entry per sentence. Each entry is a
                sequence of 3-tuples unpacked as (phoneme, word number,
                word); only the phoneme is used here. An empty entry marks
                every target phoneme of that sentence as missed. Falls back
                to ``self.source_phonemes`` when omitted or empty.
            target (list, optional): Same layout as ``source``. Falls back to
                ``self.target_phonemes`` when omitted or empty.

        Side effects (nothing is returned):
            self.hits_phonemes (nested list): per sentence, one bool per
                target phoneme.
            self.source_matched (nested list): per sentence, the source
                phonemes laid out against the target positions ('-' for
                every position when the source sentence is empty).

        Note:
            This scoring method has no word accuracy awareness. Phonemes from
            correctly entered words may still end up labeled wrong when a
            neighbouring word differs (e.g. target "with the" vs. source
            "with a").
            Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
        """
        if not source:
            source = self.source_phonemes
        if not target:
            target = self.target_phonemes

        # Bind the result lists up front so the attributes exist (as empty
        # lists) even when there are no target sentences to score.
        self.hits_phonemes = hits = []
        self.source_matched = matched = []

        for x, ttup in enumerate(target):
            if not ttup:
                # A target sentence without phonemes has nothing to score;
                # zip(*ttup) below could not be unpacked for it.
                hits.append([])
                matched.append([])
                continue

            tphon, _, _ = zip(*ttup)
            stup = source[x]
            if not stup:
                # Nothing was transcribed: every target phoneme is a miss.
                hitlist = [False] * len(tphon)
                bPhonOut = ['-'] * len(tphon)
            else:
                sphon, _, _ = zip(*stup)
                # Create sequences to be aligned.
                a = Sequence(tphon)
                b = Sequence(sphon)

                # Create a vocabulary and encode the sequences.
                v = Vocabulary()
                aEncoded = v.encodeSequence(a)
                bEncoded = v.encodeSequence(b)

                # Create a scoring and align the sequences using global aligner.
                # NOTE(review): presumably match=+2, mismatch=-1, gap=-2 as in
                # the upstream alignment package -- confirm against the
                # modified aligner.
                scoring = SimpleScoring(2, -1)
                aligner = GlobalSequenceAligner(scoring, -2)
                _, encodeds = aligner.align(aEncoded,
                                            bEncoded,
                                            backtrace=True)
                encoded = encodeds[0]

                # Score based only on hits vs misses, insertions are ignored.
                # NOTE(review): relies on the indexing semantics of the
                # aligner's alignment object; presumably code 0 marks a gap.
                notInsert = encoded[:][0] != 0
                nonInsertMatched = encoded[notInsert][:]

                # Find the alignment in the target sequence.
                aSeq = nonInsertMatched[:][0]
                bSeq = nonInsertMatched[:][1]

                # Locate where the aligned span starts inside the target.
                # The comparison is element-wise: summing the differences
                # would also accept chunks whose differences cancel out
                # (e.g. [3, 1] vs. [1, 3]).
                offset = 0
                span = len(aSeq)
                for offset in range(len(aEncoded) - span + 1):
                    if not np.any(aEncoded[offset:offset + span] - aSeq):
                        break

                # Label all items not aligned to the target as false.
                hitlist = [False] * offset
                hitlist.extend(list(aSeq - bSeq == 0))
                hitlist.extend([False] * (len(aEncoded) - offset - span))

                # Export the target aligned phonemes of the source sequence.
                bPhons = np.zeros(len(aEncoded), int)
                bPhons[offset:offset + len(bSeq)] = bSeq
                bPhonOut = np.array(v.elements())[bPhons].tolist()
            hits.append(hitlist)
            matched.append(bPhonOut)
Ejemplo n.º 2
0
    def ScoreWords(self):
        """Align each source sentence to its target sentence and score word hits.

        Sentences from ``self.target`` and ``self.source`` are paired by
        index and split on whitespace. The word sequences are globally
        aligned, and every target word is marked True when the aligned source
        word is identical and False otherwise. Insertions (source words with
        no target counterpart) are ignored.

        Side effects (nothing is returned):
            self.hits (nested list): per sentence, one bool per target word.
            self.source_matchWords (nested list): per sentence, the source
                words laid out against the target word positions.
            self.wscore (numpy array of float): per sentence, the percentage
                of target words that were hit (0.0 for a target sentence
                without words).
            Each aligned source word list is also printed to stdout.

        Note:
            Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
        """
        target = self.target
        source = self.source

        # Bind the result lists up front so the attributes exist (as empty
        # containers) even when there are no target sentences to score.
        self.hits = hits = []
        self.source_matchWords = matched_words = []
        scores = []

        for tnum, tsent in enumerate(target):
            ssent = source[tnum]
            # Create sequences to be aligned.
            a = Sequence(tsent.split())
            b = Sequence(ssent.split())

            # Create a vocabulary and encode the sequences.
            v = Vocabulary()
            aEncoded = v.encodeSequence(a)
            bEncoded = v.encodeSequence(b)

            # Create a scoring and align the sequences using global aligner.
            # NOTE(review): presumably match=+5, mismatch=-1, gap=-1 as in the
            # upstream alignment package -- confirm against the modified
            # aligner.
            scoring = SimpleScoring(5, -1)
            aligner = GlobalSequenceAligner(scoring, -1)
            _, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
            encoded = encodeds[0]

            # Score based only on hits vs misses, insertions are ignored.
            # NOTE(review): relies on the indexing semantics of the aligner's
            # alignment object; presumably code 0 marks a gap.
            notInsert = encoded[:][0] != 0
            nonInsertMatched = encoded[notInsert][:]

            # Find the alignment in the target sequence.
            aSeq = nonInsertMatched[:][0]
            bSeq = nonInsertMatched[:][1]

            # Locate where the aligned span starts inside the target. The
            # comparison is element-wise: summing the differences would also
            # accept chunks whose differences cancel out (e.g. [3, 1] vs.
            # [1, 3]).
            offset = 0
            span = len(aSeq)
            for offset in range(len(aEncoded) - span + 1):
                if not np.any(aEncoded[offset:offset + span] - aSeq):
                    break

            # Label all items not aligned to the target as false.
            hitlist = [False] * offset
            hitlist.extend(list(aSeq - bSeq == 0))
            hitlist.extend([False] * (len(aEncoded) - offset - span))

            # Export the target aligned words of the source sequence.
            bWords = np.zeros(len(aEncoded), int)
            bWords[offset:offset + len(bSeq)] = bSeq
            bWordOut = np.array(v.elements())[bWords].tolist()

            hits.append(hitlist)
            # Percentage of target words hit; guard against a target sentence
            # with no words, which would otherwise divide by zero.
            if hitlist:
                scores.append(sum(hitlist) * 100 / float(len(hitlist)))
            else:
                scores.append(0.0)
            # Single-argument call form prints identically under the Python 2
            # print statement and the Python 3 print function.
            print(bWordOut)
            matched_words.append(bWordOut)

        # Build the score array once instead of re-stacking it per sentence.
        self.wscore = np.array(scores, dtype=float)