def ScorePhonemes(self, source=None, target=None):
    """Score every target phoneme as correctly transcribed or not.

    Each source sentence is globally aligned against the target sentence
    at the same index. A target phoneme counts as a hit when the source
    phoneme aligned to it is identical.

    Args:
        source: Per-sentence sequences of 3-tuples whose first field is the
            phoneme (the other two fields are unpacked as word number and
            word but are not used here). Falls back to
            ``self.source_phonemes`` when omitted or empty.
        target: Same structure for the reference sentences. Falls back to
            ``self.target_phonemes`` when omitted or empty.

    Sets (nothing is returned):
        self.hits_phonemes (nested list): for each sentence, one bool per
            target phoneme.
        self.source_matched (nested list): for each sentence, the source
            phoneme placed at each target position ('-' where the source
            sentence is empty).

    Note:
        This scoring method has no word accuracy awareness. Phonemes from
        correctly input words may wind up labeled wrong (e.g. target
        "with the" against source "with a").

        Modified from Eser Aygün
        (https://pypi.python.org/pypi/alignment/1.0.9)
    """
    if not source:
        source = self.source_phonemes
    if not target:
        target = self.target_phonemes

    self.source_matched = []
    hits = []
    for x, ttup in enumerate(target):
        if not ttup:
            # An empty target sentence has nothing to score; unpacking
            # zip(*ttup) below would otherwise raise ValueError.
            hits.append([])
            self.source_matched.append([])
            continue

        tphon, _, _ = zip(*ttup)
        stup = source[x]
        if not stup:
            # Nothing was transcribed: every target phoneme is a miss.
            hitlist = [False] * len(tphon)
            bPhonOut = ['-'] * len(tphon)
        else:
            sphon, _, _ = zip(*stup)

            # Create sequences to be aligned.
            a = Sequence(tphon)
            b = Sequence(sphon)

            # Create a vocabulary and encode the sequences.
            v = Vocabulary()
            aEncoded = v.encodeSequence(a)
            bEncoded = v.encodeSequence(b)

            # Create a scoring and align the sequences using the global
            # aligner (presumably match = 2, mismatch = -1, gap = -2).
            scoring = SimpleScoring(2, -1)
            aligner = GlobalSequenceAligner(scoring, -2)
            _, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
            encoded = encodeds[0]

            # Score based only on hits vs misses; insertions (positions
            # where the target row holds code 0) are ignored.
            notInsert = encoded[:][0] != 0
            nonInsertMatched = encoded[notInsert]
            aSeq = nonInsertMatched[0]
            bSeq = nonInsertMatched[1]

            # Find where the aligned run sits inside the encoded target.
            # Compare element-wise: a summed difference of zero does not
            # imply equality because differences can cancel out.
            y = 0
            for y in range(0, len(aEncoded) - len(aSeq) + 1):
                aChunk = aEncoded[y:y + len(aSeq)]
                if np.array_equal(aChunk, aSeq):
                    break

            # Label all items not aligned to the target as False.
            hitlist = []
            hitlist.extend([False] * y)
            hitlist.extend(list(aSeq == bSeq))
            hitlist.extend([False] * (len(aEncoded) - y - len(aSeq)))

            # Export the target-aligned phonemes of the source sequence.
            # Positions left at code 0 decode to the vocabulary's first
            # element (presumably the gap symbol).
            bPhons = np.zeros(len(aEncoded), int)
            bPhons[y:y + len(bSeq)] = bSeq
            bPhonOut = np.array(v.elements())[bPhons].tolist()

        hits.append(hitlist)
        self.source_matched.append(bPhonOut)

    self.hits_phonemes = hits
def ScoreWords(self):
    """Align source words to target words and score each target word.

    Reads ``self.target`` and ``self.source``, which are indexed in
    parallel and hold one whitespace-separated sentence string per entry.
    Each pair is globally aligned word by word; a target word counts as a
    hit when the source word aligned to it is identical.

    Sets (nothing is returned):
        self.hits (nested list): for each sentence, one bool per target
            word.
        self.source_matchWords (nested list): for each sentence, the source
            word placed at each target position ('-' where the source
            sentence has no words).
        self.wscore (numpy array): percentage of target words hit, one
            float per sentence. A target sentence with no words scores 0.0.

    Each sentence's aligned source words are also printed.

    Note:
        Modified from Eser Aygün
        (https://pypi.python.org/pypi/alignment/1.0.9)
    """
    target = self.target
    source = self.source

    self.source_matchWords = []
    hits = []
    scores = []
    for tnum, tsent in enumerate(target):
        ssent = source[tnum]
        twords = tsent.split()
        swords = ssent.split()

        if not twords:
            # No target words: nothing to score or align.
            hitlist = []
            bWordOut = []
        elif not swords:
            # Nothing was transcribed: every target word is a miss.
            hitlist = [False] * len(twords)
            bWordOut = ['-'] * len(twords)
        else:
            # Create sequences to be aligned.
            a = Sequence(twords)
            b = Sequence(swords)

            # Create a vocabulary and encode the sequences.
            v = Vocabulary()
            aEncoded = v.encodeSequence(a)
            bEncoded = v.encodeSequence(b)

            # Create a scoring and align the sequences using the global
            # aligner (presumably match = 5, mismatch = -1, gap = -1).
            scoring = SimpleScoring(5, -1)
            aligner = GlobalSequenceAligner(scoring, -1)
            _, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
            encoded = encodeds[0]

            # Score based only on hits vs misses; insertions (positions
            # where the target row holds code 0) are ignored.
            notInsert = encoded[:][0] != 0
            nonInsertMatched = encoded[notInsert]
            aSeq = nonInsertMatched[0]
            bSeq = nonInsertMatched[1]

            # Find where the aligned run sits inside the encoded target.
            # Compare element-wise: a summed difference of zero does not
            # imply equality because differences can cancel out.
            x = 0
            for x in range(0, len(aEncoded) - len(aSeq) + 1):
                aChunk = aEncoded[x:x + len(aSeq)]
                if np.array_equal(aChunk, aSeq):
                    break

            # Label all items not aligned to the target as False.
            hitlist = []
            hitlist.extend([False] * x)
            hitlist.extend(list(aSeq == bSeq))
            hitlist.extend([False] * (len(aEncoded) - x - len(aSeq)))

            # Export the target-aligned words of the source sequence.
            # Positions left at code 0 decode to the vocabulary's first
            # element (presumably the gap symbol).
            bWords = np.zeros(len(aEncoded), int)
            bWords[x:x + len(bSeq)] = bSeq
            bWordOut = np.array(v.elements())[bWords].tolist()

        hits.append(hitlist)
        if hitlist:
            iwscore = sum(hitlist) * 100 / float(len(hitlist))
        else:
            # Avoid dividing by zero for a target sentence with no words.
            iwscore = 0.0
        scores.append(iwscore)

        # Parenthesised form prints identically on Python 2 and 3.
        print(bWordOut)
        self.source_matchWords.append(bWordOut)

    self.hits = hits
    self.wscore = np.array(scores, dtype=float)