def get_labels(self):
    """Label the sausage as correct ('O') or incorrect ('X').

    When ``self.correct()`` is true, every slot is correct and the result
    is ``['O'] * self.num_slots()``.

    Otherwise the reference (``self.ref()``) is globally aligned against
    the best hypothesis (``self.best_hyp()``) with match score 2, mismatch
    score -1 and gap score -2, and the first alignment returned by the
    aligner is used.  One label is produced per aligned column: 'O' when
    both sides hold the same element, or when the reference side is the
    gap marker '-' and the hypothesis side is '*DELETE*'; 'X' otherwise.

    NOTE(review): the list has one entry per alignment column, which is
    not obviously equal to ``self.num_slots()`` -- confirm with callers.
    """
    if self.correct():
        # Nothing to align: all slots are right.
        return ['O'] * self.num_slots()

    # Encode reference and best hypothesis with a shared vocabulary.
    ref_seq = Sequence(self.ref())
    hyp_seq = Sequence(self.best_hyp())
    vocab = Vocabulary()
    ref_codes = vocab.encodeSequence(ref_seq)
    hyp_codes = vocab.encodeSequence(hyp_seq)

    # Strict global alignment; only the first backtraced alignment is kept.
    aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, candidates = aligner.align(ref_codes, hyp_codes, backtrace=True)
    best = vocab.decodeSequenceAlignment(candidates[0])

    def _is_correct(ref_item, hyp_item):
        # Equal elements are correct; so is a gap on the reference side
        # paired with an explicit '*DELETE*' on the hypothesis side.
        if ref_item == hyp_item:
            return True
        return ref_item == '-' and hyp_item == '*DELETE*'

    return [
        'O' if _is_correct(ref_item, hyp_item) else 'X'
        for ref_item, hyp_item in zip(best.first, best.second)
    ]
def align_sequences(seq_a, seq_b):
    """Globally align two sequences and return the first encoded alignment.

    Both inputs are escaped, encoded with one shared ``Vocabulary`` and
    aligned with a ``StrictGlobalSequenceAligner`` (match score 3,
    mismatch score -1, gap score -2).  The alignment score is discarded.

    Args:
        seq_a: First iterable of elements to align.
        seq_b: Second iterable of elements to align.

    Returns:
        The first entry of the alignment list produced by
        ``aligner.align(..., backtrace=True)``; it is still in encoded
        form (it is not decoded through the vocabulary here).
    """

    def _escape_gap_marker(items):
        # The alignment library uses '-' as its gap marker, so a literal
        # '-' element has to be replaced before encoding.
        return ['\\-' if item == '-' else item for item in items]

    first_items = _escape_gap_marker(seq_a)
    second_items = _escape_gap_marker(seq_b)

    vocab = Vocabulary()
    first_codes = vocab.encodeSequence(Sequence(first_items))
    second_codes = vocab.encodeSequence(Sequence(second_items))

    aligner = StrictGlobalSequenceAligner(
        SimpleScoring(matchScore=3, mismatchScore=-1),
        gapScore=-2,
    )
    _, alignments = aligner.align(first_codes, second_codes, backtrace=True)
    return alignments[0]