def seqToAlign(a, b, matchScore=3, mismatchScore=-1, gapScore=-2):
    """Helper for finding local alignments between two word lists.

    Args:
        a: wrapped list of words (only the first element is used)
        b: wrapped list of words (only the first element is used)
        matchScore: score awarded per matching word
        mismatchScore: penalty per mismatched word
        gapScore: penalty per gap

    Returns:
        List of (score, first_words, second_words) tuples, one per
        optimal local alignment.
    """
    # Unwrap the outer list layer before building sequences.
    a = a[0]
    b = b[0]
    seq1 = Sequence(a)
    seq2 = Sequence(b)

    # Encode both sequences against a shared vocabulary.
    vocab = Vocabulary()
    enc_first = vocab.encodeSequence(seq1)
    enc_second = vocab.encodeSequence(seq2)

    # Run the local (Smith-Waterman style) aligner with backtrace.
    aligner = LocalSequenceAligner(SimpleScoring(matchScore, mismatchScore), gapScore)
    score, encodeds = aligner.align(enc_first, enc_second, backtrace=True)

    decoded = [vocab.decodeSequenceAlignment(encoded) for encoded in encodeds]
    return [(aln.score, list(aln.first), list(aln.second)) for aln in decoded]
def align(self, seq1, seq2):
    """Align seq2 (automatically detected conditions) against seq1 (truth
    conditions) and return the aligned version of seq2.

    Uses a global aligner; takes only the first optimal alignment and
    asserts that it is at least 97% identical to the truth sequence.

    Raises:
        AssertionError: if the best alignment is below 97% identity.
    """
    print("len(truth_conditions) = {}, len(detected_conditions) = {}".format(len(seq1), len(seq2)))
    from alignment.sequence import Sequence
    from alignment.vocabulary import Vocabulary
    from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

    # Create sequences to be aligned.
    a = Sequence(seq1)
    b = Sequence(seq2)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # Take the first optimal alignment only.
    # FIX: the original body mixed Python 2 print statements with the
    # Python 3 print() call above; converted to print() throughout.
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        print(alignment)
        print('Alignment score:', alignment.score)
        print('Percent identity:', alignment.percentIdentity())
        assert alignment.percentIdentity() >= 97.0
        first, second = list(alignment.first), list(alignment.second)
        break
    return second
def align(s1, s2):
    """Globally align two whitespace-delimited strings and return the words
    of s1 whose positions match identical words in s2 (per the top
    alignment)."""
    seq_a = Sequence(s1.split())
    seq_b = Sequence(s2.split())

    # Shared vocabulary so both sequences use the same integer codes.
    vocab = Vocabulary()
    enc_a = vocab.encodeSequence(seq_a)
    enc_b = vocab.encodeSequence(seq_b)

    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    score, encodeds = aligner.align(enc_a, enc_b, backtrace=True)

    top_encoding = encodeds[0]
    # Decoded for parity with the original; the value itself is unused.
    alignment = vocab.decodeSequenceAlignment(top_encoding)

    matched_words = []
    gaps_in_first = 0
    for pos, (code_a, code_b) in enumerate(top_encoding):
        if code_a == code_b:
            # Offset the index by the gaps seen so far in the first sequence.
            matched_words.append(seq_a[pos - gaps_in_first])
        elif code_a == 0:
            gaps_in_first += 1
    return matched_words
def align_ref_long(hyp, ref): ''' Aligns a ref to a sausage-aligned hype using the align library ''' # align ref to hyp sr = Sequence(ref) sh = Sequence(hyp) v = Vocabulary() rEncoded = v.encodeSequence(sr) hEncoded = v.encodeSequence(sh) # Create a scoring and align the sequences using global aligner. scoring = SimpleScoring(2, -1) aligner = StrictGlobalSequenceAligner(scoring, -2) score, encodeds = aligner.align(hEncoded, rEncoded, backtrace=True) # Iterate over optimal alignments and print them. alignment = v.decodeSequenceAlignment(encodeds[0]) ref_align_raw = [token[0] for token in alignment if token[0] != '-'] ref_align = [] for token in ref_align_raw: if token == '-': ref_align.append(delete_token) else: ref_align.append(token) for i in range(len(hyp) - len(ref_align_raw)): ref_align.append(delete_token) return ref_align
def align(sequence1, sequence2):
    """Globally align two token sequences and return the decoded alignment.

    Literal '-' tokens are escaped first because the aligner uses the dash
    as its gap element.
    """
    escaped1 = ['<DASH />' if word == '-' else word for word in sequence1]
    escaped2 = ['<DASH />' if word == '-' else word for word in sequence2]

    # Build Sequence objects from the escaped token lists.
    seq_a = Sequence(escaped1)
    seq_b = Sequence(escaped2)

    # Encode both against one shared vocabulary.
    vocab = Vocabulary()
    enc_a = vocab.encodeSequence(seq_a)
    enc_b = vocab.encodeSequence(seq_b)

    # Global alignment: match +2, mismatch -1, gap -2.
    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    score, encodeds = aligner.align(enc_a, enc_b, backtrace=True)

    # Decode and return only the first optimal alignment.
    return vocab.decodeSequenceAlignment(encodeds[0])
def get_labels(self):
    """Label each slot in the sausage ('O' = correct, 'X' = incorrect)."""
    # Short-circuit: a fully correct sausage needs no alignment.
    if self.correct():
        return ['O'] * self.num_slots()

    # Align the reference against the best hypothesis.
    ref_seq = Sequence(self.ref())
    hyp_seq = Sequence(self.best_hyp())
    vocab = Vocabulary()
    ref_enc = vocab.encodeSequence(ref_seq)
    hyp_enc = vocab.encodeSequence(hyp_seq)
    aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
    score, encodeds = aligner.align(ref_enc, hyp_enc, backtrace=True)
    alignment = vocab.decodeSequenceAlignment(encodeds[0])

    # A slot is correct when the tokens match, or when a reference gap
    # lines up with an explicit *DELETE* in the hypothesis.
    return [
        'O' if ref_tok == hyp_tok or (ref_tok == '-' and hyp_tok == '*DELETE*') else 'X'
        for ref_tok, hyp_tok in zip(alignment.first, alignment.second)
    ]
def align(trace1, trace2):
    """Globally align two traces and return the two aligned sequences as
    lists.

    Args:
        trace1: first trace (iterable of elements accepted by Sequence)
        trace2: second trace

    Returns:
        (s1, s2): the two halves of the rendered alignment, converted to
        lists by align_to_list (escaping runs of up to 4 characters).
    """
    # Create sequences to be aligned.
    a = Sequence(trace1)
    b = Sequence(trace2)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        # FIX: renamed from `align`, which shadowed this function's name.
        align_str = str(alignment)

        # The rendered alignment is the two sequences concatenated; split
        # it down the middle.
        seq_size = len(align_str)
        # FIX: was `seq_size / 2` — float division on Python 3 would make
        # the bounds non-integer; use floor division (identical on py2).
        half_size = seq_size // 2

        # First half.
        s1 = align_to_list(align_str, 0, half_size, 4)
        # Second half.
        s2 = align_to_list(align_str, half_size, seq_size, 4)

    # Return the lists from the last optimal alignment processed.
    return s1, s2
def get_labels(self):
    """Label each slot in the sausage ('O' = correct, 'X' = incorrect)."""
    if self.correct():
        # Everything is correct; no alignment needed.
        return ['O'] * self.num_slots()

    # Encode reference and best hypothesis with a shared vocabulary.
    vocab = Vocabulary()
    encoded_ref = vocab.encodeSequence(Sequence(self.ref()))
    encoded_hyp = vocab.encodeSequence(Sequence(self.best_hyp()))

    # Strict global alignment: match +2, mismatch -1, gap -2.
    scoring = SimpleScoring(2, -1)
    aligner = StrictGlobalSequenceAligner(scoring, -2)
    _, encodeds = aligner.align(encoded_ref, encoded_hyp, backtrace=True)
    alignment = vocab.decodeSequenceAlignment(encodeds[0])

    # Derive a label per aligned pair.
    labels = []
    for ref_tok, hyp_tok in zip(alignment.first, alignment.second):
        is_match = ref_tok == hyp_tok or (ref_tok == '-' and hyp_tok == '*DELETE*')
        labels.append('O' if is_match else 'X')
    return labels
def align_pair(
    first: NamedSequence,
    second: NamedSequence,
    vocabulary: Vocabulary,
) -> AlignmentResult:
    """Globally align two named sequences with EBL scoring and return the
    result, including every optimal alignment decoded against the given
    vocabulary."""
    aligner = GlobalSequenceAligner(EblScoring(vocabulary), True)
    score, encoded_alignments = aligner.align(
        first.sequence, second.sequence, backtrace=True
    )
    decoded = [
        vocabulary.decodeSequenceAlignment(encoded)
        for encoded in encoded_alignments
    ]
    return AlignmentResult(score, first, second, decoded)
def score_align(x, y):
    """Return an alignment-based distance in [0, 1]: zero when the best
    global alignment of x and y is 100% identical, one when no identity."""
    vocab = Vocabulary()
    enc_x = vocab.encodeSequence(Sequence(x))
    enc_y = vocab.encodeSequence(Sequence(y))

    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    score, encodeds = aligner.align(enc_x, enc_y, backtrace=True)

    # Track the best percent identity over all optimal alignments.
    best_identity = 0.0
    for encoding in encodeds:
        decoded = vocab.decodeSequenceAlignment(encoding)
        if decoded.percentIdentity() > best_identity:
            best_identity = decoded.percentIdentity()
    return 1 - best_identity / 100.0
def score_align(x, y):
    """Distance between two sequences: 1 minus the best percent identity
    (as a fraction) over all optimal global alignments of x and y."""
    vocab = Vocabulary()
    first_enc = vocab.encodeSequence(Sequence(x))
    second_enc = vocab.encodeSequence(Sequence(y))

    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, encodeds = aligner.align(first_enc, second_enc, backtrace=True)

    # 0.0 when there are no alignments, matching the original accumulator.
    identities = [
        vocab.decodeSequenceAlignment(enc).percentIdentity() for enc in encodeds
    ]
    best = max(identities, default=0.0)
    return 1 - best / 100.0
def align(self, word, error):
    """Align a word against an error form with self.aligner and return the
    (first, second) sides of the highest-scoring alignment."""
    vocab = Vocabulary()
    encoded_word = vocab.encodeSequence(Sequence(word))
    encoded_error = vocab.encodeSequence(Sequence(error))
    _, encodings = self.aligner.align(encoded_word, encoded_error, backtrace=True)

    # Scan all backtraced alignments and keep the best-scoring one.
    best_score = -sys.maxsize
    best_alignment = None
    for encoding in encodings:
        candidate = vocab.decodeSequenceAlignment(encoding)
        if candidate.score > best_score:
            best_score = candidate.score
            best_alignment = candidate
    return best_alignment.first, best_alignment.second
def getAlignment(timit, utterance):
    """Globally align the flattened TIMIT phone list against an utterance
    and return a decoded alignment.

    timit: nested iterable of phone lists; flattened before aligning.
    utterance: flat sequence of phones.
    """
    # Flatten the nested timit structure into a single phone list.
    tim = list()
    for li in timit:
        for ph in li:
            tim.append(ph)
    a=Sequence(tim)
    b=Sequence(utterance)
    # Shared vocabulary for both sequences.
    v=Vocabulary()
    aEnc=v.encodeSequence(a)
    bEnc=v.encodeSequence(b)
    # Global alignment: match +2, mismatch -1, gap -2.
    scoring=SimpleScoring(2,-1)
    aligner=GlobalSequenceAligner(scoring,-2)
    score,encodeds= aligner.align(aEnc,bEnc,backtrace=True)
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        # NOTE(review): returning inside the loop yields the FIRST optimal
        # alignment; the original indentation was ambiguous, so confirm the
        # first (not last) alignment is intended. Raises NameError if
        # encodeds is empty.
        return alignment
def match_word_sorted(code1, code2):
    """Return the max scored alignment between the two input codes.

    The codes are compared via their common words: each side's common
    words are ordered by first occurrence in that code, and the two
    orderings are globally aligned. Returns 0 when no alignment beats it.
    """
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    common_words = set(list1) & set(list2)
    # FIX: replaces the bare `try: common_words.remove("") except: pass` —
    # discard() is a no-op when "" is absent and swallows nothing else.
    common_words.discard("")

    # Map each common word to its first index in each code.
    words_to_index = {}
    for word in common_words:
        words_to_index[word] = (list1.index(word), list2.index(word))

    # Order the common words by position in each code.
    # FIX: wrap .keys() in list() — on Python 3 it is a view, which
    # Sequence may not accept as a sequence.
    sorted1 = list(OrderedDict(sorted(words_to_index.items(), key=lambda t: t[1][0])).keys())
    sorted2 = list(OrderedDict(sorted(words_to_index.items(), key=lambda t: t[1][1])).keys())

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)

    # Best score over all optimal alignments (0 if none beats it).
    max_score = 0
    for encoded in encoders:
        alignment = v.decodeSequenceAlignment(encoded)
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
def text_to_text_alignment_and_score(text_ref, text_pred):
    """
    Find a word to word alignment between two texts, considering the first
    is the reference and the second the predicted.

    :param text_ref: text reference
    :param text_pred: predicted text
    :return: (alignment, recall_percent, precision_percent); alignment is
        [] and both scores 0 when no alignment is found.
    """
    text_ref = text_ref.lower()
    text_pred = text_pred.lower()
    iterable = [".", ","]
    # convert the reference text in order not to contain , and . (junk characters)
    translation_map = str.maketrans(to_translation_map(iterable))
    text_ref = text_ref.translate(translation_map)
    # Create sequences to be aligned.
    a = Sequence(text_ref.split())
    b = Sequence(text_pred.split())
    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    a_enc = v.encodeSequence(a)
    b_enc = v.encodeSequence(b)
    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(1, 0)
    aligner = GlobalSequenceAligner(scoring, 0)
    # NOTE(review): this 3-value unpack with raw word lists passed in
    # implies a locally modified alignment library — the stock
    # GlobalSequenceAligner.align returns (score, encodeds); confirm which
    # fork is in use.
    f, score, encodeds = aligner.align(a_enc, b_enc, text_ref.split(), text_pred.split(), backtrace=True)
    # get the first alignment if exists:
    #print(encodeds[0])
    print(encodeds)
    if len(encodeds[0]) > 0:
        alignment = v.decodeSequenceAlignment(encodeds[0])
        print(alignment)
        ## fix first missing words of asr text: prepend the predicted words
        ## that precede the alignment's first matched word, padding the
        ## reference side with '-'.
        list_asr = []
        list_pred = []
        for word in text_pred.split():
            if word != alignment.second.elements[0]:
                list_asr.append(word)
                list_pred.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        ## fix last missing words of asr text (same idea, from the end).
        list_asr = []
        list_pred = []
        for word in reversed(text_pred.split()):
            if word != alignment.second.elements[-1]:
                list_asr = [word] + list_asr
                list_pred.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        # fix first missing words of reference text
        list_asr = []
        list_pred = []
        for word in text_ref.split():
            if word != alignment.first.elements[0]:
                list_pred.append(word)
                list_asr.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        # fix last missing words of reference text
        list_asr = []
        list_pred = []
        for word in reversed(text_ref.split()):
            if word != alignment.first.elements[-1]:
                # NOTE(review): `[word] + list_asr` looks inconsistent with
                # the mirrored loop above, which builds `[word] + list_asr`
                # into list_asr; here the result lands in list_pred — this
                # may be a copy-paste bug (expected `[word] + list_pred`).
                # Left as-is; confirm intended behavior.
                list_pred = [word] + list_asr
                list_asr.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        #print(alignment.second.elements)
        #print(alignment.first.elements)
        print(alignment)
        # Recall/precision as percentages over reference/predicted lengths.
        rec = alignment.score * 100 / len(text_ref.split())
        pre = alignment.score * 100 / len(text_pred.split())
    else:
        alignment = []
        rec, pre = 0, 0
    return alignment, rec, pre
def test_utterance_transcriptions(self):
    """Decode every utterance against its own training graph and report
    transcription problems.

    Compiles per-job utterance FSTs and decodes them in a multiprocessing
    pool, then compares each decoded transcript with its reference text.
    Mismatches (insertions/deletions) are written to
    transcription_problems.csv in the corpus output directory; a summary
    message is printed either way.
    """
    print('Checking utterance transcriptions...')
    split_directory = self.corpus.split_directory()
    model_directory = self.trainer.align_directory
    with mp.Pool(processes=self.corpus.num_jobs) as pool:
        # One job tuple per corpus split; each worker gets (self, job_index).
        jobs = [(self, x) for x in range(self.corpus.num_jobs)]
        results = [pool.apply_async(compile_utterance_train_graphs_func, args=i) for i in jobs]
        output = [p.get() for p in results]
        print('Utterance FSTs compiled!')
        print('Decoding utterances (this will take some time)...')
        results = [pool.apply_async(test_utterances_func, args=i) for i in jobs]
        output = [p.get() for p in results]
        print('Finished decoding utterances!')
        # Map integer word ids back to word strings.
        word_mapping = self.dictionary.reversed_word_mapping
        v = Vocabulary()
        errors = {}
        for job in range(self.corpus.num_jobs):
            text_path = os.path.join(split_directory, 'text.{}'.format(job))
            texts = load_scp(text_path)
            aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job)))
            with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf:
                for utt, line in sorted(aligned_int.items()):
                    # Convert the decoded integer ids to words and persist.
                    text = []
                    for t in line:
                        text.append(word_mapping[int(t)])
                    outf.write('{} {}\n'.format(utt, ' '.join(text)))
                    ref_text = texts[utt]
                    if len(text) < len(ref_text) - 7:
                        # Decoded output is far shorter than the reference:
                        # fall back to a cheap set-difference comparison
                        # instead of a full alignment.
                        insertions = [x for x in text if x not in ref_text]
                        deletions = [x for x in ref_text if x not in text]
                    else:
                        # Global word alignment of reference vs decoded.
                        aligned_seq = Sequence(text)
                        ref_seq = Sequence(ref_text)
                        alignedEncoded = v.encodeSequence(aligned_seq)
                        refEncoded = v.encodeSequence(ref_seq)
                        scoring = SimpleScoring(2, -1)
                        a = GlobalSequenceAligner(scoring, -2)
                        score, encodeds = a.align(refEncoded, alignedEncoded, backtrace=True)
                        insertions = []
                        deletions = []
                        # Gaps on the reference side are insertions in the
                        # decode; gaps on the decoded side are deletions.
                        # NOTE(review): accumulates over ALL optimal
                        # alignments, which can double-count — confirm.
                        for encoded in encodeds:
                            alignment = v.decodeSequenceAlignment(encoded)
                            for i, f in enumerate(alignment.first):
                                s = alignment.second[i]
                                if f == '-':
                                    insertions.append(s)
                                if s == '-':
                                    deletions.append(f)
                    if insertions or deletions:
                        errors[utt] = (insertions, deletions, ref_text, text)
    if not errors:
        message = 'There were no utterances with transcription issues.'
    else:
        # Write one CSV row per problem utterance, worst first.
        out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv')
        with open(out_path, 'w') as problemf:
            problemf.write('Utterance,Insertions,Deletions,Reference,Decoded\n')
            for utt, (insertions, deletions, ref_text, text) in sorted(errors.items(), key=lambda x: -1 * (
                    len(x[1][1]) + len(x[1][2]))):
                problemf.write('{},{},{},{},{}\n'.format(utt, ', '.join(insertions), ', '.join(deletions),
                                                         ' '.join(ref_text), ' '.join(text)))
        message = 'There were {} of {} utterances with at least one transcription issue. '\
                  'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path)
    print(self.transcription_analysis_template.format(message))
# Demo: global alignment of a one-word query against a sentence.
a = Sequence('amazing'.split())
b = Sequence('what a amazing disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
# FIX: converted Python 2 print statements to print() calls, consistent
# with the Python 3 code elsewhere in this file.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)
    print('Percent identity:', alignment.percentIdentity())
    print()

# Second demo: character-level local alignment of a query inside a string.
from alignment.sequence import Sequence, GAP_ELEMENT
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, LocalSequenceAligner

large_string = "thelargemanhatanproject is a great project in themanhattincity"
query_string = "manhattan"

# Create sequences to be aligned.
a = Sequence(large_string)
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
# FIX: converted Python 2 print statements to print() calls, consistent
# with the Python 3 code elsewhere in this file.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)
    print('Percent identity:', alignment.percentIdentity())
    print()