def align(trace1, trace2):
    """Globally align two traces and return both aligned rows as lists.

    The traces are wrapped in ``Sequence`` objects, encoded with one shared
    ``Vocabulary`` and aligned with a ``GlobalSequenceAligner`` (match +2,
    mismatch -1, gap -2).  Every optimal alignment is rendered with ``str()``,
    the text is cut at its midpoint and each half is handed to
    ``align_to_list``.

    Args:
        trace1: elements of the first trace (anything ``Sequence`` accepts).
        trace2: elements of the second trace.

    Returns:
        tuple: ``(s1, s2)`` as produced by ``align_to_list`` for the last
        optimal alignment the aligner reported.
    """
    # Create sequences to be aligned.
    a = Sequence(trace1)
    b = Sequence(trace2)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        # Named `aligned_text` so the local no longer shadows this function.
        aligned_text = str(alignment)

        seq_size = len(aligned_text)
        # Floor division keeps the midpoint an integer on Python 3 as well;
        # true division would hand a float offset to align_to_list.
        half_size = seq_size // 2

        # First half / second half of the rendered alignment.
        # NOTE(review): presumably the two halves are the two aligned rows of
        # the printed alignment -- confirm against align_to_list.
        s1 = align_to_list(aligned_text, 0, half_size, 4)
        s2 = align_to_list(aligned_text, half_size, seq_size, 4)

    # When several optimal alignments exist, the last one wins.
    return s1, s2
def align(s1, s2):
    """Return the words of ``s1`` that align with an identical word in ``s2``.

    Both strings are split on whitespace, encoded with a shared vocabulary and
    globally aligned (match +2, mismatch -1, gap -2).  Only the first optimal
    alignment is inspected.

    Args:
        s1: whitespace-separated text whose matching words are collected.
        s2: whitespace-separated text to compare against.

    Returns:
        list: elements of the ``s1`` sequence whose aligned partner has the
        same vocabulary code, in alignment order.
    """
    # Create sequences to be aligned.
    a = Sequence(s1.split())
    b = Sequence(s2.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # The codes are compared directly, so the alignment is never decoded.
    encoded = encodeds[0]

    correct_words = []
    # Count of non-matching columns whose s1-side code is 0 (presumably the
    # gap code); subtracting it maps a column index back to a position in `a`.
    offset = 0
    for i, (x, y) in enumerate(encoded):
        if x == y:
            correct_words.append(a[i - offset])
        elif x == 0:
            offset += 1
    return correct_words
def align(seq1, seq2):
    """Encode two raw sequences with one shared vocabulary.

    Args:
        seq1: elements of the first sequence.
        seq2: elements of the second sequence.

    Returns:
        tuple: ``(encoded seq1, encoded seq2, vocabulary)``.
    """
    first = Sequence(seq1)
    second = Sequence(seq2)
    vocabulary = Vocabulary()
    # seq1 is encoded first so vocabulary codes are assigned in that order.
    first_encoded = vocabulary.encodeSequence(first)
    second_encoded = vocabulary.encodeSequence(second)
    return first_encoded, second_encoded, vocabulary
def get_labels(self):
    """Label each slot in the sausage ('O' = correct, 'X' = incorrect).

    When ``self.correct()`` is true every slot is 'O'.  Otherwise the
    reference and the best hypothesis are globally aligned (match +2,
    mismatch -1, gap -2) and one label is produced per column of the first
    optimal alignment.
    """
    if self.correct():
        # Everything is correct: no alignment needed.
        return ['O'] * self.num_slots()

    reference = Sequence(self.ref())
    hypothesis = Sequence(self.best_hyp())
    vocab = Vocabulary()
    ref_encoded = vocab.encodeSequence(reference)
    hyp_encoded = vocab.encodeSequence(hypothesis)

    aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, candidates = aligner.align(ref_encoded, hyp_encoded, backtrace=True)
    decoded = vocab.decodeSequenceAlignment(candidates[0])

    def slot_label(ref_item, hyp_item):
        # Identical items are correct; so is a '-' on the reference side
        # paired with an explicit '*DELETE*' on the hypothesis side.
        if ref_item == hyp_item:
            return 'O'
        if ref_item == '-' and hyp_item == '*DELETE*':
            return 'O'
        return 'X'

    return [slot_label(ref_item, hyp_item)
            for ref_item, hyp_item in zip(decoded.first, decoded.second)]
def seqToAlign(a, b, matchScore=3, mismatchScore=-1, gapScore=-2):
    '''
    Helper for finding local alignments between two word lists.

    args:
        a: indexable whose element 0 is the first list of words
        b: indexable whose element 0 is the second list of words
        matchScore: num, score for a matching pair
        mismatchScore: num, score for a mismatching pair
        gapScore: num, score for a gap

    Returns:
        list of (score, first, second) tuples, one per alignment reported by
        the local aligner; first and second are plain lists.
    '''
    # Only the first entry of each argument takes part in the alignment.
    first_words = a[0]
    second_words = b[0]
    first_seq = Sequence(first_words)
    second_seq = Sequence(second_words)

    vocab = Vocabulary()
    first_encoded = vocab.encodeSequence(first_seq)
    second_encoded = vocab.encodeSequence(second_seq)

    aligner = LocalSequenceAligner(
        SimpleScoring(matchScore, mismatchScore), gapScore)
    _, encodeds = aligner.align(first_encoded, second_encoded, backtrace=True)

    results = []
    for encoded in encodeds:
        decoded = vocab.decodeSequenceAlignment(encoded)
        results.append(
            (decoded.score, list(decoded.first), list(decoded.second)))
    return results
def align(sequence1, sequence2):
    """Globally align two word sequences and return the first alignment.

    Literal '-' words are replaced by '<DASH />' before aligning because the
    aligner uses the dash as its gap element.  The placeholder is not
    converted back in the returned alignment.

    Returns:
        The decoded first optimal alignment (match +2, mismatch -1, gap -2).
    """
    def protect_dashes(words):
        return ['<DASH />' if word == '-' else word for word in words]

    sequence1 = protect_dashes(sequence1)
    sequence2 = protect_dashes(sequence2)

    first = Sequence(sequence1)
    second = Sequence(sequence2)

    vocab = Vocabulary()
    first_encoded = vocab.encodeSequence(first)
    second_encoded = vocab.encodeSequence(second)

    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, candidates = aligner.align(first_encoded, second_encoded, backtrace=True)

    return vocab.decodeSequenceAlignment(candidates[0])
def score_align(x, y):
    """Return a dissimilarity in [0, 1] style units for two sequences.

    The sequences are globally aligned (match +2, mismatch -1, gap -2); the
    result is ``1 - best / 100`` where ``best`` is the highest percent
    identity among the optimal alignments (0.0 when there are none).
    """
    first = Sequence(x)
    second = Sequence(y)
    vocab = Vocabulary()
    first_encoded = vocab.encodeSequence(first)
    second_encoded = vocab.encodeSequence(second)

    aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
    _, candidates = aligner.align(first_encoded, second_encoded, backtrace=True)

    identities = [vocab.decodeSequenceAlignment(candidate).percentIdentity()
                  for candidate in candidates]
    # The leading 0.0 is the floor used when no alignment beats it.
    best_identity = max([0.0] + identities)
    return 1 - best_identity / 100.0
def align_sequences(seq_a, seq_b):
    """Return the first optimal encoded alignment of two sequences.

    Uses a strict global aligner with match +3, mismatch -1 and gap -2.  The
    result is left in encoded form (it is not decoded through the vocabulary).
    """
    def escape_dashes(items):
        # A bare '-' must be escaped because the alignment library uses it
        # as a gap marker.
        return ['\\-' if item == '-' else item for item in items]

    protected_a = escape_dashes(seq_a)
    protected_b = escape_dashes(seq_b)

    vocabulary = Vocabulary()
    first = vocabulary.encodeSequence(Sequence(protected_a))
    second = vocabulary.encodeSequence(Sequence(protected_b))

    aligner = StrictGlobalSequenceAligner(
        SimpleScoring(matchScore=3, mismatchScore=-1), gapScore=-2)
    _, alignments = aligner.align(first, second, backtrace=True)
    return alignments[0]
def match_word_sorted(code1, code2):
    """Return the max scored alignment between the two input codes.

    Each code is split on single spaces.  For every distinct non-empty word
    occurring in either code, ``index_word_pairs`` supplies entries for each
    code; the entries are sorted by their second field and the two sorted
    lists are globally aligned using MATCH_SCORE, MISMATCH_SCORE and
    GAP_SCORE.

    Returns:
        The highest score among the optimal alignments, or 0 when no
        alignment scores above 0.
    """
    list1 = code1.split(" ")
    list2 = code2.split(" ")

    # Despite the name this is the UNION of both word sets.
    common_words = set(list1) | set(list2)
    # discard() drops the empty token (left by repeated spaces) without
    # needing a catch-all except clause.
    common_words.discard("")

    words1 = []
    words2 = []
    for word in common_words:
        words1 += index_word_pairs(word, list1)
        words2 += index_word_pairs(word, list2)

    sorted1 = sorted(words1, key=lambda t: t[1])
    sorted2 = sorted(words2, key=lambda t: t[1])

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)

    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)

    max_score = 0
    for encoded in encoders:
        alignment = v.decodeSequenceAlignment(encoded)
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
def match_word_sorted(code1, code2):
    """Return the max scored alignment between the two input codes.

    Only the non-empty words present in BOTH codes are kept.  They are
    ordered once by first occurrence in ``code1`` and once by first
    occurrence in ``code2``; the two orderings are globally aligned using
    MATCH_SCORE, MISMATCH_SCORE and GAP_SCORE.

    Returns:
        The highest score among the optimal alignments, or 0 when no
        alignment scores above 0.
    """
    list1 = code1.split(" ")
    list2 = code2.split(" ")

    common_words = set(list1) & set(list2)
    # discard() drops the empty token (left by repeated spaces) without
    # needing a catch-all except clause.
    common_words.discard("")

    # First-occurrence indices are unique per word, so each ordering is fully
    # determined.  Plain lists are built (rather than dict key views) so the
    # sequences stay indexable on Python 3.
    sorted1 = sorted(common_words, key=list1.index)
    sorted2 = sorted(common_words, key=list2.index)

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)

    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)

    max_score = 0
    for encoded in encoders:
        alignment = v.decodeSequenceAlignment(encoded)
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
def ScorePhonemes(self, source=[], target=[]):
    """Compare the phonemes of a source and target sentence and determine
    which of the target items were correctly transcribed.

    Args:
        source: per-sentence phoneme entries of the source; falls back to
            ``self.source_phonemes`` when empty.
        target: per-sentence phoneme entries of the target; falls back to
            ``self.target_phonemes`` when empty.
            Each non-empty sentence is unzipped into three parallel tuples
            (phoneme, word number, word).

    Results are stored on the instance; nothing is returned:
        self.hits_phonemes (nested list): list of bools corresponding to the
            accuracy of each phoneme in the target list for each sentence
        self.source_matched (nested list): the source phonemes placed at the
            target positions they were aligned to, per sentence

    Note:
        This scoring method has no word accuracy awareness. Phonemes from
        correctly input words may wind up as labeled wrong
        (i.e. target: "with the" source: "with a" alignment: )

        Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
    """
    # NOTE(review): the list defaults are shared between calls; they are only
    # read here, never mutated.
    if not source:
        source = self.source_phonemes
    if not target:
        target = self.target_phonemes
    self.source_matched = []
    hits = []
    for x, ttup in enumerate(target):
        tphon, twordnum, tword = zip(*ttup)
        stup = source[x]
        if not stup:
            # No source entry for this sentence: every target phoneme is a
            # miss and the matched output is all '-'.
            hitlist = [False] * len(tphon)
            bPhonOut = ['-'] * len(tphon)
        else:
            sphon, swordnum, sword = zip(*stup)
            # Create sequences to be aligned.
            a = Sequence(tphon)
            b = Sequence(sphon)
            # Create a vocabulary and encode the sequences.
            v = Vocabulary()
            aEncoded = v.encodeSequence(a)
            bEncoded = v.encodeSequence(b)
            # Create a scoring and align the sequences using global aligner.
            scoring = SimpleScoring(2, -1)
            aligner = GlobalSequenceAligner(scoring, -2)
            score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
            # Only the first optimal alignment is scored.
            encoded = encodeds[0]
            # Score based only on hits vs misses, insertions are ignored
            # NOTE(review): the indexing below assumes the (modified) aligner
            # returns array-like alignments supporting boolean masks -- confirm.
            notInsert = encoded[:][0] != 0
            nonInsertMatched = encoded[notInsert][:]
            # Find the alignment in the target sequence
            aSeq = nonInsertMatched[:][0]
            bSeq = nonInsertMatched[:][1]
            # Label all items not aligned to the target as false
            hitlist = []
            y = 0
            # Slide over the encoded target until a window whose element-wise
            # difference to aSeq sums to zero is found; y keeps that offset
            # (or the last offset tried when no window qualifies).
            for y in range(0, len(aEncoded) - len(aSeq) + 1):
                aChunk = aEncoded[y:y + len(aSeq)]
                #print aChunk
                if sum(aChunk - aSeq) == 0:
                    break
            hitlist.extend([False] * (y))
            hitlist.extend(list(aSeq - bSeq == 0))
            hitlist.extend([False] * (len(aEncoded) - y - len(aSeq)))
            # Export the target aligned phonemes of the source sequence
            # Positions left at code 0 decode to vocabulary element 0.
            bPhons = np.zeros(len(aEncoded), int)
            bPhons[y:y + len(bSeq)] = bSeq
            bPhonOut = np.array(v.elements())[bPhons].tolist()
        hits.append(hitlist)
        self.source_matched.append(bPhonOut)
    self.hits_phonemes = hits
def ScoreWords(self):
    """Aligns the words of the source sentence to match the target sentence
    to determine hit vs missed words.

    Reads ``self.target`` and ``self.source`` (parallel collections of
    whitespace-separated sentences).  Results are stored on the instance;
    nothing is returned:
        self.hits (nested list): per sentence, one bool per target word
        self.wscore (numpy array): per sentence, percentage of hit words
        self.source_matchWords (nested list): the source words placed at the
            target positions they were aligned to, per sentence

    Note:
        Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
    """
    target = self.target
    source = self.source
    self.source_matchWords = []
    hits = []
    wscore = np.empty(0)
    for tnum, tsent in enumerate(target):
        ssent = source[tnum]
        # Create sequences to be aligned.
        a = Sequence(tsent.split())
        b = Sequence(ssent.split())
        # Create a vocabulary and encode the sequences.
        v = Vocabulary()
        aEncoded = v.encodeSequence(a)
        bEncoded = v.encodeSequence(b)
        # Create a scoring and align the sequences using global aligner.
        # (match +5, mismatch -1, gap -1)
        scoring = SimpleScoring(5, -1)
        aligner = GlobalSequenceAligner(scoring, -1)
        score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
        # Only the first optimal alignment is scored.
        encoded = encodeds[0]
        # Score based only on hits vs misses, insertions are ignored
        # NOTE(review): the indexing below assumes the (modified) aligner
        # returns array-like alignments supporting boolean masks -- confirm.
        notInsert = encoded[:][0] != 0
        nonInsertMatched = encoded[notInsert][:]
        # Find the alignment in the target sequence
        aSeq = nonInsertMatched[:][0]
        bSeq = nonInsertMatched[:][1]
        # Label all items not aligned to the target as false
        hitlist = []
        x = 0
        # Slide over the encoded target until a window whose element-wise
        # difference to aSeq sums to zero is found; x keeps that offset
        # (or the last offset tried when no window qualifies).
        for x in range(0, len(aEncoded) - len(aSeq) + 1):
            aChunk = aEncoded[x:x + len(aSeq)]
            #print aChunk
            if sum(aChunk - aSeq) == 0:
                break
        hitlist.extend([False] * (x))
        hitlist.extend(list(aSeq - bSeq == 0))
        hitlist.extend([False] * (len(aEncoded) - x - len(aSeq)))
        # Export the target aligned words of the source sequence
        # Positions left at code 0 decode to vocabulary element 0.
        bWords = np.zeros(len(aEncoded), int)
        bWords[x:x + len(bSeq)] = bSeq
        bWordOut = np.array(v.elements())[bWords].tolist()
        hits.append(hitlist)
        # Percentage of hit words; divides by zero if the hit list is empty.
        iwscore = sum(hitlist) * 100 / float(len(hitlist))
        wscore = np.hstack([wscore, iwscore])
        # Python 2 print statement.
        print bWordOut
        self.source_matchWords.append(bWordOut)
    self.hits = hits
    self.wscore = wscore
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from ebl.alignment.domain.sequence import NamedSequence
from ebl.tests.factories.fragment import FragmentFactory

# Raw sign text used by both tests, and the token sequence it is expected to
# produce once encoded.
signs = "X X ABZ001\nABZ002\nX X X\n"
sequence = Sequence(["ABZ001", "#", "ABZ002", "#", "#"])


def test_of_signs() -> None:
    """of_signs stringifies the name and encodes the signs as `sequence`."""
    shared_vocabulary = Vocabulary()
    identifier = 1234

    result = NamedSequence.of_signs(identifier, signs, shared_vocabulary)
    # Encoded after the call under test, so the vocabulary already holds
    # whatever codes the call assigned.
    expected_sequence = shared_vocabulary.encodeSequence(sequence)

    assert result.name == str(identifier)
    assert result.sequence == expected_sequence


def test_of_fragment() -> None:
    """of_fragment names the result after the fragment number."""
    shared_vocabulary = Vocabulary()
    built_fragment = FragmentFactory.build(signs=signs)

    result = NamedSequence.of_fragment(built_fragment, shared_vocabulary)
    expected_sequence = shared_vocabulary.encodeSequence(sequence)

    assert result.name == str(built_fragment.number)
    assert result.sequence == expected_sequence
# Collect every header row (first cell starts with '[') : its text without the
# surrounding brackets goes to `name`, its row index to `spilt_pos`.
for i in range( len(sequence_family)):
    if sequence_family[i][0][0]=='[':
        name.append(sequence_family[i][0][1:-1])
        spilt_pos.append(i)
# Each record is the concatenation of the 8 rows that follow its header row.
sequence=[]
for i in spilt_pos:
    ss=sequence_family[i+1][0]
    for ii in range(i+2,i+9):
        ss=ss+sequence_family[ii][0]
    sequence.append(ss)
#%%
# Encode every record with one shared vocabulary.
v = Vocabulary()
sequence_encoded=[]
for i in range(len(sequence)):
    sequence_encoded.append(v.encodeSequence(Sequence(split_sequence(sequence[i]))))
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
# NOTE(review): Matrix is allocated (hard-coded 9x9) but never written to in
# this section -- presumably meant to hold the pairwise scores.
Matrix=np.zeros(9*9).reshape(9,9)
# Globally align every unordered pair (i < j) of records.
for i in range(len(sequence_encoded)):
    for j in range(i+1,len(sequence_encoded)):
        score, encodeds = aligner.align(sequence_encoded[i], sequence_encoded[j], backtrace=True)
        for encoded in encodeds:
            alignment = v.decodeSequenceAlignment(encoded)
            # Overwrites the aligner score with
            # floor((100 - percent identity) * alignment length / 100).
            score=np.floor((100-alignment.percentIdentity())* len(np.array(alignment))/100)
        # NOTE(review): original indentation was lost; the print is assumed to
        # run once per pair, after the loop over optimal alignments.
        print(i,'-',j,':',score)
# Three global alignments through `nw`; the third overrides the gap-open and
# gap-extend penalties.
reto = nw.global_align("CEELECANTH", "PELICAN")
reto2 = nw.global_align("(Westf.), Grevener", "††††††(Westf.), Grevener")
reto3 = nw.global_align("(Westf.), Grevener", "††††††(Westf.), Grevener", gap_open=-5, gap_extend=-2)
#import seqanpy
#print(seqanpy.align_global('ACCGGT', 'CCG'))
# Same comparison, word-based, with the `alignment` package.
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
# Create sequences to be aligned.
# The first pair is immediately replaced by the second pair below.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())
a = Sequence("(Westf.), Grevener".split())
b = Sequence("††††††(Westf.), Grevener".split())
# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
# Create a scoring and align the sequences using global aligner.
# (match +2, mismatch -1, gap -2)
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
# Iterate over optimal alignments and print them.
def make_sequence(string: str) -> Sequence:
    """Turn raw text into a ``Sequence`` of space-separated tokens.

    Line breaks are replaced first, then every UNCLEAR_OR_UNKNOWN_SIGN becomes
    a single space, runs of spaces are collapsed and the result is split on
    single spaces.
    """
    without_line_breaks = replace_line_breaks(string)
    without_unclear_signs = without_line_breaks.replace(
        UNCLEAR_OR_UNKNOWN_SIGN, " ")
    tokens = collapse_spaces(without_unclear_signs).split(" ")
    return Sequence(tokens)
if sequence_family[i][0][0] == '[': name.append(sequence_family[i][0][1:-1]) spilt_pos.append(i) sequence = [] for i in spilt_pos: ss = sequence_family[i + 1][0] for ii in range(i + 2, i + 9): ss = ss + sequence_family[ii][0] sequence.append(ss) #%% v = Vocabulary() sequence_encoded = [] for i in range(len(sequence)): sequence_encoded.append( v.encodeSequence(Sequence(split_sequence(sequence[i])))) scoring = SimpleScoring(2, -1) aligner = GlobalSequenceAligner(scoring, -2) Matrix = np.zeros(9 * 9).reshape(9, 9) for i in range(len(sequence_encoded)): for j in range(i + 1, len(sequence_encoded)): score, encodeds = aligner.align(sequence_encoded[i], sequence_encoded[j], backtrace=True) for encoded in encodeds: alignment = v.decodeSequenceAlignment(encoded) score = np.floor((100 - alignment.percentIdentity()) * len(np.array(alignment)) / 100) print(i, j, score)
from fastamasta import FastaReader
from sequence import hamming_dist

# Script entry point: globally align the first two records of data/31.fas.
if __name__ == "__main__":
    data = list(FastaReader("data/31.fas"))
    # Element 1 of each record is used as the sequence text.
    a, b = data[0][1], data[1][1]

    # Pad the shorter string with '-'.
    # NOTE(review): the "+1" makes the padded string one longer than the
    # other, so when the first branch fires the second fires too -- confirm
    # this cascade is intended.
    if len(a) > len(b):
        b += '-' * ((len(a) - len(b)) + 1)
    if len(b) > len(a):
        a += '-' * ((len(b) - len(a)) + 1)

    # Create sequences to be aligned (one element per character).
    a = Sequence(list(a))
    b = Sequence(list(b))

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    # (match +3, mismatch -1, gap -2)
    scoring = SimpleScoring(3, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # Iterate over optimal alignments and decode them.
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
def test_utterance_transcriptions(self):
    """Decode every utterance and report where it departs from its reference.

    Steps:
      1. Run ``compile_utterance_train_graphs_func`` and then
         ``test_utterances_func`` once per corpus job in a process pool.
      2. For each job, map the integer output in ``aligned.<job>.int`` back
         to words, write it to ``aligned.<job>`` and compare it with the
         reference text from ``text.<job>``.
      3. Write all utterances with insertions or deletions to
         ``transcription_problems.csv`` in the corpus output directory,
         most insertions + deletions first, and print a summary.
    """
    print('Checking utterance transcriptions...')

    split_directory = self.corpus.split_directory()
    model_directory = self.trainer.align_directory
    with mp.Pool(processes=self.corpus.num_jobs) as pool:
        jobs = [(self, x) for x in range(self.corpus.num_jobs)]
        results = [pool.apply_async(compile_utterance_train_graphs_func, args=i) for i in jobs]
        # get() blocks until each job has finished; the values are not used.
        for p in results:
            p.get()
        print('Utterance FSTs compiled!')
        print('Decoding utterances (this will take some time)...')
        results = [pool.apply_async(test_utterances_func, args=i) for i in jobs]
        for p in results:
            p.get()
        print('Finished decoding utterances!')

    word_mapping = self.dictionary.reversed_word_mapping
    # One vocabulary is shared by every utterance.
    v = Vocabulary()
    errors = {}

    for job in range(self.corpus.num_jobs):
        text_path = os.path.join(split_directory, 'text.{}'.format(job))
        texts = load_scp(text_path)
        aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job)))
        with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf:
            for utt, line in sorted(aligned_int.items()):
                text = [word_mapping[int(t)] for t in line]
                outf.write('{} {}\n'.format(utt, ' '.join(text)))
                ref_text = texts[utt]
                if len(text) < len(ref_text) - 7:
                    # Decoded text is more than 7 words shorter than the
                    # reference: skip alignment and compare membership only.
                    insertions = [x for x in text if x not in ref_text]
                    deletions = [x for x in ref_text if x not in text]
                else:
                    aligned_seq = Sequence(text)
                    ref_seq = Sequence(ref_text)

                    alignedEncoded = v.encodeSequence(aligned_seq)
                    refEncoded = v.encodeSequence(ref_seq)
                    scoring = SimpleScoring(2, -1)
                    a = GlobalSequenceAligner(scoring, -2)
                    score, encodeds = a.align(refEncoded, alignedEncoded, backtrace=True)
                    insertions = []
                    deletions = []
                    # Gaps are collected over every optimal alignment, so a
                    # word may be listed more than once.
                    for encoded in encodeds:
                        alignment = v.decodeSequenceAlignment(encoded)
                        for i, f in enumerate(alignment.first):
                            s = alignment.second[i]
                            # '-' on the reference side: decoded word is extra.
                            if f == '-':
                                insertions.append(s)
                            # '-' on the decoded side: reference word is missing.
                            if s == '-':
                                deletions.append(f)
                if insertions or deletions:
                    errors[utt] = (insertions, deletions, ref_text, text)

    if not errors:
        message = 'There were no utterances with transcription issues.'
    else:
        out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv')
        # Worst utterances first.  Tuple slots 0 and 1 are insertions and
        # deletions; slots 1 and 2 would rank by reference length instead.
        by_issue_count = sorted(
            errors.items(),
            key=lambda x: -1 * (len(x[1][0]) + len(x[1][1])))
        with open(out_path, 'w') as problemf:
            problemf.write('Utterance,Insertions,Deletions,Reference,Decoded\n')
            # NOTE(review): fields are not CSV-quoted, so the ', ' separators
            # inside the insertion/deletion lists spill into extra columns.
            for utt, (insertions, deletions, ref_text, text) in by_issue_count:
                problemf.write('{},{},{},{},{}\n'.format(utt, ', '.join(insertions), ', '.join(deletions),
                                                         ' '.join(ref_text), ' '.join(text)))
        message = 'There were {} of {} utterances with at least one transcription issue. '\
                  'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path)

    print(self.transcription_analysis_template.format(message))
########## SIMPLEST ####### import regex regex.search(r'\b(amazing){e<2}\s', 'is life amazing lie ao a') from alignment.sequence import Sequence from alignment.vocabulary import Vocabulary from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner # Create sequences to be aligned. a = Sequence('amazing'.split()) b = Sequence('what a amazing disappointingly bad day'.split()) # Create a vocabulary and encode the sequences. v = Vocabulary() aEncoded = v.encodeSequence(a) bEncoded = v.encodeSequence(b) # Create a scoring and align the sequences using global aligner. scoring = SimpleScoring(2, -1) aligner = GlobalSequenceAligner(scoring, -2) score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True) # Iterate over optimal alignments and print them. for encoded in encodeds: alignment = v.decodeSequenceAlignment(encoded) print alignment print 'Alignment score:', alignment.score print 'Percent identity:', alignment.percentIdentity() print
speakers.update(bnc_cache[bnc_code][0]) _, recording_data, transcripts = bnc_cache[bnc_code] transcript = transcripts[r_code] try: tg = TextGrid(strict=False) tg.read(tg_path) except Exception as e: print(out_path) print(e) continue word_tier = tg.getFirst('word') #print([x.mark for x in word_tier]) phone_tier = tg.getFirst('phone') trans_ind = 0 prev_oov = False a = Sequence([x[0] for x in transcript]) b = Sequence([x.mark for x in word_tier]) # Create a vocabulary and encode the sequences. v = Vocabulary() aEncoded = v.encodeSequence(a) bEncoded = v.encodeSequence(b) # Create a scoring and align the sequences using global aligner. scoring = SimpleScoring(2, -1) aligner = GlobalSequenceAligner(scoring, -2) score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True) # Iterate over optimal alignments and print them. for encoded in encodeds: alignment = v.decodeSequenceAlignment(encoded)
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
from alignment.profile import Profile
from alignment.profilealigner import SoftScoring, GlobalProfileAligner
# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())
# The print statements in this script are Python 2 syntax.
print 'Sequence A:', a
print 'Sequence B:', b
print
# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
print 'Encoded A:', aEncoded
print 'Encoded B:', bEncoded
print
# Create a scoring and align the sequences using global aligner.
# (match +2, mismatch -1, gap -2)
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, alignments = aligner.align(aEncoded, bEncoded, backtrace=True)
# Create sequence profiles out of alignments.
# NOTE(review): under Python 2 the comprehension variable `a` leaks and
# rebinds the module-level `a` defined above.
profiles = [Profile.fromSequenceAlignment(a) for a in alignments]
# Decode each profile through the vocabulary and print it.
for encoded in profiles:
    profile = v.decodeProfile(encoded)
    print profile
pdist = pairwise_distances(embed, metric="cosine") triu_inds = np.triu_indices_from(pdist, k=1) all_path_dists = pdist[triu_inds] med = np.median(all_path_dists) # %% [markdown] # ## # from skbio.sequence import Sequence from alignment.sequence import Sequence from alignment.vocabulary import Vocabulary seqs = [] for p in paths: s = Sequence(p) seqs.append(s) v = Vocabulary() encoded_seqs = [v.encodeSequence(s) for s in seqs] class SimpleScoring: def __init__(self, matchScore, mismatchScore): self.matchScore = matchScore self.mismatchScore = mismatchScore def __call__(self, firstElement, secondElement): if firstElement == secondElement: return self.matchScore else:
def text_to_text_alignment_and_score(text_ref, text_pred):
    """
    Find a word to word alignment between two texts, considering the first is
    the reference and the second the predicted.

    Both texts are lower-cased; the reference is additionally passed through
    a translation table built from "." and ",".  The first alignment returned
    by the aligner is decoded and then padded at both ends with the words of
    either text that lie outside it (paired with '-').

    :param text_ref: text reference
    :param text_pred: predicted text
    :return: ``(alignment, rec, pre)`` where ``rec`` and ``pre`` are the
        alignment score as a percentage of the reference and predicted word
        counts; ``([], 0, 0)`` when the first alignment is empty
    """
    text_ref = text_ref.lower()
    text_pred = text_pred.lower()

    iterable = [".", ","]
    # convert the reference text in order not to contain , and . (junk characters)
    translation_map = str.maketrans(to_translation_map(iterable))
    text_ref = text_ref.translate(translation_map)

    ref_words = text_ref.split()
    pred_words = text_pred.split()

    # Create sequences to be aligned.
    a = Sequence(ref_words)
    b = Sequence(pred_words)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    a_enc = v.encodeSequence(a)
    b_enc = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(1, 0)
    aligner = GlobalSequenceAligner(scoring, 0)
    # NOTE(review): this aligner takes the raw word lists as well and returns
    # three values -- presumably a project-modified aligner; confirm.
    f, score, encodeds = aligner.align(a_enc, b_enc, text_ref.split(), text_pred.split(), backtrace=True)

    # get the first alignment if exists:
    print(encodeds)
    if len(encodeds[0]) > 0:
        alignment = v.decodeSequenceAlignment(encodeds[0])
        print(alignment)

        # fix first and last missing words of asr text
        # Leading predicted words that precede the alignment.
        list_asr = []
        list_pred = []
        for word in pred_words:
            if word != alignment.second.elements[0]:
                list_asr.append(word)
                list_pred.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break

        # Trailing predicted words that follow the alignment.
        list_asr = []
        list_pred = []
        for word in reversed(pred_words):
            if word != alignment.second.elements[-1]:
                list_asr = [word] + list_asr
                list_pred.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break

        # fix first and last missing words of reference text
        # Leading reference words that precede the alignment.
        list_asr = []
        list_pred = []
        for word in ref_words:
            if word != alignment.first.elements[0]:
                list_pred.append(word)
                list_asr.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break

        # Trailing reference words that follow the alignment.
        list_asr = []
        list_pred = []
        for word in reversed(ref_words):
            if word != alignment.first.elements[-1]:
                # Prepend to the reference-side list itself; building it from
                # list_asr mixed '-' placeholders into the reference words
                # and dropped all but the last collected word.
                list_pred = [word] + list_pred
                list_asr.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break

        print(alignment)
        rec = alignment.score * 100 / len(ref_words)
        pre = alignment.score * 100 / len(pred_words)
    else:
        alignment = []
        rec, pre = 0, 0
    return alignment, rec, pre
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
# Create sequences to be aligned.
# Tokens are the strings '1' and '2', not integers.
a = Sequence('1 1'.split())
b = Sequence('1 2 2 2'.split())
# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
# Create a scoring and align the sequences using global aligner.
# (match +2, mismatch -1, gap -2)
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
# Iterate over optimal alignments and print them.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)
    print('Percent identity:', alignment.percentIdentity())
    print()
# Second experiment: character lists for the dtpattern aligners.
from dtpattern.alignment.align3 import Needleman, Hirschberg
seqa = list('1112')
seqb = list('1222')
# Align using Needleman-Wunsch algorithm.