def seqToAlign(a, b, matchScore=3, mismatchScore=-1, gapScore=-2):
    '''
    args:
        a: list whose first element is a list of words
        b: list whose first element is a list of words
        matchScore: num
        mismatchScore: num
        gapScore: num
    Returns:
        list of (score, first alignment, second alignment) tuples for the top alignments
    Description:
        helper function for finding alignments given lists of words
    '''
    # Create a vocabulary and encode the sequences.
    a = a[0]
    b = b[0]
    seq1 = Sequence(a)
    seq2 = Sequence(b)
    v = Vocabulary()
    aEncoded = v.encodeSequence(seq1)
    bEncoded = v.encodeSequence(seq2)

    # Create a scoring and align the sequences using local aligner.
    scoring = SimpleScoring(matchScore, mismatchScore)
    aligner = LocalSequenceAligner(scoring, gapScore)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    alignments = [v.decodeSequenceAlignment(encoded) for encoded in encodeds]

    return [(alignment.score, list(alignment.first), list(alignment.second)) for alignment in alignments]
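A minimal usage sketch (an assumption, not from the original source): seqToAlign expects each argument to be a list whose first element is the word list, matching the a[0]/b[0] unpacking above, and it relies on the alignment package imports shown here.

from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, LocalSequenceAligner

# Hypothetical inputs: each argument wraps its word list in an outer list.
left = [['what', 'a', 'beautiful', 'day']]
right = [['what', 'a', 'disappointingly', 'bad', 'day']]
for score, first, second in seqToAlign(left, right):
    print(score, first, second)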
Example #2
	def align(self, seq1, seq2):
		"""
		Goal: Align seq2 (automatically detected conditions) with seq1 (truth conditions) and return the best alignment
		"""
		print("len(truth_conditions) = {}, len(detected_conditions) = {}".format(len(seq1), len(seq2)))
		from alignment.sequence import Sequence
		from alignment.vocabulary import Vocabulary
		from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

		# Create sequences to be aligned.
		a = Sequence(seq1)
		b = Sequence(seq2)

		# Create a vocabulary and encode the sequences.
		v = Vocabulary()
		aEncoded = v.encodeSequence(a)
		bEncoded = v.encodeSequence(b)

		# Create a scoring and align the sequences using global aligner.
		scoring = SimpleScoring(2, -1)
		aligner = GlobalSequenceAligner(scoring, -2)
		score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

		# Iterate over optimal alignments and print them.
		for encoded in encodeds:
			alignment = v.decodeSequenceAlignment(encoded)
			print(alignment)
			print('Alignment score:', alignment.score)
			print('Percent identity:', alignment.percentIdentity())
			assert(alignment.percentIdentity() >= 97.0)
			first, second = list(alignment.first), list(alignment.second)
			break
		return second
Example #3
def align(s1, s2):
    # Create sequences to be aligned.
    a = Sequence(s1.split())
    b = Sequence(s2.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    encoded = encodeds[0]
    alignment = v.decodeSequenceAlignment(encoded)
    correct_words = []
    offset = 0
    for i, (x, y) in enumerate(encoded):
        if x == y:
            correct_words.append(a[i - offset])
        elif x == 0:
            offset += 1

    return correct_words
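A hedged usage sketch; the exact word list returned depends on which optimal alignment the library reports first.

# Hypothetical inputs; words of s1 that align to identical words in s2 are returned.
matched = align('what a beautiful day', 'what a disappointingly bad day')
print(matched)  # e.g. ['what', 'a', 'day']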
Example #4
def align_ref_long(hyp, ref):
    ''' Aligns a ref to a sausage-aligned hyp
        using the alignment library '''
    
    # align ref to hyp
    sr = Sequence(ref)
    sh = Sequence(hyp)
    
    v = Vocabulary()
    rEncoded = v.encodeSequence(sr)
    hEncoded = v.encodeSequence(sh)
    
    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = StrictGlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(hEncoded, rEncoded, backtrace=True)
    

    # Iterate over optimal alignments and print them.
    alignment = v.decodeSequenceAlignment(encodeds[0])
    ref_align_raw = [token[0] for token in alignment if token[0] != '-']
    
    ref_align  = []
    for token in ref_align_raw:
        if token == '-':
            ref_align.append(delete_token)
        else:
            ref_align.append(token)
            
    for i in range(len(hyp) - len(ref_align_raw)):
        ref_align.append(delete_token)
        
    return ref_align
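A hedged usage sketch; delete_token is defined elsewhere in the original module, so the value below is an assumption, and the alignment imports from the other examples are needed.

delete_token = '*DELETE*'  # assumed placeholder; the original module defines the real value
hyp = 'the cat sat on the mat'.split()
ref = 'the cat on the mat'.split()
print(align_ref_long(hyp, ref))  # ref tokens padded with delete_token toward the hyp length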
Example #5
def align(s1, s2):
    # Create sequences to be aligned.
    a = Sequence(s1.split())
    b = Sequence(s2.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    encoded = encodeds[0]
    alignment = v.decodeSequenceAlignment(encoded)
    correct_words = []
    offset = 0
    for i, (x, y) in enumerate(encoded):
        if x == y:
            correct_words.append(a[i - offset])
        elif x == 0:
            offset += 1

    return correct_words
Example #6
def align(sequence1, sequence2):

    # Escape literal dashes because the aligner uses the dash as a gap element
    sequence1 = ['<DASH />' if word == '-' else word for word in sequence1]
    sequence2 = ['<DASH />' if word == '-' else word for word in sequence2]

    # Create sequences to be aligned.
    a = Sequence(sequence1)
    b = Sequence(sequence2)

    #print(22)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    #print(33)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)

    #print(99)

    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    #print(34)

    # Create alignment object and return it
    alignment = v.decodeSequenceAlignment(encodeds[0])
    return alignment
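A usage sketch with hypothetical inputs: literal '-' tokens are escaped to '<DASH />' so they are not confused with the gap element.

seq1 = ['a', '-', 'b', 'c']
seq2 = ['a', 'b', 'c']
alignment = align(seq1, seq2)
print(alignment)
print('Alignment score:', alignment.score)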
    def get_labels(self):
        """label each slot in the sausage (O=correct X=incorrect)"""
        if self.correct():
            # everything is correct
            return ['O'] * self.num_slots()

        # align the ref and the best hyp
        a = Sequence(self.ref())
        b = Sequence(self.best_hyp())
        v = Vocabulary()
        aEncoded = v.encodeSequence(a)
        bEncoded = v.encodeSequence(b)
        scoring = SimpleScoring(2, -1)
        aligner = StrictGlobalSequenceAligner(scoring, -2)
        score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
        alignment = v.decodeSequenceAlignment(encodeds[0])

        # get labels according to alignment
        labels = []
        for a, b in zip(alignment.first, alignment.second):
            if a == b or (a == '-' and b == '*DELETE*'):
                labels.append('O')
            else:
                labels.append('X')
        return labels
def align(trace1, trace2):

    # Create sequences to be aligned.
    a = Sequence(trace1)
    b = Sequence(trace2)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # Iterate over optimal alignments and keep the last one as a string.
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        align_str = str(alignment)

    # Convert the aligned sequences into lists,
    # escaping multiple characters

    seq_size = len(align_str)
    half_size = seq_size // 2

    # First half
    s1 = align_to_list(align_str, 0, half_size, 4)

    # Second half
    s2 = align_to_list(align_str, half_size, seq_size, 4)

    # return the lists as result
    return s1, s2
Example #10
def align_pair(
    first: NamedSequence,
    second: NamedSequence,
    vocabulary: Vocabulary,
) -> AlignmentResult:
    scoring = EblScoring(vocabulary)
    aligner = GlobalSequenceAligner(scoring, True)
    score, alignments = aligner.align(first.sequence, second.sequence, backtrace=True)
    return AlignmentResult(
        score,
        first,
        second,
        [vocabulary.decodeSequenceAlignment(encoded) for encoded in alignments],
    )
def score_align(x, y):
    a = Sequence(x)
    b = Sequence(y)
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    pI = 0.0
    for e in encodeds:
        alignment = v.decodeSequenceAlignment(e)
        pI = max(pI, alignment.percentIdentity())
    return 1 - pI/100.0
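score_align turns the best alignment's percent identity into a distance in [0, 1]: 0.0 for identical sequences, larger for less similar ones. A quick sketch with hypothetical inputs:

print(score_align('a b c'.split(), 'a b c'.split()))  # 0.0 for identical sequences
print(score_align('a b c'.split(), 'a b d'.split()))  # larger value for less similar sequences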
Example #13
    def align(self, word, error):
        vocab = Vocabulary()
        a = vocab.encodeSequence(Sequence(word))
        b = vocab.encodeSequence(Sequence(error))
        score, encodings = self.aligner.align(a, b, backtrace=True)
    
        # Choose the highest-score alignment.
        score = -sys.maxsize
        best_alignment = None
        for encoding in encodings:
            alignment = vocab.decodeSequenceAlignment(encoding)
            if alignment.score > score:
                best_alignment = alignment
                score = alignment.score

        return best_alignment.first, best_alignment.second
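The method above depends on self.aligner built elsewhere in its class; below is a compressed standalone sketch of the same pick-the-best-alignment pattern, with an assumed StrictGlobalSequenceAligner configuration (the class name and scores are assumptions).

from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, StrictGlobalSequenceAligner

class SpellingAligner:
    def __init__(self):
        # Assumed scoring; the original class configures self.aligner elsewhere.
        self.aligner = StrictGlobalSequenceAligner(SimpleScoring(2, -1), -2)

    def align(self, word, error):
        vocab = Vocabulary()
        a = vocab.encodeSequence(Sequence(word))
        b = vocab.encodeSequence(Sequence(error))
        score, encodings = self.aligner.align(a, b, backtrace=True)
        # Keep the highest-scoring decoded alignment.
        best = max((vocab.decodeSequenceAlignment(e) for e in encodings),
                   key=lambda al: al.score)
        return best.first, best.second

first, second = SpellingAligner().align(list('receive'), list('recieve'))
print(first, second)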
Example #14
def getAlignment(timit, utterance):
    tim = list()
    for li in timit:
        for ph in li:
            tim.append(ph)
    a = Sequence(tim)
    b = Sequence(utterance)
    v = Vocabulary()
    aEnc = v.encodeSequence(a)
    bEnc = v.encodeSequence(b)
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEnc, bEnc, backtrace=True)
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        return alignment
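A usage sketch with hypothetical phone lists (the alignment imports from the other examples are assumed); timit is a list of phone lists and utterance is a flat phone list. getAlignment returns None when no backtraced alignment is produced.

timit = [['sh', 'iy'], ['hh', 'ae', 'd']]
utterance = ['sh', 'iy', 'hh', 'eh', 'd']
alignment = getAlignment(timit, utterance)
if alignment is not None:
    print(alignment)
    print('Percent identity:', alignment.percentIdentity())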
def match_word_sorted(code1, code2):
    """return the max scored alignment between the two input codes"""
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    set1 = set(list1)
    set2 = set(list2)
    common_words = set1 & set2
    try:
        common_words.remove("")
    except KeyError:
        pass

    words_to_index = {}
    for word in common_words:
        in1 = list1.index(word)
        in2 = list2.index(word)
        words_to_index[word] = (in1, in2)
    sorted1 = list(OrderedDict(sorted(words_to_index.items(),
                                      key=lambda t: t[1][0])).keys())
    sorted2 = list(OrderedDict(sorted(words_to_index.items(),
                                      key=lambda t: t[1][1])).keys())

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)
    max_score = 0
    for i, encoded in enumerate(encoders):
        alignment = v.decodeSequenceAlignment(encoded)
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
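A usage sketch; MATCH_SCORE, MISMATCH_SCORE, and GAP_SCORE are module-level constants in the original code, so the values below are assumptions, and OrderedDict plus the alignment imports from the other examples are required.

from collections import OrderedDict

MATCH_SCORE, MISMATCH_SCORE, GAP_SCORE = 2, -1, -2  # assumed values
print(match_word_sorted('int main ( ) { return 0 ; }',
                        'int main ( void ) { return 0 ; }'))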
Example #16
def text_to_text_alignment_and_score(text_ref, text_pred):
    """
    Find a word to word alignment between two texts, considering the first is 
    the reference and the second the predicted
    :param text_ref: text reference
    :param text_pred: predicted text
    :return: 
    """

    text_ref = text_ref.lower()
    text_pred = text_pred.lower()
    iterable = [".", ","]
    # remove junk characters ('.' and ',') from the reference text
    translation_map = str.maketrans(to_translation_map(iterable))
    text_ref = text_ref.translate(translation_map)

    # Create sequences to be aligned.
    a = Sequence(text_ref.split())
    b = Sequence(text_pred.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    a_enc = v.encodeSequence(a)
    b_enc = v.encodeSequence(b)
    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(1, 0)
    aligner = GlobalSequenceAligner(scoring, 0)
    f, score, encodeds = aligner.align(a_enc,
                                       b_enc,
                                       text_ref.split(),
                                       text_pred.split(),
                                       backtrace=True)

    # get the first alignment if exists:
    #print(encodeds[0])
    print(encodeds)

    if len(encodeds[0]) > 0:
        alignment = v.decodeSequenceAlignment(encodeds[0])
        print(alignment)
        ##fix first and last missing words of asr text
        list_asr = []
        list_pred = []
        for word in text_pred.split():
            if word != alignment.second.elements[0]:
                list_asr.append(word)
                list_pred.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        list_asr = []
        list_pred = []
        for word in reversed(text_pred.split()):
            if word != alignment.second.elements[-1]:
                list_asr = [word] + list_asr
                list_pred.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        #fix first and last missing words of reference text
        list_asr = []
        list_pred = []
        for word in text_ref.split():
            if word != alignment.first.elements[0]:
                list_pred.append(word)
                list_asr.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        list_asr = []
        list_pred = []
        for word in reversed(text_ref.split()):
            if word != alignment.first.elements[-1]:
                list_pred = [word] + list_pred
                list_asr.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        #print(alignment.second.elements)
        #print(alignment.first.elements)
        print(alignment)
        rec = alignment.score * 100 / len(text_ref.split())
        pre = alignment.score * 100 / len(text_pred.split())
    else:
        alignment = []
        rec, pre = 0, 0

    return alignment, rec, pre
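A hedged usage sketch; to_translation_map and the five-value return from aligner.align come from a modified copy of the library in the original project, so this only illustrates the intended call shape with hypothetical inputs.

alignment, recall, precision = text_to_text_alignment_and_score(
    'the quick brown fox', 'the quick fox')
print(recall, precision)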
    def test_utterance_transcriptions(self):
        print('Checking utterance transcriptions...')

        split_directory = self.corpus.split_directory()
        model_directory = self.trainer.align_directory
        with mp.Pool(processes=self.corpus.num_jobs) as pool:
            jobs = [(self, x)
                    for x in range(self.corpus.num_jobs)]
            results = [pool.apply_async(compile_utterance_train_graphs_func, args=i) for i in jobs]
            output = [p.get() for p in results]
            print('Utterance FSTs compiled!')
            print('Decoding utterances (this will take some time)...')
            results = [pool.apply_async(test_utterances_func, args=i) for i in jobs]
            output = [p.get() for p in results]
            print('Finished decoding utterances!')

        word_mapping = self.dictionary.reversed_word_mapping
        v = Vocabulary()
        errors = {}

        for job in range(self.corpus.num_jobs):
            text_path = os.path.join(split_directory, 'text.{}'.format(job))
            texts = load_scp(text_path)
            aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job)))
            with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf:
                for utt, line in sorted(aligned_int.items()):
                    text = []
                    for t in line:
                        text.append(word_mapping[int(t)])
                    outf.write('{} {}\n'.format(utt, ' '.join(text)))
                    ref_text = texts[utt]
                    if len(text) < len(ref_text) - 7:
                        insertions = [x for x in text if x not in ref_text]
                        deletions = [x for x in ref_text if x not in text]
                    else:
                        aligned_seq = Sequence(text)
                        ref_seq = Sequence(ref_text)

                        alignedEncoded = v.encodeSequence(aligned_seq)
                        refEncoded = v.encodeSequence(ref_seq)
                        scoring = SimpleScoring(2, -1)
                        a = GlobalSequenceAligner(scoring, -2)
                        score, encodeds = a.align(refEncoded, alignedEncoded, backtrace=True)
                        insertions = []
                        deletions = []
                        for encoded in encodeds:
                            alignment = v.decodeSequenceAlignment(encoded)
                            for i, f in enumerate(alignment.first):
                                s = alignment.second[i]
                                if f == '-':
                                    insertions.append(s)
                                if s == '-':
                                    deletions.append(f)
                    if insertions or deletions:
                        errors[utt] = (insertions, deletions, ref_text, text)
        if not errors:
            message = 'There were no utterances with transcription issues.'
        else:
            out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv')
            with open(out_path, 'w') as problemf:
                problemf.write('Utterance,Insertions,Deletions,Reference,Decoded\n')
                for utt, (insertions, deletions, ref_text, text) in sorted(errors.items(),
                                                                           key=lambda x: -1 * (
                                                                                   len(x[1][1]) + len(x[1][2]))):
                    problemf.write('{},{},{},{},{}\n'.format(utt, ', '.join(insertions), ', '.join(deletions),
                                                             ' '.join(ref_text), ' '.join(text)))
            message = 'There were {} of {} utterances with at least one transcription issue. '\
                  'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path)

        print(self.transcription_analysis_template.format(message))
a = Sequence('amazing'.split())
b = Sequence('what a amazing disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)
    print('Percent identity:', alignment.percentIdentity())
    print()


from alignment.sequence import Sequence, GAP_ELEMENT
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, LocalSequenceAligner

large_string = "thelargemanhatanproject is a great project in themanhattincity"
query_string = "manhattan"

# Create sequences to be aligned.
a = Sequence(large_string)
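The snippet above is cut off after building the first sequence; a hedged completion that follows the pattern of the other examples (these are character-level sequences, since plain strings are passed to Sequence):

b = Sequence(query_string)

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using the local aligner.
scoring = SimpleScoring(2, -1)
aligner = LocalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Print the optimal local alignments.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)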
Example #19
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)
    print('Percent identity:', alignment.percentIdentity())
    print()
