Example No. 1
	def align(self, seq1, seq2):
		"""
		Goal: Align seq2 (automatically detected conditions) with seq1 (truth conditions) and return the best alignment
		"""
		print("len(truth_conditions) = {}, len(detected_conditions) = {}".format(len(seq1), len(seq2)))
		from alignment.sequence import Sequence
		from alignment.vocabulary import Vocabulary
		from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

		# Create sequences to be aligned.
		a = Sequence(seq1)
		b = Sequence(seq2)

		# Create a vocabulary and encode the sequences.
		v = Vocabulary()
		aEncoded = v.encodeSequence(a)
		bEncoded = v.encodeSequence(b)

		# Create a scoring and align the sequences using global aligner.
		scoring = SimpleScoring(2, -1)
		aligner = GlobalSequenceAligner(scoring, -2)
		score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

		# Iterate over optimal alignments and print them.
		for encoded in encodeds:
			alignment = v.decodeSequenceAlignment(encoded)
			print(alignment)
			print('Alignment score:', alignment.score)
			print('Percent identity:', alignment.percentIdentity())
			assert(alignment.percentIdentity() >= 97.0)
			first, second = list(alignment.first), list(alignment.second)
			break
		return second
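A minimal usage sketch for the method above. The class name ConditionAligner and the condition lists are invented for illustration; the assertion inside align expects the two sequences to be at least 97% identical.

# Hypothetical usage: ConditionAligner stands in for whatever class owns align(),
# and the condition lists are made up.
truth_conditions = ['dry', 'wet', 'wet', 'icy', 'dry']
detected_conditions = ['dry', 'wet', 'wet', 'icy', 'dry']
aligned_detected = ConditionAligner().align(truth_conditions, detected_conditions)
print(aligned_detected)  # the detected sequence, gap-padded to line up with the truth sequence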
Example No. 2
def align(s1, s2):
    # Create sequences to be aligned.
    a = Sequence(s1.split())
    b = Sequence(s2.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    encoded = encodeds[0]
    alignment = v.decodeSequenceAlignment(encoded)
    correct_words = []
    offset = 0
    for i, (x, y) in enumerate(encoded):
        if x == y:
            correct_words.append(a[i - offset])
        elif x == 0:
            offset += 1

    return correct_words
Example No. 3
def align(sequence1, sequence2):

    # Dashes are escaped because the aligner uses '-' as its gap element
    sequence1 = ['<DASH />' if word == '-' else word for word in sequence1]
    sequence2 = ['<DASH />' if word == '-' else word for word in sequence2]

    # Create sequences to be aligned.
    a = Sequence(sequence1)
    b = Sequence(sequence2)

    #print(22)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    #print(33)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)

    #print(99)

    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    #print(34)

    # Create alignment object and return it
    alignment = v.decodeSequenceAlignment(encodeds[0])
    return alignment
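A brief usage sketch for the function above, with invented token lists; it assumes the usual alignment imports (Sequence, Vocabulary, SimpleScoring, GlobalSequenceAligner) are in scope, as in the other examples.

# Made-up inputs containing a literal '-' to show why the escaping above is needed.
ref_tokens = 'three - two - one'.split()
hyp_tokens = 'three two one'.split()
alignment = align(ref_tokens, hyp_tokens)
print(alignment)  # gaps introduced by the aligner show as '-', real dashes as '<DASH />'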
Example No. 4
def align(s1, s2):
    # Create sequences to be aligned.
    a = Sequence(s1.split())
    b = Sequence(s2.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    encoded = encodeds[0]
    alignment = v.decodeSequenceAlignment(encoded)
    correct_words = []
    offset = 0
    for i, (x, y) in enumerate(encoded):
        if x == y:
            correct_words.append(a[i - offset])
        elif x == 0:
            offset += 1

    return correct_words
Example No. 5
def align(trace1, trace2):

    # Create sequences to be aligned.
    a = Sequence(trace1)
    b = Sequence(trace2)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # Iterate over optimal alignments and print them.
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        align = str(alignment)

    # Convert the aligned sequences back into lists,
    # escaping multi-character elements

    seq_size = len(align)
    half_size = seq_size // 2  # integer division keeps the slice bounds as ints

    # First Half
    s1 = align_to_list(align, 0, half_size, 4)

    # Second Half
    s2 = align_to_list(align, half_size, seq_size, 4)

    # return the lists as result
    return s1, s2
Example No. 6
def score_align(x, y):
    a = Sequence(x)
    b = Sequence(y)
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    pI = 0.0
    for e in encodeds:
        alignment = v.decodeSequenceAlignment(e)
        pI = max(pI, alignment.percentIdentity())
    return 1 - pI/100.0
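A quick usage sketch for score_align, with invented word lists (the same alignment imports are assumed). The return value is a distance in [0, 1]: 0 for identical sequences, larger as the best alignment degrades.

d_same = score_align('what a beautiful day'.split(), 'what a beautiful day'.split())
d_diff = score_align('what a beautiful day'.split(), 'what a disappointingly bad day'.split())
print(d_same, d_diff)  # d_same is 0.0; d_diff is larger because of mismatches and gaps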
Example No. 7
def align_pair(
    first: NamedSequence,
    second: NamedSequence,
    vocabulary: Vocabulary,
) -> AlignmentResult:
    scoring = EblScoring(vocabulary)
    aligner = GlobalSequenceAligner(scoring, True)
    score, alignments = aligner.align(first.sequence, second.sequence, backtrace=True)
    return AlignmentResult(
        score,
        first,
        second,
        [vocabulary.decodeSequenceAlignment(encoded) for encoded in alignments],
    )
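EblScoring above is project-specific and not shown here. As a rough illustration of how a custom scoring can be plugged into GlobalSequenceAligner, the sketch below subclasses the library's Scoring base class, assuming (as SimpleScoring suggests) that a scoring object is called with the two encoded elements to compare. The class name, the weights, and the "important elements" idea are invented.

from alignment.sequencealigner import Scoring


class WeightedMatchScoring(Scoring):  # hypothetical helper, not part of the library
    """Reward matches on a chosen subset of elements more than ordinary matches."""

    def __init__(self, importantElements, matchScore=2, importantScore=4, mismatchScore=-1):
        self.importantElements = set(importantElements)
        self.matchScore = matchScore
        self.importantScore = importantScore
        self.mismatchScore = mismatchScore

    def __call__(self, firstElement, secondElement):
        if firstElement == secondElement:
            if firstElement in self.importantElements:
                return self.importantScore
            return self.matchScore
        return self.mismatchScore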
Example No. 8
def score_align(x, y):
    a = Sequence(x)
    b = Sequence(y)
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    pI = 0.0
    for e in encodeds:
        alignment = v.decodeSequenceAlignment(e)
        pI = max(pI, alignment.percentIdentity())
    return 1 - pI / 100.0
Example No. 9
def getAlignment(timit, utterance):
    tim = list()
    for li in timit:
        for ph in li:
            tim.append(ph)
    a = Sequence(tim)
    b = Sequence(utterance)
    v = Vocabulary()
    aEnc = v.encodeSequence(a)
    bEnc = v.encodeSequence(b)
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEnc, bEnc, backtrace=True)
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        return alignment
Example No. 10
def match_word_sorted(code1, code2):
    """return the max scored alignment between the two input codes"""
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    set1 = set(list1)
    set2 = set(list2)
    common_words = set1 | set2
    common_words.discard("")  # drop the empty token, if any

    words1 = []
    words2 = []
    for word in common_words:
        words1 += index_word_pairs(word, list1)
        words2 += index_word_pairs(word, list2)
    sorted1 = sorted(words1, key=lambda t: t[1])
    sorted2 = sorted(words2, key=lambda t: t[1])

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)
    max_score = 0
    for i, encoded in enumerate(encoders):
        alignment = v.decodeSequenceAlignment(encoded)
        #print alignment
        #print 'Alignment score:', alignment.score
        #print 'Percent identity:', alignment.percentIdentity()
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
Example No. 11
def recommendation(name, movies):
    """Find the top five nearest matches in a list of movie names

    Args:
        name: a string of key words separated by whitespace
        movies: a list of movie names to choose from

    Returns:
        A list of movie names
    """
    # Create sequences to be aligned.
    key = Sequence(name.split())
    dic = [Sequence(movie.split()) for movie in movies]
    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    keyEncoded = v.encodeSequence(key)
    dicEncoded = [v.encodeSequence(movie) for movie in dic]
    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(1, 0)
    aligner = GlobalSequenceAligner(scoring, -2)
    score = [aligner.align(keyEncoded, dEncoded, backtrace=False) for dEncoded in dicEncoded]
    # Keep the five best-scoring movies.
    topFive = sorted(range(len(score)), key=lambda i: score[i], reverse=True)[:5]
    return [movies[i] for i in topFive]
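A short usage sketch for recommendation, with invented titles; the alignment imports used throughout these examples are assumed to be in scope.

movies = ['the lord of the rings', 'the king of comedy', 'king kong', 'the lion king']
print(recommendation('the king', movies))  # best keyword matches first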
Example No. 12
def match_word_sorted(code1, code2):
    """return the max scored alignment between the two input codes"""
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    set1 = set(list1)
    set2 = set(list2)
    common_words = set1 & set2
    common_words.discard("")  # drop the empty token, if any

    words_to_index = {}
    for word in common_words:
        in1 = list1.index(word)
        in2 = list2.index(word)
        words_to_index[word] = (in1, in2)
    sorted1 = list(OrderedDict(sorted(words_to_index.items(),
                                      key=lambda t: t[1][0])))
    sorted2 = list(OrderedDict(sorted(words_to_index.items(),
                                      key=lambda t: t[1][1])))

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)
    max_score = 0
    for i, encoded in enumerate(encoders):
        alignment = v.decodeSequenceAlignment(encoded)
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
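A usage sketch for match_word_sorted. MATCH_SCORE, MISMATCH_SCORE, and GAP_SCORE are module-level constants in the original project, so plausible values are assumed here, along with the OrderedDict import and the usual alignment imports.

from collections import OrderedDict

MATCH_SCORE, MISMATCH_SCORE, GAP_SCORE = 2, -1, -2
print(match_word_sorted('def foo ( x ) :', 'def bar ( x ) :'))  # max score over the optimal alignments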
Example No. 13
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())
print('Sequence A:', a)
print('Sequence B:', b)
print()

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
print('Encoded A:', aEncoded)
print('Encoded B:', bEncoded)
print()

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, alignments = aligner.align(aEncoded, bEncoded, backtrace=True)

# Create sequence profiles out of alignments.
profiles = [Profile.fromSequenceAlignment(a) for a in alignments]
for encoded in profiles:
    profile = v.decodeProfile(encoded)
    print(profile)
print()

# Create a soft scoring and align the first profile against sequence A.
scoring = SoftScoring(scoring)
aligner = GlobalProfileAligner(scoring, -2)
score, alignments = aligner.align(profiles[0],
                                  Profile.fromSequence(aEncoded),
                                  backtrace=True)
Example No. 14
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('amazing'.split())
b = Sequence('what a amazing disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)
    print('Percent identity:', alignment.percentIdentity())
    print()


from alignment.sequence import Sequence, GAP_ELEMENT
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, LocalSequenceAligner
Example No. 15
# Tests -----------------------------------------------------------------------

if __name__ == '__main__':
    s1 = Sequence('what a beautiful day'.split())
    s2 = Sequence('what a disappointingly bad day'.split())
    print('s1', s1)
    print('s2', s2)
    print('')

    from alignment.vocabulary import Vocabulary
    v = Vocabulary()
    e1 = v.encodeSequence(s1)
    e2 = v.encodeSequence(s2)
    print('v', v)
    print('e1', e1)
    print('e2', e2)
    print('')

    from alignment.sequencealigner import SimpleScoring
    from alignment.sequencealigner import GlobalSequenceAligner
    s = SimpleScoring(2, -1)
    a = GlobalSequenceAligner(s, -2)
    score, alignments = a.align(e1, e2, backtrace=True)
    for alignment in alignments:
        as1 = v.decodeSequence(alignment.first)
        as2 = v.decodeSequence(alignment.second)
        print(alignment.percentIdentity())
        print(as1)
        print(as2)
        print('')
Example No. 16
    def ScorePhonemes(self, source=[], target=[]):
        """Compare the phonemes of a source and target sentence and determine
        which of the target items were correctly transcribed.

        Returns:
            hits_phonemes (nested list): list of bools giving the accuracy of
            each phoneme in the target list, for each sentence
        Note:
            This scoring method has no word-level awareness: phonemes from correctly
            transcribed words may still end up labeled wrong (e.g. target "with the"
            vs. source "with a").
            Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
        """
        if not source:
            source = self.source_phonemes
        if not target:
            target = self.target_phonemes

        self.source_matched = []
        hits = []
        for x, ttup in enumerate(target):
            tphon, twordnum, tword = zip(*ttup)
            stup = source[x]
            if not stup:
                hitlist = [False] * len(tphon)
                bPhonOut = ['-'] * len(tphon)
            else:
                sphon, swordnum, sword = zip(*stup)
                # Create sequences to be aligned.
                a = Sequence(tphon)
                b = Sequence(sphon)

                # Create a vocabulary and encode the sequences.
                v = Vocabulary()
                aEncoded = v.encodeSequence(a)
                bEncoded = v.encodeSequence(b)

                # Create a scoring and align the sequences using global aligner.
                scoring = SimpleScoring(2, -1)
                aligner = GlobalSequenceAligner(scoring, -2)
                score, encodeds = aligner.align(aEncoded,
                                                bEncoded,
                                                backtrace=True)
                encoded = encodeds[0]

                #Score based only on hits vs misses, insertions are ignored
                notInsert = encoded[:][0] != 0
                nonInsertMatched = encoded[notInsert][:]

                #Find the alignment in the target sequence
                aSeq = nonInsertMatched[:][0]
                bSeq = nonInsertMatched[:][1]

                #Label all items not aligned to the target as false
                hitlist = []
                y = 0
                for y in range(0, len(aEncoded) - len(aSeq) + 1):
                    aChunk = aEncoded[y:y + len(aSeq)]
                    #print aChunk
                    if sum(aChunk - aSeq) == 0:
                        break
                hitlist.extend([False] * (y))
                hitlist.extend(list(aSeq - bSeq == 0))
                hitlist.extend([False] * (len(aEncoded) - y - len(aSeq)))
                #Export the target aligned phonemes of the source sequence
                bPhons = np.zeros(len(aEncoded), int)
                bPhons[y:y + len(bSeq)] = bSeq
                bPhonOut = np.array(v.elements())[bPhons].tolist()
            hits.append(hitlist)
            self.source_matched.append(bPhonOut)
            self.hits_phonemes = hits
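The hit test in the middle of this method relies on element-wise NumPy comparisons between encoded target and source chunks; a tiny illustration with made-up encoded values:

import numpy as np

# Columns where the encoded target (aSeq) and source (bSeq) agree count as hits.
aSeq = np.array([3, 4, 5, 6])
bSeq = np.array([3, 9, 5, 6])
print(list(aSeq - bSeq == 0))                         # [True, False, True, True]
print(sum(np.array([3, 4]) - np.array([3, 4])) == 0)  # True: the sum-of-differences test used for the chunk search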
Example No. 17
    def ScoreWords(self):
        """Aligns the words of the source sentence to match the target sentence
        to determine hit vs missed words
    
        Returns:
           hits (nested list): The target [0] and source [1] sentences in a nested list 
    
        Note:
        Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
        """
        target = self.target
        source = self.source
        self.source_matchWords = []
        hits = []
        wscore = np.empty(0)
        for tnum, tsent in enumerate(target):
            ssent = source[tnum]
            # Create sequences to be aligned.
            a = Sequence(tsent.split())
            b = Sequence(ssent.split())

            # Create a vocabulary and encode the sequences.
            v = Vocabulary()
            aEncoded = v.encodeSequence(a)
            bEncoded = v.encodeSequence(b)

            # Create a scoring and align the sequences using global aligner.
            scoring = SimpleScoring(5, -1)
            aligner = GlobalSequenceAligner(scoring, -1)
            score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
            encoded = encodeds[0]

            #Score based only on hits vs misses, insertions are ignored
            notInsert = encoded[:][0] != 0
            nonInsertMatched = encoded[notInsert][:]

            #Find the alignment in the target sequence
            aSeq = nonInsertMatched[:][0]
            bSeq = nonInsertMatched[:][1]

            #Label all items not aligned to the target as false
            hitlist = []
            x = 0
            for x in range(0, len(aEncoded) - len(aSeq) + 1):
                aChunk = aEncoded[x:x + len(aSeq)]
                #print aChunk
                if sum(aChunk - aSeq) == 0:
                    break
            hitlist.extend([False] * (x))
            hitlist.extend(list(aSeq - bSeq == 0))
            hitlist.extend([False] * (len(aEncoded) - x - len(aSeq)))
            #Export the target aligned words of the source sequence
            bWords = np.zeros(len(aEncoded), int)
            bWords[x:x + len(bSeq)] = bSeq
            bWordOut = np.array(v.elements())[bWords].tolist()
            hits.append(hitlist)
            iwscore = sum(hitlist) * 100 / float(len(hitlist))
            wscore = np.hstack([wscore, iwscore])
            print(bWordOut)
            self.source_matchWords.append(bWordOut)
            self.hits = hits
            self.wscore = wscore
Example No. 18
sequence = []
for i in spilt_pos:
    ss = sequence_family[i + 1][0]
    for ii in range(i + 2, i + 9):
        ss = ss + sequence_family[ii][0]
    sequence.append(ss)

#%%
v = Vocabulary()
sequence_encoded = []
for i in range(len(sequence)):
    sequence_encoded.append(
        v.encodeSequence(Sequence(split_sequence(sequence[i]))))

scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)

Matrix = np.zeros(9 * 9).reshape(9, 9)
for i in range(len(sequence_encoded)):
    for j in range(i + 1, len(sequence_encoded)):
        score, encodeds = aligner.align(sequence_encoded[i],
                                        sequence_encoded[j],
                                        backtrace=True)
        for encoded in encodeds:
            alignment = v.decodeSequenceAlignment(encoded)
            score = np.floor((100 - alignment.percentIdentity()) *
                             len(np.array(alignment)) / 100)
            print(i, j, score)
        Matrix[i, j] = score

Matrix = Matrix.T
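A worked check of the distance formula above, with illustrative numbers: it converts percent identity into an approximate count of non-matching alignment positions.

import numpy as np

percent_identity = 75.0   # illustrative
alignment_length = 8      # illustrative number of alignment positions
print(np.floor((100 - percent_identity) * alignment_length / 100))  # 2.0 -> two non-matching positions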
Example No. 19
def text_to_text_alignment_and_score(text_ref, text_pred):
    """
    Find a word-to-word alignment between two texts, where the first is
    the reference and the second the prediction.
    :param text_ref: reference text
    :param text_pred: predicted text
    :return: the alignment, the recall, and the precision
    """

    text_ref = text_ref.lower()
    text_pred = text_pred.lower()
    iterable = [".", ","]
    # strip junk characters (periods and commas) from the reference text
    translation_map = str.maketrans(to_translation_map(iterable))
    text_ref = text_ref.translate(translation_map)

    # Create sequences to be aligned.
    a = Sequence(text_ref.split())
    b = Sequence(text_pred.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    a_enc = v.encodeSequence(a)
    b_enc = v.encodeSequence(b)
    # Create a scoring and align the sequences using global aligner.
    # Note: the call below passes the raw token lists as extra arguments and
    # unpacks three return values, so it relies on a modified aligner rather
    # than the stock GlobalSequenceAligner.align() used in the other examples.
    scoring = SimpleScoring(1, 0)
    aligner = GlobalSequenceAligner(scoring, 0)
    f, score, encodeds = aligner.align(a_enc,
                                       b_enc,
                                       text_ref.split(),
                                       text_pred.split(),
                                       backtrace=True)

    # get the first alignment if exists:
    #print(encodeds[0])
    print(encodeds)

    if len(encodeds[0]) > 0:
        alignment = v.decodeSequenceAlignment(encodeds[0])
        print(alignment)
        ##fix first and last missing words of asr text
        list_asr = []
        list_pred = []
        for word in text_pred.split():
            if word != alignment.second.elements[0]:
                list_asr.append(word)
                list_pred.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        list_asr = []
        list_pred = []
        for word in reversed(text_pred.split()):
            if word != alignment.second.elements[-1]:
                list_asr = [word] + list_asr
                list_pred.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        #fix first and last missing words of reference text
        list_asr = []
        list_pred = []
        for word in text_ref.split():
            if word != alignment.first.elements[0]:
                list_pred.append(word)
                list_asr.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        list_asr = []
        list_pred = []
        for word in reversed(text_ref.split()):
            if word != alignment.first.elements[-1]:
                list_pred = [word] + list_pred
                list_asr.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        #print(alignment.second.elements)
        #print(alignment.first.elements)
        print(alignment)
        rec = alignment.score * 100 / len(text_ref.split())
        pre = alignment.score * 100 / len(text_pred.split())
    else:
        alignment = []
        rec, pre = 0, 0

    return alignment, rec, pre
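With SimpleScoring(1, 0) and a gap score of 0, alignment.score is simply the number of matched words, so the recall and precision computed at the end of the function reduce to plain ratios. An illustrative calculation with invented counts:

matches, ref_len, pred_len = 7, 10, 8   # invented counts
rec = matches * 100 / ref_len           # 70.0  (percent of reference words matched)
pre = matches * 100 / pred_len          # 87.5  (percent of predicted words matched)
print(rec, pre)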
Example No. 20
    def test_utterance_transcriptions(self):
        print('Checking utterance transcriptions...')

        split_directory = self.corpus.split_directory()
        model_directory = self.trainer.align_directory
        with mp.Pool(processes=self.corpus.num_jobs) as pool:
            jobs = [(self, x)
                    for x in range(self.corpus.num_jobs)]
            results = [pool.apply_async(compile_utterance_train_graphs_func, args=i) for i in jobs]
            output = [p.get() for p in results]
            print('Utterance FSTs compiled!')
            print('Decoding utterances (this will take some time)...')
            results = [pool.apply_async(test_utterances_func, args=i) for i in jobs]
            output = [p.get() for p in results]
            print('Finished decoding utterances!')

        word_mapping = self.dictionary.reversed_word_mapping
        v = Vocabulary()
        errors = {}

        for job in range(self.corpus.num_jobs):
            text_path = os.path.join(split_directory, 'text.{}'.format(job))
            texts = load_scp(text_path)
            aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job)))
            with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf:
                for utt, line in sorted(aligned_int.items()):
                    text = []
                    for t in line:
                        text.append(word_mapping[int(t)])
                    outf.write('{} {}\n'.format(utt, ' '.join(text)))
                    ref_text = texts[utt]
                    if len(text) < len(ref_text) - 7:
                        insertions = [x for x in text if x not in ref_text]
                        deletions = [x for x in ref_text if x not in text]
                    else:
                        aligned_seq = Sequence(text)
                        ref_seq = Sequence(ref_text)

                        alignedEncoded = v.encodeSequence(aligned_seq)
                        refEncoded = v.encodeSequence(ref_seq)
                        scoring = SimpleScoring(2, -1)
                        a = GlobalSequenceAligner(scoring, -2)
                        score, encodeds = a.align(refEncoded, alignedEncoded, backtrace=True)
                        insertions = []
                        deletions = []
                        for encoded in encodeds:
                            alignment = v.decodeSequenceAlignment(encoded)
                            for i, f in enumerate(alignment.first):
                                s = alignment.second[i]
                                if f == '-':
                                    insertions.append(s)
                                if s == '-':
                                    deletions.append(f)
                    if insertions or deletions:
                        errors[utt] = (insertions, deletions, ref_text, text)
        if not errors:
            message = 'There were no utterances with transcription issues.'
        else:
            out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv')
            with open(out_path, 'w') as problemf:
                problemf.write('Utterance,Insertions,Deletions,Reference,Decoded\n')
                for utt, (insertions, deletions, ref_text, text) in sorted(errors.items(),
                                                                           key=lambda x: -1 * (
                                                                                   len(x[1][1]) + len(x[1][2]))):
                    problemf.write('{},{},{},{},{}\n'.format(utt, ', '.join(insertions), ', '.join(deletions),
                                                             ' '.join(ref_text), ' '.join(text)))
            message = 'There were {} of {} utterances with at least one transcription issue. '\
                  'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path)

        print(self.transcription_analysis_template.format(message))
Example No. 21
def score(aEncoded, bEncoded, v):
  scoring = SimpleScoring(1, -3)
  aligner = GlobalSequenceAligner(scoring, 0)
  #aligner = LocalSequenceAligner(scoring, -1)
  score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
  return score, encodeds
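A usage sketch for the helper above; the sentences are made up and the standard Sequence/Vocabulary imports are assumed. Note that the v argument is accepted but not used inside score().

v = Vocabulary()
e1 = v.encodeSequence(Sequence('what a beautiful day'.split()))
e2 = v.encodeSequence(Sequence('what a bad day'.split()))
best_score, encodeds = score(e1, e2, v)
print(best_score, len(encodeds))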
Example No. 22
    def test_utterance_transcriptions(self):
        print('Checking utterance transcriptions...')

        split_directory = self.corpus.split_directory()
        model_directory = self.trainer.align_directory
        with mp.Pool(processes=self.corpus.num_jobs) as pool:
            jobs = [(self, x)
                    for x in range(self.corpus.num_jobs)]
            results = [pool.apply_async(compile_utterance_train_graphs_func, args=i) for i in jobs]
            output = [p.get() for p in results]
            print('Utterance FSTs compiled!')
            print('Decoding utterances (this will take some time)...')
            results = [pool.apply_async(test_utterances_func, args=i) for i in jobs]
            output = [p.get() for p in results]
            print('Finished decoding utterances!')

        word_mapping = self.dictionary.reversed_word_mapping
        v = Vocabulary()
        errors = {}

        for job in range(self.corpus.num_jobs):
            text_path = os.path.join(split_directory, 'text.{}'.format(job))
            texts = load_scp(text_path)
            aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job)))
            with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf:
                for utt, line in sorted(aligned_int.items()):
                    text = []
                    for t in line:
                        text.append(word_mapping[int(t)])
                    outf.write('{} {}\n'.format(utt, ' '.join(text)))
                    ref_text = texts[utt]
                    if len(text) < len(ref_text) - 7:
                        insertions = [x for x in text if x not in ref_text]
                        deletions = [x for x in ref_text if x not in text]
                    else:
                        aligned_seq = Sequence(text)
                        ref_seq = Sequence(ref_text)

                        alignedEncoded = v.encodeSequence(aligned_seq)
                        refEncoded = v.encodeSequence(ref_seq)
                        scoring = SimpleScoring(2, -1)
                        a = GlobalSequenceAligner(scoring, -2)
                        score, encodeds = a.align(refEncoded, alignedEncoded, backtrace=True)
                        insertions = []
                        deletions = []
                        for encoded in encodeds:
                            alignment = v.decodeSequenceAlignment(encoded)
                            for i, f in enumerate(alignment.first):
                                s = alignment.second[i]
                                if f == '-':
                                    insertions.append(s)
                                if s == '-':
                                    deletions.append(f)
                    if insertions or deletions:
                        errors[utt] = (insertions, deletions, ref_text, text)
        if not errors:
            message = 'There were no utterances with transcription issues.'
        else:
            out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv')
            with open(out_path, 'w') as problemf:
                problemf.write('Utterance,Insertions,Deletions,Reference,Decoded\n')
                for utt, (insertions, deletions, ref_text, text) in sorted(errors.items(),
                                                                           key=lambda x: -1 * (
                                                                                   len(x[1][1]) + len(x[1][2]))):
                    problemf.write('{},{},{},{},{}\n'.format(utt, ', '.join(insertions), ', '.join(deletions),
                                                             ' '.join(ref_text), ' '.join(text)))
            message = 'There were {} of {} utterances with at least one transcription issue. '\
                  'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path)

        print(self.transcription_analysis_template.format(message))
Example No. 23
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())
print('Sequence A:', a)
print('Sequence B:', b)
print()

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
print('Encoded A:', aEncoded)
print('Encoded B:', bEncoded)
print()

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, alignments = aligner.align(aEncoded, bEncoded, backtrace=True)

# Create sequence profiles out of alignments.
profiles = [Profile.fromSequenceAlignment(a) for a in alignments]
for encoded in profiles:
    profile = v.decodeProfile(encoded)
    print(profile)
print()

# Create a soft scoring and align the first profile against sequence A.
scoring = SoftScoring(scoring)
aligner = GlobalProfileAligner(scoring, -2)
score, alignments = aligner.align(profiles[0], Profile.fromSequence(aEncoded),
                                  backtrace=True)
for encoded in alignments:
Example No. 24
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)
    print('Percent identity:', alignment.percentIdentity())
    print()

Example No. 25
# Tests -----------------------------------------------------------------------

if __name__ == '__main__':
    s1 = Sequence('what a beautiful day'.split())
    s2 = Sequence('what a disappointingly bad day'.split())
    print('s1', s1)
    print('s2', s2)
    print('')

    from alignment.vocabulary import Vocabulary
    v = Vocabulary()
    e1 = v.encodeSequence(s1)
    e2 = v.encodeSequence(s2)
    print('v', v)
    print('e1', e1)
    print('e2', e2)
    print('')

    from alignment.sequencealigner import SimpleScoring
    from alignment.sequencealigner import GlobalSequenceAligner
    s = SimpleScoring(2, -1)
    a = GlobalSequenceAligner(s, -2)
    score, alignments = a.align(e1, e2, backtrace=True)
    for alignment in alignments:
        as1 = v.decodeSequence(alignment.first)
        as2 = v.decodeSequence(alignment.second)
        print(alignment.percentIdentity())
        print(as1)
        print(as2)
        print('')
Example No. 26
        return self.dist_mat[first, second]


from alignment.sequencealigner import GlobalSequenceAligner

choice_inds = np.random.choice(len(seqs), int(1e3), replace=False)
new_seqs = []
for i, s in enumerate(seqs):
    if i in choice_inds:
        new_seqs.append(s)

seqs = new_seqs

nw_scores = np.zeros((len(seqs), len(seqs)))

aligner = GlobalSequenceAligner(DistScoring(pdist), 1000 - med * 1000)
for i in tqdm(range(len(seqs))):
    for j in range(i, len(seqs)):
        score, encodeds = aligner.align(seqs[i], seqs[j], backtrace=True)
        s = score / (1000 * max(len(seqs[i]), len(seqs[j])))
        nw_scores[i, j] = s

# %% [markdown]
# ##
from graspy.utils import symmetrize

sns.heatmap(nw_scores)
nw_scores = symmetrize(nw_scores, "triu")
nw_dists = 1 - nw_scores
# %% [markdown]
# ##
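A quick check of the normalization used above, with invented numbers: dividing the raw score by 1000 times the longer sequence length puts pairs of different lengths on a comparable scale, and nw_dists = 1 - nw_scores then turns that similarity into a distance.

raw_score, len_i, len_j = 9000, 12, 15   # invented values
normalised = raw_score / (1000 * max(len_i, len_j))
print(normalised)       # 0.6
print(1 - normalised)   # 0.4 -> the corresponding entry of nw_dists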