def align(trace1, trace2):

    # Create sequences to be aligned.
    a = Sequence(trace1)
    b = Sequence(trace2)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # Decode the optimal alignments; keep the last one as a string.
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        align_str = str(alignment)

    # Convert the aligned sequence string into lists,
    # escaping multiple characters.

    seq_size = len(align_str)
    half_size = seq_size // 2  # integer division, used as a slice index

    # First half
    s1 = align_to_list(align_str, 0, half_size, 4)

    # Second half
    s2 = align_to_list(align_str, half_size, seq_size, 4)

    # return the lists as result
    return s1, s2
Example #2
def align(s1, s2):
    # Create sequences to be aligned.
    a = Sequence(s1.split())
    b = Sequence(s2.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    encoded = encodeds[0]
    alignment = v.decodeSequenceAlignment(encoded)
    correct_words = []
    offset = 0
    for i, (x, y) in enumerate(encoded):
        if x == y:
            correct_words.append(a[i - offset])
        elif x == 0:
            offset += 1

    return correct_words
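
# A hypothetical usage sketch (not part of the original source), assuming the
# alignment-package imports used throughout these examples: align() above
# returns the reference words that line up exactly with the hypothesis.
ref = "the quick brown fox"
hyp = "the quack brown fox"
print(align(ref, hyp))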
Example #3
def align(seq1, seq2):
    s1 = Sequence(seq1)
    s2 = Sequence(seq2)
    v = Vocabulary()
    s1Encoded = v.encodeSequence(s1)
    s2Encoded = v.encodeSequence(s2)
    return s1Encoded, s2Encoded, v
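
# Hypothetical usage sketch (not part of the original source): the encoded
# sequences returned by align() can be fed to the package's global aligner
# and decoded back through the shared vocabulary.
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

s1Encoded, s2Encoded, v = align('what a beautiful day'.split(),
                                'what a bad day'.split())
aligner = GlobalSequenceAligner(SimpleScoring(2, -1), -2)
score, encodeds = aligner.align(s1Encoded, s2Encoded, backtrace=True)
print(v.decodeSequenceAlignment(encodeds[0]))
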
    def get_labels(self):
        """label each slot in the sausage (O=correct X=incorrect)"""
        if self.correct():
            # everything is correct
            return ['O'] * self.num_slots()

        # align the ref and the best hyp
        a = Sequence(self.ref())
        b = Sequence(self.best_hyp())
        v = Vocabulary()
        aEncoded = v.encodeSequence(a)
        bEncoded = v.encodeSequence(b)
        scoring = SimpleScoring(2, -1)
        aligner = StrictGlobalSequenceAligner(scoring, -2)
        score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
        alignment = v.decodeSequenceAlignment(encodeds[0])

        # get labels according to alignment
        labels = []
        for a, b in zip(alignment.first, alignment.second):
            if a == b or (a == '-' and b == '*DELETE*'):
                labels.append('O')
            else:
                labels.append('X')
        return labels
def seqToAlign(a, b, matchScore=3, mismatchScore=-1, gapScore=-2):
    '''
    args:
        a: list of words
        b: list of words
        matchScore: num
        mismatchScore: num
        gapScore: num
    Returns:
        list of (score, first, second) tuples for the top alignments
    Description:
        helper function for finding alignments given a list of words
    '''
    # Unwrap the outer lists and create sequences to be aligned.
    a = a[0]
    b = b[0]
    seq1 = Sequence(a)
    seq2 = Sequence(b)
    v = Vocabulary()
    aEncoded = v.encodeSequence(seq1)
    bEncoded = v.encodeSequence(seq2)

    # Create a scoring and align the sequences using local aligner.
    scoring = SimpleScoring(matchScore, mismatchScore)
    aligner = LocalSequenceAligner(scoring, gapScore)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    alignments = [v.decodeSequenceAlignment(encoded) for encoded in encodeds]

    return [(a.score, list(a.first), list(a.second)) for a in alignments]
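
# Hypothetical usage sketch (not part of the original source). Note that
# seqToAlign() unpacks its arguments with a[0] and b[0], so each argument is
# wrapped in an outer list here.
results = seqToAlign([['the', 'cat', 'sat', 'down']],
                     [['the', 'dog', 'sat', 'down']])
for score, first, second in results:
    print(score, first, second)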
Example #6
def align(sequence1, sequence2):

    # Dashes are escaped because the aligner uses the dash as a gap element.
    sequence1 = ['<DASH />' if word == '-' else word for word in sequence1]
    sequence2 = ['<DASH />' if word == '-' else word for word in sequence2]

    # Create sequences to be aligned.
    a = Sequence(sequence1)
    b = Sequence(sequence2)

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # Create alignment object and return it
    alignment = v.decodeSequenceAlignment(encodeds[0])
    return alignment
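
# Hypothetical usage sketch (not part of the original source): hyphens in the
# input are escaped as '<DASH />' so they are not confused with the aligner's
# gap symbol.
alignment = align('a - b c'.split(), 'a b c'.split())
print(alignment)
print('Percent identity:', alignment.percentIdentity())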
def score_align(x, y):
    a = Sequence(x)
    b = Sequence(y)
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)
    scoring = SimpleScoring(2, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
    pI = 0.0
    for e in encodeds:
        alignment = v.decodeSequenceAlignment(e)
        pI = max(pI, alignment.percentIdentity())
    return 1 - pI / 100.0
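
# Hypothetical usage sketch (not part of the original source): score_align()
# converts the best percent identity into a distance in [0, 1], so identical
# sequences score 0.0 and fully dissimilar ones approach 1.0.
print(score_align('a b c'.split(), 'a b c'.split()))
print(score_align('a b c'.split(), 'x y z'.split()))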
Example #8
def align_sequences(seq_a, seq_b):
    # Must escape '-' because alignment library uses it as a gap
    # marker.
    escaped_seq_a = ['\\-' if x == '-' else x for x in seq_a]
    escaped_seq_b = ['\\-' if x == '-' else x for x in seq_b]

    v = Vocabulary()
    encoded_a = v.encodeSequence(Sequence(escaped_seq_a))
    encoded_b = v.encodeSequence(Sequence(escaped_seq_b))

    scoring = SimpleScoring(matchScore=3, mismatchScore=-1)
    aligner = StrictGlobalSequenceAligner(scoring, gapScore=-2)
    _, encodeds = aligner.align(encoded_a, encoded_b, backtrace=True)
    return encodeds[0]
Example #9
def match_word_sorted(code1, code2):
    """return the max scored alignment between the two input codes"""
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    set1 = set(list1)
    set2 = set(list2)
    common_words = set1 | set2
    common_words.discard("")

    words1 = []
    words2 = []
    for word in common_words:
        words1 += index_word_pairs(word, list1)
        words2 += index_word_pairs(word, list2)
    sorted1 = sorted(words1, key=lambda t: t[1])
    sorted2 = sorted(words2, key=lambda t: t[1])

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)
    max_score = 0
    for i, encoded in enumerate(encoders):
        alignment = v.decodeSequenceAlignment(encoded)
        #print alignment
        #print 'Alignment score:', alignment.score
        #print 'Percent identity:', alignment.percentIdentity()
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
def match_word_sorted(code1, code2):
    """return the max scored alignment between the two input codes"""
    list1 = code1.split(" ")
    list2 = code2.split(" ")
    set1 = set(list1)
    set2 = set(list2)
    common_words = set1 & set2
    common_words.discard("")

    words_to_index = {}
    for word in common_words:
        in1 = list1.index(word)
        in2 = list2.index(word)
        words_to_index[word] = (in1, in2)
    sorted1 = list(OrderedDict(sorted(words_to_index.items(),
                                      key=lambda t: t[1][0])).keys())
    sorted2 = list(OrderedDict(sorted(words_to_index.items(),
                                      key=lambda t: t[1][1])).keys())

    a = Sequence(sorted1)
    b = Sequence(sorted2)
    v = Vocabulary()
    a_encoded = v.encodeSequence(a)
    b_encoded = v.encodeSequence(b)
    scoring = SimpleScoring(MATCH_SCORE, MISMATCH_SCORE)
    aligner = GlobalSequenceAligner(scoring, GAP_SCORE)
    score, encoders = aligner.align(a_encoded, b_encoded, backtrace=True)
    max_score = 0
    for i, encoded in enumerate(encoders):
        alignment = v.decodeSequenceAlignment(encoded)
        if alignment.score > max_score:
            max_score = alignment.score
    return max_score
Example #11
    def ScorePhonemes(self, source=None, target=None):
        """Compare the phonemes of a source and target sentence and determine
        which of the target items were correctly transcribed.

        Returns:
            hits_phonemes (nested list): list of bools giving the accuracy of
            each phoneme in the target list for each sentence
        Note:
            This scoring method has no word-accuracy awareness. Phonemes from
            correctly input words may end up labeled as wrong
            (e.g. target: "with the", source: "with a").
            Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
        """
        if not source:
            source = self.source_phonemes
        if not target:
            target = self.target_phonemes

        self.source_matched = []
        hits = []
        for x, ttup in enumerate(target):
            tphon, twordnum, tword = zip(*ttup)
            stup = source[x]
            if not stup:
                hitlist = [False] * len(tphon)
                bPhonOut = ['-'] * len(tphon)
            else:
                sphon, swordnum, sword = zip(*stup)
                # Create sequences to be aligned.
                a = Sequence(tphon)
                b = Sequence(sphon)

                # Create a vocabulary and encode the sequences.
                v = Vocabulary()
                aEncoded = v.encodeSequence(a)
                bEncoded = v.encodeSequence(b)

                # Create a scoring and align the sequences using global aligner.
                scoring = SimpleScoring(2, -1)
                aligner = GlobalSequenceAligner(scoring, -2)
                score, encodeds = aligner.align(aEncoded,
                                                bEncoded,
                                                backtrace=True)
                encoded = encodeds[0]

                #Score based only on hits vs misses, insertions are ignored
                notInsert = encoded[:][0] != 0
                nonInsertMatched = encoded[notInsert][:]

                #Find the alignment in the target sequence
                aSeq = nonInsertMatched[:][0]
                bSeq = nonInsertMatched[:][1]

                #Label all items not aligned to the target as false
                hitlist = []
                y = 0
                for y in range(0, len(aEncoded) - len(aSeq) + 1):
                    aChunk = aEncoded[y:y + len(aSeq)]
                    #print aChunk
                    if sum(aChunk - aSeq) == 0:
                        break
                hitlist.extend([False] * (y))
                hitlist.extend(list(aSeq - bSeq == 0))
                hitlist.extend([False] * (len(aEncoded) - y - len(aSeq)))
                #Export the target aligned phonemes of the source sequence
                bPhons = np.zeros(len(aEncoded), int)
                bPhons[y:y + len(bSeq)] = bSeq
                bPhonOut = np.array(v.elements())[bPhons].tolist()
            hits.append(hitlist)
            self.source_matched.append(bPhonOut)
            self.hits_phonemes = hits
Example #12
    def ScoreWords(self):
        """Aligns the words of the source sentence to match the target sentence
        to determine hit vs missed words
    
        Returns:
           hits (nested list): The target [0] and source [1] sentences in a nested list 
    
        Note:
        Modified from Eser Aygün (https://pypi.python.org/pypi/alignment/1.0.9)
        """
        target = self.target
        source = self.source
        self.source_matchWords = []
        hits = []
        wscore = np.empty(0)
        for tnum, tsent in enumerate(target):
            ssent = source[tnum]
            # Create sequences to be aligned.
            a = Sequence(tsent.split())
            b = Sequence(ssent.split())

            # Create a vocabulary and encode the sequences.
            v = Vocabulary()
            aEncoded = v.encodeSequence(a)
            bEncoded = v.encodeSequence(b)

            # Create a scoring and align the sequences using global aligner.
            scoring = SimpleScoring(5, -1)
            aligner = GlobalSequenceAligner(scoring, -1)
            score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)
            encoded = encodeds[0]

            #Score based only on hits vs misses, insertions are ignored
            notInsert = encoded[:][0] != 0
            nonInsertMatched = encoded[notInsert][:]

            #Find the alignment in the target sequence
            aSeq = nonInsertMatched[:][0]
            bSeq = nonInsertMatched[:][1]

            #Label all items not aligned to the target as false
            hitlist = []
            x = 0
            for x in range(0, len(aEncoded) - len(aSeq) + 1):
                aChunk = aEncoded[x:x + len(aSeq)]
                #print aChunk
                if sum(aChunk - aSeq) == 0:
                    break
            hitlist.extend([False] * (x))
            hitlist.extend(list(aSeq - bSeq == 0))
            hitlist.extend([False] * (len(aEncoded) - x - len(aSeq)))
            #Export the target aligned words of the source sequence
            bWords = np.zeros(len(aEncoded), int)
            bWords[x:x + len(bSeq)] = bSeq
            bWordOut = np.array(v.elements())[bWords].tolist()
            hits.append(hitlist)
            iwscore = sum(hitlist) * 100 / float(len(hitlist))
            wscore = np.hstack([wscore, iwscore])
            print(bWordOut)
            self.source_matchWords.append(bWordOut)
            self.hits = hits
            self.wscore = wscore
Example #13
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from ebl.alignment.domain.sequence import NamedSequence
from ebl.tests.factories.fragment import FragmentFactory

signs = "X X ABZ001\nABZ002\nX X X\n"
sequence = Sequence(["ABZ001", "#", "ABZ002", "#", "#"])


def test_of_signs() -> None:
    vocabulary = Vocabulary()
    name = 1234
    named = NamedSequence.of_signs(name, signs, vocabulary)

    assert named.name == str(name)
    assert named.sequence == vocabulary.encodeSequence(sequence)


def test_of_fragment() -> None:
    vocabulary = Vocabulary()
    fragment = FragmentFactory.build(signs=signs)
    named = NamedSequence.of_fragment(fragment, vocabulary)

    assert named.name == str(fragment.number)
    assert named.sequence == vocabulary.encodeSequence(sequence)
Example #14
name = []
split_pos = []
for i in range(len(sequence_family)):
    if sequence_family[i][0][0] == '[':
        name.append(sequence_family[i][0][1:-1])
        split_pos.append(i)
sequence = []
for i in split_pos:
    ss = sequence_family[i + 1][0]
    for ii in range(i + 2, i + 9):
        ss = ss + sequence_family[ii][0]
    sequence.append(ss)
        
#%%
v = Vocabulary()
sequence_encoded = []
for i in range(len(sequence)):
    sequence_encoded.append(
        v.encodeSequence(Sequence(split_sequence(sequence[i]))))

scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)

Matrix = np.zeros(9 * 9).reshape(9, 9)
for i in range(len(sequence_encoded)):
    for j in range(i + 1, len(sequence_encoded)):
        score, encodeds = aligner.align(sequence_encoded[i],
                                        sequence_encoded[j],
                                        backtrace=True)
        for encoded in encodeds:
            alignment = v.decodeSequenceAlignment(encoded)
            score = np.floor((100 - alignment.percentIdentity()) *
                             len(np.array(alignment)) / 100)
            print(i, '-', j, ':', score)
reto = nw.global_align("CEELECANTH", "PELICAN")
reto2 = nw.global_align("(Westf.), Grevener", "††††††(Westf.), Grevener")
reto3 = nw.global_align("(Westf.), Grevener",
                        "††††††(Westf.), Grevener",
                        gap_open=-5,
                        gap_extend=-2)

#import seqanpy
#print(seqanpy.align_global('ACCGGT', 'CCG'))

from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())

a = Sequence("(Westf.), Grevener".split())
b = Sequence("††††††(Westf.), Grevener".split())
# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)

def make_sequence(string: str) -> Sequence:
    return Sequence(
        collapse_spaces(
            replace_line_breaks(string).replace(UNCLEAR_OR_UNKNOWN_SIGN,
                                                " ")).split(" "))
name = []
split_pos = []
for i in range(len(sequence_family)):
    if sequence_family[i][0][0] == '[':
        name.append(sequence_family[i][0][1:-1])
        split_pos.append(i)
sequence = []
for i in split_pos:
    ss = sequence_family[i + 1][0]
    for ii in range(i + 2, i + 9):
        ss = ss + sequence_family[ii][0]
    sequence.append(ss)

#%%
v = Vocabulary()
sequence_encoded = []
for i in range(len(sequence)):
    sequence_encoded.append(
        v.encodeSequence(Sequence(split_sequence(sequence[i]))))

scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)

Matrix = np.zeros(9 * 9).reshape(9, 9)
for i in range(len(sequence_encoded)):
    for j in range(i + 1, len(sequence_encoded)):
        score, encodeds = aligner.align(sequence_encoded[i],
                                        sequence_encoded[j],
                                        backtrace=True)
        for encoded in encodeds:
            alignment = v.decodeSequenceAlignment(encoded)
            score = np.floor((100 - alignment.percentIdentity()) *
                             len(np.array(alignment)) / 100)
            print(i, j, score)
Example #18
from fastamasta import FastaReader
from sequence import hamming_dist
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

if __name__ == "__main__":
    data = [i for i in FastaReader("data/31.fas")]
    a, b = data[0][1], data[1][1]

    if len(a) > len(b):
        b += '-' * ((len(a) - len(b)) + 1)
    if len(b) > len(a):
        a += '-' * ((len(b) - len(a)) + 1)

    # Create sequences to be aligned.
    a = Sequence([i for i in a])
    b = Sequence([i for i in b])

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    aEncoded = v.encodeSequence(a)
    bEncoded = v.encodeSequence(b)

    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(3, -1)
    aligner = GlobalSequenceAligner(scoring, -2)
    score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

    # Iterate over optimal alignments and print them.
    for encoded in encodeds:
        alignment = v.decodeSequenceAlignment(encoded)
        print(alignment)
    def test_utterance_transcriptions(self):
        print('Checking utterance transcriptions...')

        split_directory = self.corpus.split_directory()
        model_directory = self.trainer.align_directory
        with mp.Pool(processes=self.corpus.num_jobs) as pool:
            jobs = [(self, x)
                    for x in range(self.corpus.num_jobs)]
            results = [pool.apply_async(compile_utterance_train_graphs_func, args=i) for i in jobs]
            output = [p.get() for p in results]
            print('Utterance FSTs compiled!')
            print('Decoding utterances (this will take some time)...')
            results = [pool.apply_async(test_utterances_func, args=i) for i in jobs]
            output = [p.get() for p in results]
            print('Finished decoding utterances!')

        word_mapping = self.dictionary.reversed_word_mapping
        v = Vocabulary()
        errors = {}

        for job in range(self.corpus.num_jobs):
            text_path = os.path.join(split_directory, 'text.{}'.format(job))
            texts = load_scp(text_path)
            aligned_int = load_scp(os.path.join(model_directory, 'aligned.{}.int'.format(job)))
            with open(os.path.join(model_directory, 'aligned.{}'.format(job)), 'w') as outf:
                for utt, line in sorted(aligned_int.items()):
                    text = []
                    for t in line:
                        text.append(word_mapping[int(t)])
                    outf.write('{} {}\n'.format(utt, ' '.join(text)))
                    ref_text = texts[utt]
                    if len(text) < len(ref_text) - 7:
                        insertions = [x for x in text if x not in ref_text]
                        deletions = [x for x in ref_text if x not in text]
                    else:
                        aligned_seq = Sequence(text)
                        ref_seq = Sequence(ref_text)

                        alignedEncoded = v.encodeSequence(aligned_seq)
                        refEncoded = v.encodeSequence(ref_seq)
                        scoring = SimpleScoring(2, -1)
                        a = GlobalSequenceAligner(scoring, -2)
                        score, encodeds = a.align(refEncoded, alignedEncoded, backtrace=True)
                        insertions = []
                        deletions = []
                        for encoded in encodeds:
                            alignment = v.decodeSequenceAlignment(encoded)
                            for i, f in enumerate(alignment.first):
                                s = alignment.second[i]
                                if f == '-':
                                    insertions.append(s)
                                if s == '-':
                                    deletions.append(f)
                    if insertions or deletions:
                        errors[utt] = (insertions, deletions, ref_text, text)
        if not errors:
            message = 'There were no utterances with transcription issues.'
        else:
            out_path = os.path.join(self.corpus.output_directory, 'transcription_problems.csv')
            with open(out_path, 'w') as problemf:
                problemf.write('Utterance,Insertions,Deletions,Reference,Decoded\n')
                for utt, (insertions, deletions, ref_text, text) in sorted(errors.items(),
                                                                           key=lambda x: -1 * (
                                                                                   len(x[1][1]) + len(x[1][2]))):
                    problemf.write('{},{},{},{},{}\n'.format(utt, ', '.join(insertions), ', '.join(deletions),
                                                             ' '.join(ref_text), ' '.join(text)))
            message = 'There were {} of {} utterances with at least one transcription issue. '\
                  'Please see the outputted csv file {}.'.format(len(errors), self.corpus.num_utterances, out_path)

        print(self.transcription_analysis_template.format(message))
########## SIMPLEST #######
import regex
regex.search(r'\b(amazing){e<2}\s', 'is life amazing lie ao a')


from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('amazing'.split())
b = Sequence('what a amazing disappointingly bad day'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)
    print('Percent identity:', alignment.percentIdentity())
    print()
            speakers.update(bnc_cache[bnc_code][0])
        _, recording_data, transcripts = bnc_cache[bnc_code]
        transcript = transcripts[r_code]
        try:
            tg = TextGrid(strict=False)
            tg.read(tg_path)
        except Exception as e:
            print(out_path)
            print(e)
            continue
        word_tier = tg.getFirst('word')
        #print([x.mark for x in word_tier])
        phone_tier = tg.getFirst('phone')
        trans_ind = 0
        prev_oov = False
        a = Sequence([x[0] for x in transcript])
        b = Sequence([x.mark for x in word_tier])

        # Create a vocabulary and encode the sequences.
        v = Vocabulary()
        aEncoded = v.encodeSequence(a)
        bEncoded = v.encodeSequence(b)

        # Create a scoring and align the sequences using global aligner.
        scoring = SimpleScoring(2, -1)
        aligner = GlobalSequenceAligner(scoring, -2)
        score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

        # Iterate over optimal alignments and print them.
        for encoded in encodeds:
            alignment = v.decodeSequenceAlignment(encoded)
Example #22
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner
from alignment.profile import Profile
from alignment.profilealigner import SoftScoring, GlobalProfileAligner

# Create sequences to be aligned.
a = Sequence('what a beautiful day'.split())
b = Sequence('what a disappointingly bad day'.split())
print('Sequence A:', a)
print('Sequence B:', b)
print()

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)
print('Encoded A:', aEncoded)
print('Encoded B:', bEncoded)
print()

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, alignments = aligner.align(aEncoded, bEncoded, backtrace=True)

# Create sequence profiles out of alignments.
profiles = [Profile.fromSequenceAlignment(a) for a in alignments]
for encoded in profiles:
    profile = v.decodeProfile(encoded)
    print(profile)
Example #23
pdist = pairwise_distances(embed, metric="cosine")

triu_inds = np.triu_indices_from(pdist, k=1)
all_path_dists = pdist[triu_inds]

med = np.median(all_path_dists)
# %% [markdown]
# ##

# from skbio.sequence import Sequence
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary

seqs = []
for p in paths:
    s = Sequence(p)
    seqs.append(s)

v = Vocabulary()
encoded_seqs = [v.encodeSequence(s) for s in seqs]


class SimpleScoring:
    def __init__(self, matchScore, mismatchScore):
        self.matchScore = matchScore
        self.mismatchScore = mismatchScore

    def __call__(self, firstElement, secondElement):
        if firstElement == secondElement:
            return self.matchScore
        else:
            return self.mismatchScore
Example #24
def text_to_text_alignment_and_score(text_ref, text_pred):
    """
    Find a word to word alignment between two texts, considering the first is 
    the reference and the second the predicted
    :param text_ref: text reference
    :param text_pred: predicted text
    :return: 
    """

    text_ref = text_ref.lower()
    text_pred = text_pred.lower()
    iterable = [".", ","]
    # strip junk characters (commas and periods) from the reference text
    translation_map = str.maketrans(to_translation_map(iterable))
    text_ref = text_ref.translate(translation_map)

    # Create sequences to be aligned.
    a = Sequence(text_ref.split())
    b = Sequence(text_pred.split())

    # Create a vocabulary and encode the sequences.
    v = Vocabulary()
    a_enc = v.encodeSequence(a)
    b_enc = v.encodeSequence(b)
    # Create a scoring and align the sequences using global aligner.
    scoring = SimpleScoring(1, 0)
    aligner = GlobalSequenceAligner(scoring, 0)
    f, score, encodeds = aligner.align(a_enc,
                                       b_enc,
                                       text_ref.split(),
                                       text_pred.split(),
                                       backtrace=True)

    # get the first alignment if exists:
    #print(encodeds[0])
    print(encodeds)

    if len(encodeds[0]) > 0:
        alignment = v.decodeSequenceAlignment(encodeds[0])
        print(alignment)
        ##fix first and last missing words of asr text
        list_asr = []
        list_pred = []
        for word in text_pred.split():
            if word != alignment.second.elements[0]:
                list_asr.append(word)
                list_pred.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        list_asr = []
        list_pred = []
        for word in reversed(text_pred.split()):
            if word != alignment.second.elements[-1]:
                list_asr = [word] + list_asr
                list_pred.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        #fix first and last missing words of reference text
        list_asr = []
        list_pred = []
        for word in text_ref.split():
            if word != alignment.first.elements[0]:
                list_pred.append(word)
                list_asr.append('-')
            else:
                alignment.second.elements = list_asr + alignment.second.elements
                alignment.first.elements = list_pred + alignment.first.elements
                break
        list_asr = []
        list_pred = []
        for word in reversed(text_ref.split()):
            if word != alignment.first.elements[-1]:
                list_pred = [word] + list_pred
                list_asr.append('-')
            else:
                alignment.second.elements = alignment.second.elements + list_asr
                alignment.first.elements = alignment.first.elements + list_pred
                break
        #print(alignment.second.elements)
        #print(alignment.first.elements)
        print(alignment)
        rec = alignment.score * 100 / len(text_ref.split())
        pre = alignment.score * 100 / len(text_pred.split())
    else:
        alignment = []
        rec, pre = 0, 0

    return alignment, rec, pre
Example #25
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

# Create sequences to be aligned.
a = Sequence('1 1'.split())
b = Sequence('1 2 2 2'.split())

# Create a vocabulary and encode the sequences.
v = Vocabulary()
aEncoded = v.encodeSequence(a)
bEncoded = v.encodeSequence(b)

# Create a scoring and align the sequences using global aligner.
scoring = SimpleScoring(2, -1)
aligner = GlobalSequenceAligner(scoring, -2)
score, encodeds = aligner.align(aEncoded, bEncoded, backtrace=True)

# Iterate over optimal alignments and print them.
for encoded in encodeds:
    alignment = v.decodeSequenceAlignment(encoded)
    print(alignment)
    print('Alignment score:', alignment.score)
    print('Percent identity:', alignment.percentIdentity())
    print()

from dtpattern.alignment.align3 import Needleman, Hirschberg
seqa = list('1112')
seqb = list('1222')

# Align using Needleman-Wunsch algorithm.