예제 #1
0
    def __init__(self, seq1: SequenceTranslation, seq2: SequenceTranslation,
                 match_dict: dict, open_gap_penalty: float, extend_gap_penalty: float):
        """Locally align the amino-acid sequences of two translations.

        Runs ``align.localds`` on ``seq1.seq_aa`` and ``seq2.seq_aa`` with
        ``one_alignment_only=True`` and stores the outcome on the instance.

        :param seq1: first translation; its ``seq_aa`` attribute is aligned
        :param seq2: second translation; its ``seq_aa`` attribute is aligned
        :param match_dict: substitution scores, passed through to ``localds``
        :param open_gap_penalty: gap-open penalty, passed through to ``localds``
        :param extend_gap_penalty: gap-extension penalty, passed through to
            ``localds``

        Attributes set on the instance:
            - ``seq1`` / ``seq2``: the inputs, unchanged
            - ``value``: third field of the first alignment, or ``-inf`` when
              no alignment was returned
            - ``alignment_aa``: the first alignment tuple, or ``None`` when no
              alignment was returned
        """
        self.seq1 = seq1
        self.seq2 = seq2

        alignments = align.localds(seq1.seq_aa, seq2.seq_aa, match_dict,
                                   open_gap_penalty, extend_gap_penalty,
                                   one_alignment_only=True)

        if not alignments:
            self.value = float('-inf')
            # Always define the attribute so readers can test it against None
            # instead of hitting AttributeError on alignment-less instances.
            self.alignment_aa = None
        else:
            best = alignments[0]
            _, _, self.value, _, _ = best
            self.alignment_aa = best
예제 #2
0
def format_alignment(mol1: Molecule, mol2: Molecule):
    '''### Align two molecules and render the alignment as text
        #### params:
        - mol1, mol2: molecules to align; their `seq`, `dbname` and `name`
          attributes are read

        *returns* -> dict with two keys:
        - `text`: a header line followed by the alignment in blocks of at
          most 60 columns (sequence 1, match line, sequence 2, blank line)
        - `identity`: fraction (0.0 - 1.0) of compared columns in which both
          sequences carry the same character

        *raises* -> IndexError when `align.localds` returns no alignment
    '''
    alignment = align.localds(mol1.seq, mol2.seq, blosum62, -12, -4)

    # The rendering's first three lines are: sequence 1, match markers,
    # sequence 2.
    rendered = pairwise2.format_alignment(*alignment[0]).split('\n')
    seq_mol1 = rendered[0]
    result_raw = rendered[1]
    seq_mol2 = rendered[2]

    # Columns whose sequence-1 character is a digit or a space are skipped
    # (presumably the position prefix / padding emitted by the formatter);
    # every other column counts as either a match or an error.
    count = 0
    errors = 0
    for res1, res2 in zip(seq_mol1, seq_mol2):
        if res1.isnumeric() or res1 == ' ':
            continue
        if res1 == res2:
            count += 1
        else:
            errors += 1

    # Slice-based chunking also emits the final, shorter block. The previous
    # loop only flushed a block on every 60th column and silently dropped the
    # remainder.
    width = 60
    body = ''.join(
        f"{seq_mol1[start:start + width]}\n"
        f"{result_raw[start:start + width]}\n"
        f"{seq_mol2[start:start + width]}\n\n"
        for start in range(0, len(seq_mol1), width)
    )

    compared = count + errors
    # Guard against a rendering with no comparable columns.
    identity = count / compared if compared else 0.0
    header = "< %s - %s | %s | %.1f%%\n" % (mol1.dbname, mol2.dbname,
                                            mol1.name, identity * 100)

    return {'text': header + body, 'identity': identity}
예제 #3
0
def findBestAlignment(seq, query, dna=False, offset=0, show=False):
    """
    Locally align ``query`` against ``seq`` and report where the best hit lies.

    :param seq: sequence to search in; for protein input every '*' is
        replaced by 'X' before aligning
    :param query: sequence to look for
    :param dna: if True align with ``align.localms`` (1, -2, -2, -2),
        otherwise with ``align.localds`` (blosum62, -100, -100)
    :param offset: value added to both reported coordinates
    :param show: if True print the formatted best alignment and its raw tuple
    :return: (start, end, gapped), 1-based; (-1, -1, True) when no alignment
        was produced
    """
    if dna:
        candidates = align.localms(seq, query, 1, -2, -2, -2)
    else:
        candidates = align.localds(seq.replace('*', 'X'), query,
                                   matlist.blosum62, -100, -100)

    scores = [candidate[2] for candidate in candidates]
    if not scores:
        return -1, -1, True

    # first alignment carrying the highest score
    winner = candidates[scores.index(max(scores))]
    if show:
        print(format_alignment(*winner))
        print(winner)

    alignedSeq = winner[0]
    alnBegin = winner[-2]
    alnEnd = winner[-1]

    # The start is taken from extend5align rather than from the raw alignment
    # start, so that leading mismatches (possibly mutations) are included:
    #     0123456
    # eg: GGGGACGTACGTACGT
    #           ||||||||||
    #     ----CAGTACGTACGT
    # the alignment starts at pos 6, but the reported start is pos 4
    start = extend5align(winner) + offset + 1  # 1-based start
    end = int(offset + alnEnd)  # 1-based end

    # '-' characters in the aligned seq do not exist in the real sequence, so
    # the ones up to each coordinate are subtracted again
    # eg: -GGGACGTACGTACGT
    #      |||||||||||||||
    #     GGGACAGTACGTACGT
    # should start at 1, not 2
    gapped = '-' in alignedSeq
    if gapped:
        start -= alignedSeq[:alnBegin + 1].count('-')
        end -= alignedSeq[:alnEnd + 1].count('-')

    return start, end, gapped  # 1-based
예제 #4
0
파일: Uniprot.py 프로젝트: yaz62/rhapsody
 def _align(self, seqU, seqC, PDBresids, print_info=False):
     """Align a Uniprot sequence with a PDB chain sequence and map residues.

     The aligner is picked from the first item of ``self._align_algo_args``
     ('localxx', 'localxs', anything else falls back to ``localds``); the
     remaining items and ``self._align_algo_kwargs`` are forwarded to it.

     :param seqU: Uniprot sequence
     :param seqC: PDB chain sequence
     :param PDBresids: residue identifiers of the chain, indexed in the
         order of the non-gap characters of the aligned chain sequence
     :param print_info: when exactly ``True``, log the formatted alignment
         and the number of identical columns
     :return: tuple of (first two fields of the first alignment, dict that
         maps the 1-based Uniprot position to ``(PDB resid, chain residue)``
         for every column where neither sequence has a gap)
     """
     algo_spec = self._align_algo_args
     algo = algo_spec[0]
     extra_args = algo_spec[1:]
     extra_kwargs = self._align_algo_kwargs

     # pick the aligner first, then call it from a single place
     if algo == 'localxx':
         aligner = bioalign.localxx
     elif algo == 'localxs':
         aligner = bioalign.localxs
     else:
         aligner = bioalign.localds
     al = aligner(seqU, seqC, *extra_args, **extra_kwargs)

     top = al[0]
     aligned_seqU = top[0]
     aligned_seqC = top[1]

     if print_info is True:
         info = format_alignment(*top)
         LOGGER.info(info[:-1])
         idnt = sum(1 for a1, a2 in zip(aligned_seqU, aligned_seqC)
                    if a1 == a2)
         frac = idnt / len(seqC)
         m = ("{} out of {} ({:.1%}) residues".format(idnt, len(seqC), frac)
              + " in the chain are identical to Uniprot amino acids.")
         LOGGER.info(m)

     # walk the alignment columns, counting non-gap characters on each side
     mp = {}
     resid_U = 0        # 1-based position in the Uniprot sequence
     resindx_PDB = 0    # 0-based index into PDBresids
     for col, aaU in enumerate(aligned_seqU):
         aaC = aligned_seqC[col]
         chain_has_residue = aaC != '-'
         if aaU != '-':
             resid_U += 1
             if chain_has_residue:
                 mp[resid_U] = (PDBresids[resindx_PDB], aaC)
         if chain_has_residue:
             resindx_PDB += 1
     return top[:2], mp
예제 #5
0
def findBestMatchedPattern(seq, patterns, extend5end=False):
    """
    Find the best matched pattern in a list of patterns and locate the first
    mismatch / indel of that match.

    :param seq: nucleotide sequence (upper-cased before it is aligned)
    :param patterns: zip iterator (or list) of
        (pattern_id, pattern_seq, pattern_max_IUPAC_score)
    :param extend5end: the local alignment does not cover mismatches and gaps
        that precede the aligned region. Set this flag to report the start
        returned by extend5align instead of the raw alignment start
    :return: tuple of (pattern_id, mismatch_position, indel_position,
        start_pos (inclusive), end_pos (exclusive)).
        For example (Oligo1H, 0, 0, 0, 15) means pattern id Oligo1H has the
        best match with 0 indel/mismatches and the alignment spans
        primer_seq[0:15]. If there is no usable alignment,
        (str(nan), 0, 0, -1, -1) is returned

    Note: 0) mismatch_position and indel_position are 1-based; 0 means
             "no mismatch" / "no indel"
          1) pattern_id = 'nan' => there was no suitable hit; the two zero
             positions must then not be read as "mismatch/indel at pos 0"
    """
    NO_MATCH = (str(nan), 0, 0, -1, -1)

    # Materialise once: the patterns are iterated here and indexed again
    # further down, which a one-shot iterator (e.g. Python 3 zip) cannot do.
    patterns = list(patterns)

    scores = []
    # align the sequence against all possible patterns
    for (patternId, pattern, maxScore) in patterns:
        alignments = align.localds(seq.upper(), pattern, subMatIUPAC, -5, -5)
        if not alignments:
            # NOTE(review): a single pattern without any alignment aborts the
            # whole search, even if other patterns would match. This is the
            # pre-existing behaviour - confirm it is intended rather than
            # skipping just this pattern.
            return NO_MATCH
        # first alignment with the highest score
        alignment = max(alignments, key=lambda a: a[2])
        scores.append((patternId, alignment))

        # if the sequence exactly matches one of the patterns (i.e. got the
        # max possible score from the matrix) ==> intact, return immediately
        alignLen = alignment[-1] - alignment[-2]
        if (alignment[2] == maxScore and alignLen == len(pattern)
                and '-' not in alignment[0] and '-' not in alignment[1]):
            return patternId, 0, 0, alignment[-2], alignment[-1]

    # if no exact matching ==> find the best alignment (pattern)
    if not scores:
        return NO_MATCH
    # A real list is required here: Python 3's map() returns an iterator that
    # has no .index() and would be exhausted by max().
    alignScores = [entry[1][2] for entry in scores]
    bestInd = alignScores.index(max(alignScores))

    bestId = scores[bestInd][0]
    # mutable copy: [seq, pattern, score, matchstart, matchend]
    bestAln = list(scores[bestInd][1])
    SEQ, PTN, SCORE, MSTART, MEND = range(5)

    if bestAln[SCORE] == 0:
        return NO_MATCH

    # classify the alignment type ==> insertion, deletion, mismatches

    # remove the columns in which the pattern starts with gaps
    leadingGaps = len(bestAln[PTN]) - len(bestAln[PTN].lstrip('-'))
    if leadingGaps:
        bestAln[SEQ] = bestAln[SEQ][leadingGaps:]
        bestAln[PTN] = bestAln[PTN][leadingGaps:]

    # TODO: revise algorithm
    # find the location of insertion or deletion (-1 ==> none found)
    delPos = bestAln[SEQ].find('-')
    ptnGap = bestAln[PTN].find('-')
    # if there is a gap at the beginning ==> happened because of
    # insertion/deletion in the middle
    if (ptnGap != -1 and ptnGap > delPos and bestAln[MSTART] > 0
            and bestAln[MEND] == len(bestAln[SEQ])):
        # -1 because this branch originally had no +1 (unlike the others)
        delPos = ptnGap - 1
    # if a gap at the end ==> deletion in the middle
    elif ptnGap != -1 and ptnGap + 1 < delPos:
        delPos = ptnGap

    # find the location of mismatch
    misPos = -1
    patternSeq = patterns[bestInd][1]
    # if it is Mismatched ==> length of alignment == length of pattern
    if len(bestAln[SEQ]) == len(patternSeq):
        misPos = 0
        # NOTE(review): when every column scores 5 the scan ends with
        # misPos == len(seq), i.e. position len(seq) + 1 is reported.
        while misPos < len(bestAln[SEQ]):
            # 5 is max score in the substitution matrix
            if subMatIUPAC[(bestAln[SEQ][misPos], patternSeq[misPos])] != 5:
                break
            misPos += 1
    # TODO: revise algorithm

    # Example of the collected (id, alignment) entries for
    # seq = GGCCATCGGTCTCCCCC:
    # [('alice', ('GGCCATCGGTCTCCCCC', 'GGTCACYG-TCTCYTCA', 43.0, 0, 16)),
    #  ('bob', ('--GG-CCATC-GGT-CTCCCCC', 'CAGGTBCAGCTGGTGCA-----', 31.0, 2, 16)),
    #  ('con', ('---GGCCATC-GGT-CTCCCCC', 'CARATGCAGCTGGTGCA-----', 21.0, 6, 16)),
    #  ('den', ('--GG-CCATC-GGT-CTCCCCC', 'SAGGTCCAGCTGGTACA-----', 31.0, 2, 16)),
    #  ('fur', ('GGCCATCGGTCTCCCCC-----', '---CA--GRTCACCTTGAAGGA', 26.0, 3, 14))]

    # positions are reported 1-based, hence the +1
    if extend5end:
        start = extend5align(bestAln)
    else:
        # don't need to extend 5'end
        start = bestAln[MSTART]
    return bestId, misPos + 1, delPos + 1, start, bestAln[MEND]
예제 #6
0
def get_padding_seqs(ref, sort, index=0):
    """Return the two aligned sequences of one local alignment.

    Aligns ``ref`` and ``sort`` with ``align.localds`` (blosum62, gap open
    -10, gap extend -1) and picks the alignment at position ``index``.

    :param ref: first sequence
    :param sort: second sequence
    :param index: which of the returned alignments to use
    :return: tuple of (aligned ref, aligned sort)
    """
    all_alignments = align.localds(ref, sort, blosum62, -10, -1)
    chosen = all_alignments[index]
    # only the first two of the five alignment fields are returned
    pref, psort, _, _, _ = chosen
    return pref, psort