def __init__(self, seq1: SequenceTranslation, seq2: SequenceTranslation, match_dict: dict, open_gap_penalty: float, extend_gap_penalty: float):
    """Locally align the amino-acid sequences of ``seq1`` and ``seq2``.

    The alignment is computed with ``align.localds`` on ``seq1.seq_aa`` and
    ``seq2.seq_aa``, requesting a single alignment only.

    Args:
        seq1: First sequence; its ``seq_aa`` attribute is aligned.
        seq2: Second sequence; its ``seq_aa`` attribute is aligned.
        match_dict: Substitution scores passed through to ``align.localds``.
        open_gap_penalty: Gap-open penalty passed through to the aligner.
        extend_gap_penalty: Gap-extension penalty passed through to the aligner.

    Attributes set:
        seq1, seq2: The inputs, stored unchanged.
        value: Score of the alignment, or ``-inf`` when none was found.
        alignment_aa: The alignment tuple, or ``None`` when none was found.
    """
    self.seq1 = seq1
    self.seq2 = seq2

    alignments = align.localds(
        seq1.seq_aa,
        seq2.seq_aa,
        match_dict,
        open_gap_penalty,
        extend_gap_penalty,
        one_alignment_only=True,
    )

    if len(alignments) == 0:
        self.value = float('-inf')
        # Always define the attribute so readers get None instead of an
        # AttributeError when the aligner returned nothing.
        self.alignment_aa = None
    else:
        # Alignment tuples are (seqA, seqB, score, begin, end).
        _, _, self.value, _, _ = alignments[0]
        self.alignment_aa = alignments[0]
def format_alignment(mol1: Molecule, mol2: Molecule):
    """Align two molecules locally and render the result as wrapped text.

    The sequences ``mol1.seq`` and ``mol2.seq`` are aligned with
    ``align.localds`` using ``blosum62`` and gap penalties of -12 (open)
    and -4 (extend). The first alignment returned is formatted with
    ``pairwise2.format_alignment`` and wrapped into blocks of 60 columns.

    Identity is the number of columns where both formatted lines carry the
    same character, divided by the number of compared columns. Columns
    where the first line holds a digit or a space are not compared.

    Args:
        mol1: First molecule; ``seq``, ``dbname`` and ``name`` are read.
        mol2: Second molecule; ``seq`` and ``dbname`` are read.

    Returns:
        dict with keys ``'text'`` (header line followed by the wrapped
        alignment) and ``'identity'`` (fraction between 0 and 1; 0.0 when
        nothing could be compared).
    """
    line_width = 60
    alignment = align.localds(mol1.seq, mol2.seq, blosum62, -12, -4)

    body = ''
    matches = 0
    mismatches = 0

    # No alignment at all: report zero identity instead of raising.
    if alignment:
        lines = pairwise2.format_alignment(*alignment[0]).split('\n')
        seq_mol1, result_raw, seq_mol2 = lines[0], lines[1], lines[2]

        for res1, res2 in zip(seq_mol1, seq_mol2):
            # Skip the position numbers / padding of the formatted line.
            if res1.isnumeric() or res1 == ' ':
                continue
            if res1 == res2:
                matches += 1
            else:
                mismatches += 1

        # Slice in fixed-width windows so the final, shorter block is
        # emitted too (it used to be dropped).
        blocks = []
        for start in range(0, len(seq_mol1), line_width):
            stop = start + line_width
            blocks.append(
                f"{seq_mol1[start:stop]}\n{result_raw[start:stop]}\n{seq_mol2[start:stop]}\n\n"
            )
        body = ''.join(blocks)

    compared = matches + mismatches
    identity = matches / compared if compared else 0.0

    header = "< %s - %s | %s | %.1f%%\n" % (mol1.dbname, mol2.dbname, mol1.name, identity * 100)
    return {'text': header + body, 'identity': identity}
def findBestAlignment(seq, query, dna=False, offset=0, show=False):
    """Locate the best local alignment of ``query`` within ``seq``.

    Protein input (``dna=False``) is aligned with ``align.localds`` and
    ``matlist.blosum62`` after replacing every ``'*'`` in ``seq`` by ``'X'``;
    DNA input is aligned with ``align.localms``. Among the returned
    alignments the first one with the highest score is used.

    :param seq: sequence to search in
    :param query: sequence to look for
    :param dna: True to use the nucleotide scoring scheme
    :param offset: value added to both reported coordinates
    :param show: True to print the chosen alignment
    :return: tuple (start, end, gapped) with 1-based coordinates, or
             (-1, -1, True) when the aligner returned nothing
    """
    if dna:
        alignments = align.localms(seq, query, 1, -2, -2, -2)
    else:
        alignments = align.localds(seq.replace('*', 'X'), query, matlist.blosum62, -100, -100)

    if not alignments:
        return -1, -1, True

    # max() keeps the first of several equally scored alignments.
    top = max(alignments, key=lambda aln: aln[2])

    if show:
        print(format_alignment(*top))
        print(top)

    # The reported start is where both sequences begin to line up,
    # leading mismatches included (they may be mutations):
    #       0123456
    #       GGGGACGTACGTACGT
    #             ||||||||||
    #       ----CAGTACGTACGT
    # The alignment proper starts at 6, yet the start is taken as 4.
    start = extend5align(top) + offset + 1  # 1-based start
    end = int(offset + top[-1])  # 1-based end

    # Gap characters in the aligned seq do not exist in the real sequence,
    # so they are subtracted from the coordinates:
    #       -GGGACGTACGTACGT
    #        |||||||||||||||
    #       GGGACAGTACGTACGT
    # must start at 1, not 2.
    aligned_seq = top[0]
    gapped = '-' in aligned_seq
    if gapped:
        start -= aligned_seq[:(top[-2] + 1)].count('-')
        end -= aligned_seq[:(top[-1] + 1)].count('-')

    return start, end, gapped  # 1-based
def _align(self, seqU, seqC, PDBresids, print_info=False):
    """Align a Uniprot sequence with a PDB chain sequence and map residues.

    The algorithm name is the first element of ``self._align_algo_args``;
    the remaining elements and ``self._align_algo_kwargs`` are forwarded to
    the aligner. ``'localxx'`` and ``'localxs'`` select the functions of the
    same name, anything else falls back to ``bioalign.localds``. Only the
    first alignment returned is used.

    Args:
        seqU: Uniprot sequence.
        seqC: PDB chain sequence.
        PDBresids: Sequence indexed in parallel with the non-gap
            characters of the aligned chain sequence.
        print_info: When exactly ``True``, log the alignment and the
            number of identical columns.

    Returns:
        Tuple ``(aligned_pair, mp)`` where ``aligned_pair`` holds the first
        two items of the alignment and ``mp`` maps the 1-based position in
        ``seqU`` to ``(PDBresids entry, chain residue)`` for every column
        in which neither sequence has a gap.
    """
    algo = self._align_algo_args[0]
    args = self._align_algo_args[1:]
    kwargs = self._align_algo_kwargs

    # align Uniprot and PDB sequences
    if algo == 'localxx':
        al = bioalign.localxx(seqU, seqC, *args, **kwargs)
    elif algo == 'localxs':
        al = bioalign.localxs(seqU, seqC, *args, **kwargs)
    else:
        al = bioalign.localds(seqU, seqC, *args, **kwargs)

    best = al[0]
    aligned_seqU = best[0]
    aligned_seqC = best[1]

    if print_info is True:
        info = format_alignment(*best)
        LOGGER.info(info[:-1])
        idnt = sum(1 for a1, a2 in zip(aligned_seqU, aligned_seqC) if a1 == a2)
        frac = idnt / len(seqC)
        m = "{} out of {} ({:.1%}) residues".format(idnt, len(seqC), frac)
        m += " in the chain are identical to Uniprot amino acids."
        LOGGER.info(m)

    # compute mapping between Uniprot and PDB chain resids
    mp = {}
    resid_U = 0
    resindx_PDB = 0
    for aaU, aaC in zip(aligned_seqU, aligned_seqC):
        if aaU != '-':
            resid_U += 1
            if aaC != '-':
                mp[resid_U] = (PDBresids[resindx_PDB], aaC)
        # Advance through PDBresids only on real chain residues; PDBresids
        # is never read on a gap column.
        if aaC != '-':
            resindx_PDB += 1

    return best[:2], mp
def findBestMatchedPattern(seq, patterns, extend5end=False):
    """
    find the best matched pattern in a list of patterns
    and classify the type of the alignment (intact, indelled, mismatched, unknown)

    :param seq: nucleotide sequence
    :param patterns: zip iterator (or list) of (pattern_id, pattern_seq, pattern_max_IUPAC_score).
                     It is copied into a list, so one-shot iterators are accepted
    :param extend5end: since this function uses Local edit distance, it will not favor mismatches and gaps earlier
                       than the alignment. Use this flag to get the 'absolute beginning' of match
    :return: tuple of (pattern_id, mismatch_position, indel_position, start_pos (inclusive), end_pos (exclusive)).
             for example: (Oligo1H, 0, 0, 0, 15) means pattern id Oligo1H has the best match with 0 indel/mismatches
             and alignment starts from index 0 until 15: primer_seq[0:15].
             If no alignment is ideal, returns (str(nan), 0, 0, -1, -1)

    Note:
        0) mismatch_position and indel_position are 1-based index (i.e. starts from 1, not 0) - 0 means no indel/mis
        1) primer_id = 'nan' => there was no suitable hit - mismatches and indel_pos will be left 0,
           but you should (obviously) not interpret that as mismatch at pos 0 or indel at pos 0
        2) mismatch_position = 0 => no mismatches
        3) indel_position = 0 => no indel_position
    """
    NO_MATCH = (str(nan), 0, 0, -1, -1)

    # patterns is indexed again after the loop, which a one-shot iterator
    # (e.g. zip() on Python 3) cannot support.
    patterns = list(patterns)

    scores = []
    # align the sequence against all possible patterns
    for (patternId, pattern, maxScore) in patterns:
        alignments = align.localds(seq.upper(), pattern, subMatIUPAC, -5, -5)
        if len(alignments) > 1:
            localScores = [a[2] for a in alignments]
            alignment = alignments[localScores.index(max(localScores))]
        elif len(alignments) > 0:
            alignment = alignments[0]
        else:
            # NOTE(review): one pattern without any alignment ends the whole
            # search, even if later patterns might match - confirm intent.
            return NO_MATCH

        if alignment:
            alignLen = alignment[-1] - alignment[-2]
            scores.append((patternId, alignment))
            # if the sequence exactly matches one of the patterns (i.e. got the
            # max possible score from the matrix) ==> intact, return immediately
            if (alignment[2] == maxScore and alignLen == len(pattern) and
                    '-' not in alignment[0] and '-' not in alignment[1]):
                return scores[-1][0], 0, 0, alignment[-2], alignment[-1]
        else:
            # placeholder keeps scores aligned index-for-index with patterns
            scores.append((patternId, ('', '', 0)))

    # if no exact matching ==> find the best alignment (pattern)
    if not scores:
        return NO_MATCH
    # A real list is required: map() has no .index() on Python 3.
    alignmentScores = [entry[1][2] for entry in scores]
    bestInd = alignmentScores.index(max(alignmentScores))

    best = list(scores[bestInd])
    best[1] = list(best[1])

    # best = [id, [seq, pattern, score, matchstart, matchend]]
    ID, ALIGNMENT = range(2)
    SEQ, PTN, SCORE, MSTART, MEND = range(5)

    if best[ALIGNMENT][SCORE] == 0:
        return NO_MATCH

    # classify the alignment type ==> insertion, deletion, mismatches
    # Find the position of Indel/Mismatch
    # remove starting indels
    if best[ALIGNMENT][PTN].startswith('-'):
        i = 0
        while best[ALIGNMENT][PTN][i] == '-':
            i += 1
        best[ALIGNMENT][SEQ] = best[ALIGNMENT][SEQ][i:]
        best[ALIGNMENT][PTN] = best[ALIGNMENT][PTN][i:]

    # TODO: revise algorithm
    # find the location of insertion or deletion
    delPos = -1
    if '-' in best[ALIGNMENT][SEQ]:
        delPos = best[ALIGNMENT][SEQ].index('-')
    # if there is a gap at the beginning ==> happened because of insertion/deletion in the middle
    if '-' in best[ALIGNMENT][PTN] and best[ALIGNMENT][PTN].index('-') > delPos \
            and best[ALIGNMENT][MSTART] > 0 and best[ALIGNMENT][MEND] == len(best[ALIGNMENT][SEQ]):
        # -1 because originally had no +1 (whereas the above and below if statements had +1)
        delPos = best[ALIGNMENT][PTN].index('-') - 1
    # if a gap at the end ==> deletion in the middle
    elif '-' in best[ALIGNMENT][PTN] and best[ALIGNMENT][PTN].index('-') + 1 < delPos:
        # and best[1][4] < len(best[1][0]):
        delPos = best[ALIGNMENT][PTN].index('-')

    # find the location of mismatch
    misPos = -1
    bestPattern = patterns[bestInd][1]
    # if it is Mismatched ==> length of alignment == length of pattern
    if len(best[ALIGNMENT][SEQ]) == len(bestPattern):
        misPos = 0
        while misPos < len(best[ALIGNMENT][SEQ]):
            # 5 is max score in the substitution matrix
            if subMatIUPAC[(best[ALIGNMENT][SEQ][misPos], bestPattern[misPos])] != 5:
                break
            misPos += 1

    # TODO: revise algorithm
    # 1-based
    # Example of `scores` for seq GGCCATCGGTCTCCCCC:
    #   [('alice', ('GGCCATCGGTCTCCCCC', 'GGTCACYG-TCTCYTCA', 43.0, 0, 16)),
    #    ('bob', ('--GG-CCATC-GGT-CTCCCCC', 'CAGGTBCAGCTGGTGCA-----', 31.0, 2, 16)),
    #    ('con', ('---GGCCATC-GGT-CTCCCCC', 'CARATGCAGCTGGTGCA-----', 21.0, 6, 16)),
    #    ('den', ('--GG-CCATC-GGT-CTCCCCC', 'SAGGTCCAGCTGGTACA-----', 31.0, 2, 16)),
    #    ('fur', ('GGCCATCGGTCTCCCCC-----', '---CA--GRTCACCTTGAAGGA', 26.0, 3, 14))]
    if extend5end:
        return best[ID], misPos + 1, delPos + 1, extend5align(best[ALIGNMENT]), best[ALIGNMENT][MEND]
    # don't need to extend 5'end
    return best[ID], misPos + 1, delPos + 1, best[ALIGNMENT][MSTART], best[ALIGNMENT][MEND]
def get_padding_seqs(ref, sort, index=0):
    """Return the two gap-padded sequences of one local alignment.

    ``ref`` and ``sort`` are aligned with ``align.localds`` using
    ``blosum62`` and gap penalties of -10 (open) and -1 (extend).

    Args:
        ref: First sequence handed to the aligner.
        sort: Second sequence handed to the aligner.
        index: Position of the alignment to pick from the returned list.

    Returns:
        Tuple holding the first two items of the selected alignment
        (the aligned forms of ``ref`` and ``sort``).
    """
    candidates = align.localds(ref, sort, blosum62, -10, -1)
    chosen = candidates[index]
    # Alignment entries unpack to five fields; only the sequences are kept.
    padded_ref, padded_sort, _score, _begin, _end = chosen
    return padded_ref, padded_sort