def _align_clusters(config, one, two, cutoff=0.3):
    """Constructs a cluster alignment using the given configuration.

    Every gene of `one` that has a translation is aligned against every gene
    of `two` that has a translation. Pairs whose identity is at least
    `cutoff` are stored as links on the returned alignment.

    Args:
        config: Mapping of aligner settings. The optional
            "substitution_matrix" entry names the matrix to load; every
            other entry is set as an attribute on the PairwiseAligner.
            The mapping itself is left untouched.
        one: Query cluster; its `name` and `loci` are read.
        two: Target cluster; its `name` and `loci` are read.
        cutoff: Minimum identity for a gene pair to be linked.

    Returns:
        Alignment holding one link per gene pair that passed the cutoff.
    """
    LOG.info("%s vs %s", one.name, two.name)

    aligner = Align.PairwiseAligner()

    # Work on a copy: popping from the caller's dict would remove the matrix
    # choice, so every later call sharing that config would silently fall
    # back to BLOSUM62.
    settings = dict(config)
    matrix = settings.pop("substitution_matrix", "BLOSUM62")
    if matrix not in substitution_matrices.load():
        LOG.warning(
            "Invalid substitution matrix (%s), defaulting to BLOSUM62",
            matrix)
        matrix = "BLOSUM62"
    aligner.substitution_matrix = substitution_matrices.load(matrix)

    for k, v in settings.items():
        setattr(aligner, k, v)

    alignment = Alignment(query=one, target=two)
    for locusA, locusB in product(one.loci, two.loci):
        for geneA, geneB in product(locusA.genes, locusB.genes):
            # Genes without a protein translation cannot be aligned.
            if not geneA.translation or not geneB.translation:
                continue
            aln = aligner.align(geneA.translation, geneB.translation)
            identity, similarity = compute_identity(aln[0])
            if identity < cutoff:
                continue
            alignment.add_link(geneA, geneB, identity, similarity)
    return alignment
def _align_clusters(config, one, two, cutoff=0.3):
    """Constructs a cluster alignment using the given configuration.

    Every gene of `one` that has a translation is aligned against every gene
    of `two` that has a translation. Pairs whose identity is at least
    `cutoff` are stored as links on the returned alignment.

    Args:
        config: Mapping of aligner settings. The optional
            "substitution_matrix" entry names the matrix to load; every
            other entry is set as an attribute on the PairwiseAligner.
            The mapping itself is left untouched.
        one: Query cluster; its `name` and `loci` are read.
        two: Target cluster; its `name` and `loci` are read.
        cutoff: Minimum identity for a gene pair to be linked.

    Returns:
        Alignment holding one link per gene pair that passed the cutoff.
    """
    LOG.info("%s vs %s", one.name, two.name)

    aligner = Align.PairwiseAligner()

    # Select the substitution matrix.
    # Defaults to BLOSUM62 when none or invalid matrix specified.
    # A copy of the config is used: popping from the caller's dict would
    # remove the matrix choice, so every later call sharing that config
    # would silently fall back to BLOSUM62.
    settings = dict(config)
    matrix = settings.pop("substitution_matrix", "BLOSUM62")
    if matrix not in substitution_matrices.load():
        LOG.warning(
            "Invalid substitution matrix '(%s)', defaulting to BLOSUM62",
            matrix)
        matrix = "BLOSUM62"
    aligner.substitution_matrix = substitution_matrices.load(matrix)

    # ValueError is thrown during sequence alignment when a letter
    # in the sequence is not found in the substitution matrix.
    # Extended IUPAC codes (BXZJUO) are added to mitigate this.
    # NOTE(review): the return value is discarded, so the helper presumably
    # updates the matrix in place - confirm against its definition.
    extend_matrix_alphabet(aligner.substitution_matrix, codes='BXZJUO')

    for k, v in settings.items():
        setattr(aligner, k, v)

    alignment = Alignment(query=one, target=two)
    for locusA, locusB in product(one.loci, two.loci):
        for geneA, geneB in product(locusA.genes, locusB.genes):
            # Genes without a protein translation cannot be aligned.
            if not geneA.translation or not geneB.translation:
                continue
            aln = aligner.align(geneA.translation, geneB.translation)
            identity, similarity = compute_identity(aln[0])
            if identity < cutoff:
                continue
            alignment.add_link(geneA, geneB, identity, similarity)
    return alignment
def align_sequences_match_residues(mobile_seq, target_seq, seq_align_mat='BLOSUM80', gap_penalty=-1.0, verbosity=0):
    """ Align two amino acid sequences using Bio.pairwise2.align.globalds and the substitution matrix
    seq_align_mat, return a tuple with two lists of flags to be used in the 3D alignment (mobile, reference).
    Each list has one boolean per residue of its sequence, True when that residue is paired with a residue
    (not a gap) of the other sequence in the first alignment returned.

    :param str mobile_seq: sequence of mobile protein
    :param str target_seq: sequence of target protein
    :param str seq_align_mat: name of the substitution matrix, loaded with
                              Bio.Align.substitution_matrices.load
    :param float gap_penalty: gap penalty to the alignment, used for both opening and extending a gap; avoid
                              values too low in module
    :param int verbosity: sets the verbosity level
    :raises ImportError: if Biopython cannot be imported
    :raises FileNotFoundError: if the substitution matrix cannot be loaded
    :rtype: tuple
    """

    try:
        from Bio.pairwise2 import align
        from Bio.Align import substitution_matrices
        seq_align_mat = substitution_matrices.load(seq_align_mat)
    except ImportError as error:
        os_util.local_print(
            'Failed to import Biopython with error: {}\nBiopython is necessary for sequence '
            'alignment. Sequences to be aligned:\nReference: {}\nMobile: {}'
            ''.format(error, target_seq, mobile_seq),
            msg_verbosity=os_util.verbosity_level.error, current_verbosity=verbosity)
        # Re-raise the original exception so its details and traceback are kept.
        raise
    except FileNotFoundError as error:
        # seq_align_mat still holds the requested name here because the assignment above failed.
        available_matrices = substitution_matrices.load()
        os_util.local_print(
            'Failed to import substitution matrix {} with error: {}\nSubstitution matrix must be one '
            'of: {}'
            ''.format(seq_align_mat, error, available_matrices),
            msg_verbosity=os_util.verbosity_level.error, current_verbosity=verbosity)
        raise

    # The reference sequence is passed first, so align_result[0] is the aligned reference and align_result[1]
    # the aligned mobile sequence.
    align_result = align.globalds(target_seq, mobile_seq, seq_align_mat, gap_penalty, gap_penalty)[0]
    os_util.local_print(
        'This is the alignment result to be used in protein alignment:\n{}'
        ''.format(align_result),
        msg_verbosity=os_util.verbosity_level.info, current_verbosity=verbosity)

    aligned_pairs = list(zip(align_result[0], align_result[1]))
    # One flag per reference residue: is it matched to a mobile residue?
    ref_align_str = [mob_res != '-' for ref_res, mob_res in aligned_pairs if ref_res != '-']
    # One flag per mobile residue: is it matched to a reference residue?
    mob_align_str = [ref_res != '-' for ref_res, mob_res in aligned_pairs if mob_res != '-']

    return mob_align_str, ref_align_str
def needle_alignment(s1, s2):
    '''
DESCRIPTION

    Does a Needleman-Wunsch Alignment of sequence s1 and s2 and
    returns a Bio.Align.MultipleSeqAlignment object.

    Substitution scores come from BLOSUM62. Pairs that the matrix does not
    know score 1 for identical letters and -4 otherwise. Gap open and
    extend penalties are -10 and -0.5, and a single alignment is requested.

ARGUMENTS

    s1 = str: first sequence

    s2 = str: second sequence
    '''
    from Bio import pairwise2
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    try:
        from Bio.Align import substitution_matrices
    except ImportError:
        # Older Biopython: plain dict keyed by letter pairs.
        from Bio.SubsMat.MatrixInfo import blosum62
    else:
        blosum62 = substitution_matrices.load("BLOSUM62")

    def match_callback(c1, c2):
        # The legacy MatrixInfo dict stores only one triangle of the
        # symmetric matrix, so the reversed pair has to be tried as well
        # before falling back to the default score.
        for pair in ((c1, c2), (c2, c1)):
            score = blosum62.get(pair)
            if score is not None:
                return score
        return 1 if c1 == c2 else -4

    alns = pairwise2.align.globalcs(s1, s2, match_callback, -10., -.5,
                                    one_alignment_only=True)

    return MultipleSeqAlignment([
        SeqRecord(Seq(alns[0][0]), "s1"),
        SeqRecord(Seq(alns[0][1]), "s2"),
    ])
def check_for_qrdr_mutations(hits_dict, contigs, qrdr, min_ident, min_cov):
    """Record GyrA/ParC substitutions at the tracked QRDR positions.

    Each BLAST hit with coverage above `min_cov` is translated, globally
    aligned (BLOSUM62, gap open -10, gap extend -0.5) to the matching
    reference protein, and inspected at the tracked positions. A residue
    that differs from the wild type and is neither '-' nor '.' is reported
    as '<gene>-<position><residue>'.

    :param hits_dict: results mapping; its 'Flq_mutations' entry is extended
        in place when at least one substitution is found
    :param contigs: passed through to run_blastn
    :param qrdr: passed through to run_blastn
    :param min_ident: minimum identity, passed through to run_blastn
    :param min_cov: hits whose coverage is not above this value are skipped
    :raises AssertionError: if a hit's gene_id is neither 'GyrA' nor 'ParC'
    """
    qrdr_loci = {
        'GyrA': [(83, 'S'), (87, 'D')],
        'ParC': [(80, 'S'), (84, 'E')]
    }

    gyra_ref = 'MSDLAREITPVNIEEELKNSYLDYAMSVIVGRALPDVRDGLKPVHRRVLYAMNVLGNDWN' \
               'KAYKKSARVVGDVIGKYHPHGDSAVYDTIVRMAQPFSLRYMLVDGQGNFGSIDGDSAAAM'
    parc_ref = 'MSDMAERLALHEFTENAYLNYSMYVIMDRALPFIGDGLKPVQRRIVYAMSELGLNASAKF' \
               'KKSARTVGDVLGKYHPHGDSACYEAMVLMAQPFSYRYPLVDGQGNWGAPDDPKSFAAMRY'
    references = {'GyrA': gyra_ref, 'ParC': parc_ref}

    blosum62 = substitution_matrices.load('BLOSUM62')

    snps = []

    hits = run_blastn(qrdr, contigs, None, min_ident)
    for hit in hits:
        _, coverage, translation = truncation_check(hit)
        if not coverage > min_cov:
            continue

        if hit.gene_id not in references:
            # Raised explicitly (rather than via `assert False`) so the check
            # also works when Python runs with assertions disabled.
            raise AssertionError('Unexpected QRDR gene: {}'.format(hit.gene_id))
        alignments = pairwise2.align.globalds(references[hit.gene_id], translation,
                                              blosum62, -10, -0.5)

        bases_per_ref_pos = get_bases_per_ref_pos(alignments[0])
        for pos, wt_base in qrdr_loci[hit.gene_id]:
            # Check membership before indexing; the lookup used to run first
            # and failed for positions the alignment does not cover.
            if pos not in bases_per_ref_pos:
                continue
            assembly_base = bases_per_ref_pos[pos]
            if assembly_base != wt_base and assembly_base not in ('-', '.'):
                snps.append(hit.gene_id + '-' + str(pos) + assembly_base)

    if snps:
        hits_dict['Flq_mutations'] += snps
def biopython_align(qseq, tseq, param, table=False, strict=False):
    """
    Aligns the target record against the query record with Biopython's
    PairwiseAligner and prints every resulting alignment.

    `param` supplies the mode, matrix name and gap penalties; its `is_dna`
    attribute is always set and its `matrix` attribute is filled in when
    empty. Each alignment is wrapped in an Alignment and handed to
    print_tabular (table=True) or print_pairwise. Nothing is returned.
    """
    query_text = str(qseq.seq)
    target_text = str(tseq.seq)

    aligner = Align.PairwiseAligner()

    # Only local mode needs to be set; global and semiglobal differ in scoring.
    if param.mode == const.LOCAL_ALIGN:
        aligner.mode = 'local'

    # Guess DNA vs peptide from the first 100 query letters.
    param.is_dna = all(letter in "ATGC" for letter in query_text[:100])

    # Fall back to a default matrix for the guessed sequence type.
    if not param.matrix:
        if param.is_dna:
            param.matrix = 'NUC.4.4'
        else:
            param.matrix = 'BLOSUM62'
    aligner.substitution_matrix = substitution_matrices.load(param.matrix)

    # Penalties are stored as positive numbers, scores are negative.
    aligner.open_gap_score = -param.gap_open
    aligner.extend_gap_score = -param.gap_extend

    # Strict mode penalizes end gaps like internal ones; otherwise they are free.
    if not strict:
        aligner.target_end_gap_score = 0.0
        aligner.query_end_gap_score = 0.0
    else:
        aligner.target_end_open_gap_score = -param.gap_open
        aligner.target_end_extend_gap_score = -param.gap_extend
        aligner.query_end_open_gap_score = -param.gap_open
        aligner.query_end_extend_gap_score = -param.gap_extend

    # Semiglobal mode frees the end gaps even when strict was requested.
    if param.mode == const.SEMIGLOBAL_ALIGN:
        aligner.target_end_gap_score = 0.0
        aligner.query_end_gap_score = 0.0

    # Biopython aligns target to query.
    raw_alignments = aligner.align(target_text, query_text)

    printer = print_tabular if table else print_pairwise

    for index, raw in enumerate(raw_alignments):
        # Re-wrap each alignment in the more detailed project class.
        detailed = Alignment(qseq=qseq, tseq=tseq, aln=raw, param=param)
        printer(detailed, param=param, index=index)
def map_seqs(obj, ref, segid_obj=None, segid_ref=None, matrix='BLOSUM62'):
    """
    Align `obj` against `ref` and map aligned positions of obj onto ref.

    The best-scoring alignment among the first 101 returned by the aligner
    is used. Without `segid_obj` the result is
    {0-based pos in obj: 0-based pos in ref}; with it, the result is
    {(segid_obj, pos in obj): (segid_ref, pos in ref)}.
    An empty dict is returned when the aligner yields no alignment.
    """
    aligner = Align.PairwiseAligner()
    aligner.substitution_matrix = substitution_matrices.load(matrix)

    # Start from "nothing found" instead of a score of 0, so an alignment
    # with a zero or negative score is still usable rather than leaving a
    # placeholder that has no `.aligned` attribute.
    best_aln = None
    for count, aln in enumerate(aligner.align(str(obj), str(ref))):
        if best_aln is None or aln.score > best_aln.score:
            best_aln = aln
        if count >= 100:  # look at a bounded number of alignments only
            break

    t2q = {}
    if best_aln is not None:
        obj_blocks, ref_blocks = best_aln.aligned[0], best_aln.aligned[1]
        for obj_block, ref_block in zip(obj_blocks, ref_blocks):
            for x, y in zip(range(*obj_block), range(*ref_block)):
                t2q[x] = y

    if segid_obj is None:
        return t2q
    return {(segid_obj, k): (segid_ref, v) for k, v in t2q.items()}
def load_matrix(name: str) -> substitution_matrices.Array:
    """
    Fetch one of the substitution matrices bundled with Biopython.

    :param name: Name of the bundled matrix to fetch.
    :return: The matrix loaded under that name.
    """
    matrix = substitution_matrices.load(name)
    return matrix
def set_scores():
    """Configure `aligner` from the settings mapping `object`.

    Both names come from the enclosing scope. The settings read are
    'alignment_type', 'substitution_matrix', 'match_score',
    'mismatch_score', 'score_schema', 'gap_score' and, for the complex
    schema, the twelve '<target|query>_<internal|left|right>_<open|extend>_gap_score'
    entries.

    Raises:
        ValueError: if 'alignment_type' is neither "local" nor "global", or
            'score_schema' is neither 'simple' nor 'complex'.
        FileNotFoundError: if the named substitution matrix cannot be loaded.
    """
    # Alignment type: the aligner is only switched for "local"; "global" is
    # accepted as-is.
    if object['alignment_type'] == "local":
        aligner.mode = 'local'
    elif object['alignment_type'] != "global":
        raise ValueError(
            f"Alignment type {object['alignment_type']} asked is not available or does not exist"
        )

    # Use the chosen substitution matrix; without one, fall back to plain
    # match/mismatch scores.
    if object['substitution_matrix']:
        from Bio.Align import substitution_matrices
        try:
            aligner.substitution_matrix = substitution_matrices.load(
                object['substitution_matrix'])
        except Exception as error:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are no longer converted; the cause is kept for debugging.
            raise FileNotFoundError(
                f"There's No {object['substitution_matrix']} matrix") from error
    else:
        aligner.match_score = object['match_score']
        aligner.mismatch_score = object['mismatch_score']

    # Gap scoring schema. The validation below used to sit in an unreachable
    # branch of the matrix check (and read a misspelled key), so an unknown
    # schema was silently ignored.
    score_schema = object['score_schema']
    if score_schema == 'simple':
        aligner.gap_score = object['gap_score']
    elif score_schema == 'complex':
        for sequence in ('target', 'query'):
            for region in ('internal', 'left', 'right'):
                for action in ('open', 'extend'):
                    setting = f'{sequence}_{region}_{action}_gap_score'
                    setattr(aligner, setting, object[setting])
    else:
        raise ValueError(
            f"Score schema must be 'simple' or 'complex' not {score_schema}"
        )
def matrix_offer():
    """
    Lists the substitution matrices that can be loaded.

    :return: Nothing; one line per matrix is printed, numbered from 1.
    """
    names = substitution_matrices.load()
    for position, name in enumerate(names, start=1):
        print(f"{position} {name}")
def scores_pairwise(ref: str, seq: str):
    """
    Build the table of BLOSUM50 substitution scores between two sequences.

    :return: array of shape (len(seq), len(ref)); entry [i, j] is the score
        for pairing reference letter `ref[j]` with `seq[i]`
    """
    blosum50 = substitution_matrices.load('BLOSUM50')
    table = np.empty((len(seq), len(ref)))
    # Each row of the table is a view, so filling it writes into `table`.
    for row, seq_letter in zip(table, seq):
        row[:] = [blosum50[(ref_letter, seq_letter)] for ref_letter in ref]
    return table
def align(seqA, seqB, sigma=5):
    """Print the top-scoring local alignment of two sequences under PAM250.

    `sigma` is the gap penalty; its sign is ignored and the same negative
    value is used for opening and extending a gap.
    """
    penalty = -abs(sigma)
    pam250 = substitution_matrices.load("PAM250")
    candidates = pairwise2.align.localds(
        seqA, seqB, match_dict=pam250, open=penalty, extend=penalty)
    # Highest score first; ties keep the order pairwise2 returned them in.
    ranked = sorted(candidates, key=lambda aln: aln.score, reverse=True)
    best = ranked[0]
    print(pairwise2.format_alignment(*best))
def BLOSUM45_score_dist(s1, s2):
    """Return a score-based distance between two sequences.

    The global BLOSUM45 alignment score of s1 against s2 is divided by the
    larger of the two self-alignment scores, and the ratio is subtracted
    from 1. Gaps are opened with a score of -10.
    """
    aligner = Align.PairwiseAligner()
    aligner.mode = "global"
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM45")
    aligner.open_gap_score = -10

    cross_score = aligner.score(s1, s2)
    self_scores = (aligner.score(s1, s1), aligner.score(s2, s2))
    return 1 - cross_score / max(self_scores)
def matrix_choice(num: int) -> str:
    """
    Returns a name of the matrix based on its number in the list from the matrix_offer function.

    :param num: The 1-based position of the matrix in the list from the matrix_offer function.
    :return: Name of the matrix at a given position.
    :raises IndexError: If num is smaller than 1 or larger than the number of matrices.
    """
    mxs = substitution_matrices.load()
    # Without this check, 0 and negative numbers would wrap around and
    # silently return a matrix from the end of the list.
    if not 1 <= num <= len(mxs):
        raise IndexError(
            f"Matrix number {num} is out of range (1-{len(mxs)})")
    return mxs[num - 1]
def _get_protein_similarity(self, seq1, seq2, matrix="BLOSUM62", gap_open=-10, gap_extend=-0.5):
    """Return the global alignment score of seq1 vs seq2 per residue of seq1.

    The sequences are aligned with pairwise2.align.globalds using the named
    substitution matrix and the given gap penalties; the score of the first
    alignment returned is divided by len(seq1).
    """
    scoring = substitution_matrices.load(name=matrix)
    best = pairwise2.align.globalds(seq1, seq2, scoring, gap_open, gap_extend)[0]
    # Alignment tuple layout: (aligned seq1, aligned seq2, score, begin, end).
    _, _, raw_score, _, _ = best
    return raw_score / len(seq1)
def align_sequences(self, structA, structB):
    """
    Globally aligns the amino-acid sequences of two structures with
    pairwise2.align.globalds (BLOSUM62, gap open -10.0, gap extend -0.5,
    end gaps not penalized) and derives a residue mapping from the first
    alignment.

    Returns a dict that maps residue numbers of structA to the residue
    numbers of structB they are aligned with; residues facing a gap are
    left out.
    """

    def _residue_table(structure):
        """
        Lists (residue number, one-letter code) for each amino-acid residue;
        residue names missing from aa3to1 become "X".
        """
        table = []
        for residue in structure.get_residues():
            if is_aa(residue):
                table.append((residue.id[1], aa3to1.get(residue.resname, "X")))
        return table

    residues_A = _residue_table(structA)
    residues_B = _residue_table(structB)

    sequence_A = "".join(letter for _, letter in residues_A)
    sequence_B = "".join(letter for _, letter in residues_B)

    alns = pairwise2.align.globalds(
        sequence_A,
        sequence_B,
        substitution_matrices.load("BLOSUM62"),
        one_alignment_only=True,
        open=-10.0,
        extend=-0.5,
        penalize_end_gaps=(False, False),
    )

    aligned_A, aligned_B, _score, _begin, _end = alns[0]

    # Walk the alignment columns while tracking the position reached in
    # each ungapped sequence.
    mapping = {}
    pos_A = 0
    pos_B = 0
    for letter_A, letter_B in zip(aligned_A, aligned_B):
        if letter_A == "-":
            # Gap in A: only B advances (a gap in both advances neither).
            if letter_B != "-":
                pos_B += 1
            continue
        if letter_B == "-":
            # Gap in B: only A advances.
            pos_A += 1
            continue
        # Both columns hold a residue: sanity-check and record the pair.
        assert residues_A[pos_A][1] == letter_A
        assert residues_B[pos_B][1] == letter_B
        mapping[residues_A[pos_A][0]] = residues_B[pos_B][0]
        pos_A += 1
        pos_B += 1

    return mapping
def __load_matrix(self, mname):
    """Load a substitution matrix and cache it as a two-letter-string lookup.

    `mname` is resolved through `self.mnames`. Every letter pair of the
    loaded matrix is stored under both orderings (e.g. "AR" and "RA"). A
    KeyError is raised when the two orderings carry different scores. The
    resulting dict is registered in the cache under the resolved name and
    its lower- and upper-case forms.
    """
    canonical = self.mnames[mname]
    source = substitution_matrices.load(canonical)

    lookup = {}
    for pair, score in source.items():
        mirrored = pair[::-1]
        # Refuse matrices that are not symmetric.
        if mirrored in source and source[mirrored] != score:
            raise KeyError((pair, score, mirrored, source[mirrored]))
        lookup["".join(pair)] = score
        lookup["".join(mirrored)] = score

    for alias in (canonical, canonical.lower(), canonical.upper()):
        self.__matrices[alias] = lookup
def __init__(self):
    """Index the available matrix names and start with an empty cache.

    `self.mnames` maps each available matrix name, plus its upper- and
    lower-case spellings, to the name as reported by Biopython.
    """
    available = substitution_matrices.load()
    aliases = {}
    for canonical in available:
        for spelling in (canonical, canonical.upper(), canonical.lower()):
            aliases[spelling] = canonical
    self.mnames = aliases
    self.__matrices = {}
def align(seqA, seqB, opening=11, extension=1):
    """Print the top-scoring global alignment of two sequences under BLOSUM62.

    `opening` and `extension` are gap penalties; their sign is ignored and
    the negative values are passed to create_gap to build the gap functions
    used for both sequences.
    """
    open_penalty = -abs(opening)
    extend_penalty = -abs(extension)
    blosum62 = substitution_matrices.load("BLOSUM62")
    candidates = pairwise2.align.globaldc(
        seqA, seqB,
        match_dict=blosum62,
        gap_A_fn=create_gap(open_penalty, extend_penalty),
        gap_B_fn=create_gap(open_penalty, extend_penalty))
    # Highest score first; ties keep the order pairwise2 returned them in.
    ranked = sorted(candidates, key=lambda aln: aln.score, reverse=True)
    best = ranked[0]
    print(pairwise2.format_alignment(*best))
def alignUsingLinearSpace(v,w, replace_score = substitution_matrices.load("BLOSUM62"), indel_cost = 5):
    """Recursively walk the alignment of v and w by repeatedly locating a middle edge.

    Parameters:
        v              First sequence
        w              Second sequence
        replace_score  Substitution scores, forwarded to FindMiddleEdge. The
                       default (BLOSUM62) is loaded once, when this `def`
                       statement is executed, not on every call.
        indel_cost     Cost of an insertion/deletion, forwarded to FindMiddleEdge

    NOTE(review): nothing is returned and the middle edges are not emitted
    (see the "output midEdge" placeholder below), so this looks unfinished.
    """
    # Direction predicates for a middle edge; RIGHT/DOWN/DOWNRIGHT are
    # assigned further down, before the first call.
    def isRightOrDownRight(midEdge):
        return midEdge==RIGHT or midEdge==DOWNRIGHT

    def isDownOrDownRight(midEdge):
        return midEdge==DOWN or midEdge==DOWNRIGHT

    # MiddleNodeAndEdge
    #
    # An adapter which replaces MiddleNode and MiddleEdge in the pseudocode, and calls FindMiddleEdge
    #
    # Returns j1 of the edge's start node together with the edge direction.
    # NOTE(review): FindMiddleEdge receives slices, so its coordinates are
    # presumably relative to (top, left) - confirm before using j1 as an
    # absolute index.
    def MiddleNodeAndEdge(top, bottom, left, right):
        ((i1,j1),(i2,j2)) = FindMiddleEdge(v[top:bottom],w[left:right],replace_score=replace_score,indel_cost=indel_cost)
        # Same i at both ends -> RIGHT; same j -> DOWN; otherwise diagonal.
        direction = RIGHT if i1==i2 else DOWN if j1==j2 else DOWNRIGHT
        return j1,direction

    # LinearSpaceAlignment
    #
    # Find longest path between a substring of v[top] v[bottom-1]
    # and w[left] and w[right-1]
    #
    # Inputs: top
    #         bottom
    #         left
    #         right
    def LinearSpaceAlignment(top, bottom, left, right):
        # Base cases: one of the two ranges is empty, only indels remain.
        if left==right:
            return indel_cost*(bottom - top)
        if top==bottom:
            return indel_cost*(right-left)
        middle = (left + right)//2
        midNode,midEdge = MiddleNodeAndEdge(top, bottom, left, right)
        # The values returned by the recursive calls are discarded.
        LinearSpaceAlignment(top, midNode, left, middle)
        # output midEdge
        # Step over the middle edge before recursing into the second half.
        if isRightOrDownRight(midEdge):
            middle += 1
        if isDownOrDownRight(midEdge):
            midNode+= 1
        LinearSpaceAlignment(midNode, bottom, middle, right)

    RIGHT = 0
    DOWN = 1
    DOWNRIGHT = 2
    # NOTE(review): the upper bounds are len+1 although they are used as
    # exclusive slice ends - confirm this is intended.
    LinearSpaceAlignment(0,len(v)+1,0,len(w)+1)
def align_with_blosum62(aa_seq1, aa_seq2):
    """
    Globally align two amino acid sequences with BLOSUM62.

    Only the first alignment reported by pairwise2 is returned.
    """
    # Depending on the sequence homology another BLOSUM matrix, or other
    # gap penalties, may be a better fit.
    blosum62 = substitution_matrices.load("BLOSUM62")
    open_cost = -12    # cost to open a gap
    extend_cost = -3   # cost to extend a gap
    candidates = pairwise2.align.globalds(
        aa_seq1, aa_seq2, blosum62, open_cost, extend_cost)
    top_alignment = candidates[0]
    return top_alignment
def getBLOSUMDistanceMatrix(alignment):
    """Build a pairwise distance table from aligned records using BLOSUM62.

    For every pair of records the BLOSUM62 scores of the aligned columns are
    summed and negated, so more similar rows get smaller values. Gap
    characters ('-') are looked up as '*'.

    :param alignment: iterable of aligned records; each needs an `id`, a
        `seq`, and letter access by index
    :return: numeric pandas DataFrame indexed and labelled by record id
    """
    blosumMatrix = substitution_matrices.load("BLOSUM62")
    ids = [r.id for r in alignment]
    df = pd.DataFrame(columns=ids, index=ids)
    for record1 in alignment:
        for record2 in alignment:
            score = 0
            for i, letter1 in enumerate(record1.seq):
                aa1 = letter1 if letter1 != '-' else '*'
                aa2 = record2[i] if record2[i] != '-' else '*'
                score -= blosumMatrix[aa1][aa2]
            # One .loc assignment instead of chained df[col][row] = ...;
            # the chained form may write to a temporary copy and is a no-op
            # under pandas Copy-on-Write.
            df.loc[record2.id, record1.id] = score
    return df.apply(pd.to_numeric)
def sequence_similarity(
    sequence1: str,
    sequence2: str,
    open_gap_penalty: int = -11,
    extend_gap_penalty: int = -1,
    substitution_matrix: str = "BLOSUM62",
) -> float:
    """
    Score the global alignment of two amino acid sequences.

    Letters that are not part of the substitution matrix alphabet are
    replaced by "*" before aligning.

    Parameters
    ----------
    sequence1: str
        The first sequence.
    sequence2: str
        The second sequence.
    open_gap_penalty: int
        The penalty to open a gap.
    extend_gap_penalty: int
        The penalty to extend a gap.
    substitution_matrix: str
        Name of the substitution matrix to use during alignment. Calling
        Bio.Align.substitution_matrices.load() without arguments lists the
        available names.

    Returns
    -------
    score: float
        Alignment score reported by pairwise2 (score_only=True).
    """
    from Bio import pairwise2
    from Bio.Align import substitution_matrices

    matrix = substitution_matrices.load(substitution_matrix)
    known_letters = matrix.alphabet

    def _mask_unknown(sequence):
        # Swap every letter the matrix cannot score for "*".
        return "".join(
            letter if letter in known_letters else "*" for letter in sequence
        )

    return pairwise2.align.globalds(
        _mask_unknown(sequence1),
        _mask_unknown(sequence2),
        matrix,
        open_gap_penalty,
        extend_gap_penalty,
        score_only=True,
    )
def main():
    """Enqueue a global alignment job, wait for it, and print the first result.

    The job runs global_align with a fresh PairwiseAligner, the module-level
    sequences x and y, and the BLOSUM62 matrix. The job result is polled
    every 2 seconds.

    Raises:
        TimeoutError: if the job has produced no result when polling stops.
    """
    q = Queue(connection=conn)
    mat_name = "BLOSUM62"
    matrix = substitution_matrices.load(mat_name)
    aligner = Align.PairwiseAligner()
    job = q.enqueue(global_align, args=(aligner, x, y, matrix))

    # Poll until the worker has stored a result or the poll budget is spent.
    count = 0
    while job.result is None and count <= 100000:
        time.sleep(2)
        count += 1

    alignments = job.result
    print(f'job.get_id(): {job.get_id()}, '
          f'job.result:{alignments}')
    if alignments is None:
        # Fail with a clear message instead of a TypeError from indexing None.
        raise TimeoutError(
            f'job {job.get_id()} returned no result before polling stopped')
    print(f'alignments[0]:{alignments[0]}\n score: {alignments[0].score}')
def tester():
    """Enqueue a local alignment job, wait for it, and pretty-print the result.

    The job runs local_align with the module-level sequences x and y and the
    BLOSUM62 matrix. The job result is polled once a second. The returned
    alignment is split by get_protein_alignment into two sequence rows and a
    connector row, which are printed in full and then in blocks of 50
    columns.
    """
    q = Queue(connection=conn2)
    mat_name = "BLOSUM62"
    matrix = substitution_matrices.load(mat_name)
    job = q.enqueue(local_align, args=(x, y, matrix))

    # Poll until the worker has stored a result or the poll budget is spent.
    count = 0
    while job.result is None and count <= 100:
        time.sleep(1)
        count += 1
    print(f'job.get_id(): {job.get_id()}\n')

    # only returning one
    alignment = job.result
    seqA, connector, seqB = get_protein_alignment(alignment)
    # todo: this is just returning seqA
    seqA_adj = make_single_seq(seqA, connector)
    seqB_adj = make_single_seq(seqB, connector)
    print(f'strA:\n{str(seqA)}\n'
          f'strB:\n{str(seqB)}')
    print(f'adjA:\n {seqA_adj}\n, adjB:\n {seqB_adj}')

    # practice pretty printing
    width = 50
    count = 0
    while True:
        start = count * width
        end = start + width
        print(f'line: {count}')
        print(seqA[start:end])
        print(connector[start:end])
        print(seqB[start:end])
        # `>=` rather than `>`: stop once the block reached the end, so a
        # length that is an exact multiple of the width does not produce an
        # extra, empty block.
        if end >= len(connector):
            break
        count += 1
def __init__(self, model="identity", skip_letters=None):
    """Initialize with a distance model.

    `skip_letters` is stored when it is truthy; otherwise it defaults to an
    empty tuple for the "identity" model and to ("-", "*") for any other.
    The "identity" model uses no scoring matrix. Other supported models
    load the matrix named after the model in upper case, with "blastn"
    mapped to "NUC.4.4". A model not listed in `self.models` raises
    ValueError.
    """
    # Shim for backward compatibility (#491)
    if skip_letters:
        self.skip_letters = skip_letters
    else:
        self.skip_letters = () if model == "identity" else ("-", "*")

    if model == "identity":
        self.scoring_matrix = None
        return

    if model not in self.models:
        raise ValueError("Model not supported. Available models: " +
                         ", ".join(self.models))

    matrix_name = "NUC.4.4" if model == "blastn" else model.upper()
    self.scoring_matrix = substitution_matrices.load(matrix_name)
def gen_aligner():
    """Build the global pairwise aligner used for sequence comparison.

    Gap open and extend scores are -10 and -0.5; per the original author
    these match EMBOSS Needle. The substitution matrix is BLOSUM62 extended
    with the letter U.
    """
    # BLOSUM62 has no entry for amino acid U, so the matrix is widened by
    # one letter instead of replacing U's with X's in the sequences.
    # See https://github.com/biopython/biopython/issues/3205
    blosum62 = substitution_matrices.load("BLOSUM62")
    with_u = blosum62.select(blosum62.alphabet + "U")

    # U is the last letter: -4 against every letter, 1 against itself.
    with_u[:, -1] = -4
    with_u[-1, :] = -4
    with_u[-1, -1] = 1

    aligner = Align.PairwiseAligner()
    aligner.mode = "global"
    aligner.open_gap_score = -10
    aligner.extend_gap_score = -0.5
    aligner.substitution_matrix = with_u
    return aligner
def max_alignment_score(seq1, seq2):
    """Print one optimal global alignment of seq1 and seq2 and return the score.

    Scoring uses BLOSUM62 with a linear gap penalty of 5 (open and extend
    scores are both -5).
    """
    linear_gap_score = -5

    # Aligner object holding the alignment parameters
    aligner = Align.PairwiseAligner()
    aligner.mode = "global"
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = linear_gap_score
    aligner.extend_gap_score = linear_gap_score

    # Show the first optimal alignment
    first_alignment = aligner.align(seq1, seq2)[0]
    print(first_alignment)

    # The score is computed separately and handed back to the caller
    return aligner.score(seq1, seq2)
def _local_align(self, record_a: SeqRecord, record_b: SeqRecord,
                 open_gap_score: int):
    """Locally align two records and wrap the result as a two-row alignment.

    Gaps ('-') are stripped from both sequences and the letters upper-cased
    before aligning with BLOSUM62, the given gap-open score and a gap-extend
    score of -1. The aligned rows are taken from the first and third line
    of the printed alignment, with spaces turned into '-'. The score, path
    and aligned blocks are stored in the annotations of the returned
    MultipleSeqAlignment.
    """
    aligner = Align.PairwiseAligner()
    aligner.mode = 'local'
    aligner.substitution_matrix = substitution_matrices.load('BLOSUM62')
    aligner.open_gap_score = open_gap_score
    aligner.extend_gap_score = -1

    ungapped_a = record_a.seq.ungap('-').upper()
    ungapped_b = record_b.seq.ungap('-').upper()
    aln = aligner.align(ungapped_a, ungapped_b)[0]

    # Line 0 and line 2 of the text rendering hold the two aligned rows.
    rendered = str(aln).splitlines()
    row_a = Seq(rendered[0].replace(' ', '-'), generic_protein)
    row_b = Seq(rendered[2].replace(' ', '-'), generic_protein)

    records = [
        SeqRecord(row_a, id=record_a.id),
        SeqRecord(row_b, id=record_b.id)
    ]
    details = {
        'score': aln.score,
        'path': aln.path,
        'aligned': aln.aligned
    }
    return MultipleSeqAlignment(records, annotations=details)
def get_sequence_alignment(sequence_1, sequence_2, mode='global', open_gap_score=-11, extend_gap_score=-2):
    """Perform a pairwise sequence alignment with BLOSUM62 and map the aligned positions.

    :param sequence_1: First input sequence.
    :type sequence_1: str
    :param sequence_2: Second input sequence.
    :type sequence_2: str
    :param mode: Alignment mode, defaults to 'global'.
    :type mode: str, optional
    :param open_gap_score: Opening gap penalty, defaults to -11.
    :type open_gap_score: int, optional
    :param extend_gap_score: Extension gap penalty, defaults to -2.
    :type extend_gap_score: int, optional

    :return alignment_dict: Dictionary mapping each aligned 0-based position of sequence_2 to the
        position of sequence_1 it is aligned with; None if the alignment fails with a ValueError.
    :rtype alignment_dict: dict [int, int]
    """
    aligner = Align.PairwiseAligner()
    aligner.mode = mode
    aligner.substitution_matrix = substitution_matrices.load("BLOSUM62")
    aligner.open_gap_score = open_gap_score
    aligner.extend_gap_score = extend_gap_score

    try:
        # Every alignment returned by one call has the same (optimal) score,
        # so the first one is taken directly. Materialising and sorting all
        # of them is unnecessary and can be prohibitively expensive when
        # there are many co-optimal alignments.
        best_alignment = aligner.align(sequence_1, sequence_2)[0]
    except ValueError as e:
        logging.warning('Needleman-Wunsch alignment failed due to wrong alphabet:\n{}'.format(e))
        return None

    # aligned[0] holds the blocks of the first sequence passed to align(),
    # aligned[1] the matching blocks of the second.
    seq1_blocks, seq2_blocks = best_alignment.aligned

    alignment_dict = {}
    for seq1_block, seq2_block in zip(seq1_blocks, seq2_blocks):
        for seq1_index, seq2_index in zip(range(*seq1_block), range(*seq2_block)):
            alignment_dict[seq2_index] = seq1_index

    return alignment_dict