def nw_hamming_metric(s1, s2, matrix='blosum62', open=3, extend=3):
    """Hamming-style distance between two sequences after global alignment.

    The sequences are aligned with parasail's Needleman-Wunsch routines and
    the distance is the number of alignment columns that are not exact
    matches, so gap columns count as mismatches. The substitution matrix
    and gap penalties only steer the alignment; they do not enter the
    distance itself.

    Parameters
    ----------
    s1 : str
        First sequence.
    s2 : str
        Second sequence.
    matrix : str
        Name of a substitution matrix attribute on the ``parasail`` module
        (looked up with ``getattr``).
    open : int
        Gap opening penalty forwarded to parasail.
    extend : int
        Gap extension penalty forwarded to parasail.

    Returns
    -------
    D
        ``len(traceback.comp) - matches``: alignment length minus the number
        of matching positions.

    Raises
    ------
    AttributeError
        If ``matrix`` does not name an attribute of ``parasail``.
    """
    scoring = {
        "open": open,
        "extend": extend,
        "matrix": getattr(parasail, matrix),
    }

    # The stats variant supplies the match count; the trace variant supplies
    # the comparison string whose length is the alignment length.
    stats = parasail.nw_stats(s1, s2, **scoring)
    trace = parasail.nw_trace(s1, s2, **scoring)

    alignment_length = len(trace.traceback.comp)
    return alignment_length - stats.matches
def get_cigar(s1, s2, kind=None):
    """Align two sequences with parasail and return the CIGAR as pairs.

    Parameters
    ----------
    s1 : str
        First sequence handed to the parasail trace routine.
    s2 : str
        Second sequence handed to the parasail trace routine.
    kind : str, optional
        Alignment mode: ``"local"`` (``sw_trace``), ``"semi-global"``
        (``sg_trace``) or ``"global"`` (``nw_trace``). When omitted, a
        module-level variable named ``kind`` is used if one exists; the body
        previously read that name without declaring it, so this keeps
        existing call sites working.

    Returns
    -------
    list of tuple
        ``(operation, length)`` pairs in alignment order, where ``operation``
        is the decoded CIGAR operation character and ``length`` its run
        length.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three supported modes.
    """
    if kind is None:
        # Legacy behaviour: fall back to a module-level setting when present.
        kind = globals().get("kind")

    # Validate before importing parasail so a bad mode fails fast.
    if kind == "local":
        routine_name = "sw_trace"
    elif kind == "semi-global":
        routine_name = "sg_trace"
    elif kind == "global":
        routine_name = "nw_trace"
    else:
        raise ValueError(
            "The kind of alignment must be global, semi-global, or local.")

    import parasail

    # Fixed scoring: gap open 101, gap extend 10, PAM100 substitution matrix.
    trace = getattr(parasail, routine_name)
    result = trace(s1, s2, 101, 10, parasail.pam100)

    # Canonical (operation, length) layout, as used for SAM-style CIGARs.
    cigar = result.cigar
    return [
        (cigar.decode_op(encoded).decode(), cigar.decode_len(encoded))
        for encoded in cigar.seq
    ]
def align(sequence1: str, sequence2: str):
    """Run a global pairwise alignment of two strings and return alignment data.

    The alignment is Needleman-Wunsch (``parasail.nw_trace``) scored with the
    BLOSUM80 matrix and the module-level ``GAP_OPEN_PENALTY`` and
    ``GAP_EXTEND_PENALTY`` settings.

    Parameters
    ----------
    sequence1 : str
        Passed to parasail as the first sequence.
    sequence2 : str
        Passed to parasail as the second sequence.

    Returns
    -------
    tuple
        ``(traceback.ref, traceback.query, score)`` taken from the parasail
        result. NOTE(review): parasail presumably treats the first sequence
        as the query and the second as the reference - confirm before
        relying on which element corresponds to which input.
    """
    result = parasail.nw_trace(
        sequence1,
        sequence2,
        GAP_OPEN_PENALTY,
        GAP_EXTEND_PENALTY,
        parasail.blosum80,
    )
    traceback = result.traceback
    return traceback.ref, traceback.query, result.score
def align_affine(query: str, target: str, match_score: int, mismatch_cost: int,
                 gap_open_cost: int, gap_extension_cost: int) -> Cigar:
    """Globally align ``query`` against ``target`` and return the CIGAR.

    Scoring uses a simple match/mismatch matrix over the alphabet ``ACGT``
    together with separate gap-open and gap-extension costs. All four values
    are supplied as non-negative magnitudes; the mismatch cost is negated
    when the matrix is built.

    Parameters
    ----------
    query : str
        First sequence given to ``parasail.nw_trace``.
    target : str
        Second sequence given to ``parasail.nw_trace``.
    match_score : int
        Score for a matching pair.
    mismatch_cost : int
        Cost for a mismatching pair (applied as ``-mismatch_cost``).
    gap_open_cost : int
        Gap opening cost.
    gap_extension_cost : int
        Gap extension cost.

    Returns
    -------
    Cigar
        Built from the UTF-8 decoded CIGAR string of the alignment.

    Raises
    ------
    AssertionError
        If any of the four scoring values is negative.
    """
    lowest = min(match_score, mismatch_cost, gap_open_cost, gap_extension_cost)
    assert lowest >= 0, "Specify positive integers"

    scoring_matrix = parasail.matrix_create("ACGT", match_score, -mismatch_cost)
    alignment = parasail.nw_trace(
        query, target, gap_open_cost, gap_extension_cost, scoring_matrix)
    cigar_text = alignment.cigar.decode.decode('utf-8')
    return Cigar(cigar_text)
def align(
    self,
    x,
    y,
    open_penalty,
    extend_penalty,
    matrix=parasail.blosum62,
):
    """Globally align two strings and return the parasail result object.

    Uses ``parasail.nw_trace`` (Needleman-Wunsch global alignment), by
    default with the BLOSUM62 substitution matrix.

    The choice of BLOSUM62 may warrant scrutiny; see
    "Selecting the Right Similarity-Scoring Matrix":
    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3848038/

    Parameters
    ----------
    x : string
        Amino acid string passed as ``s1`` to ``parasail.nw_trace``.
    y : string
        Amino acid string passed as ``s2`` to ``parasail.nw_trace``.
    open_penalty : int
        Gap opening penalty.
    extend_penalty : int
        Gap extension penalty.
    matrix : parasail substitution matrix
        Defaults to ``parasail.blosum62``.

    Returns
    -------
    r : parasail result
        Result of the ``nw_trace`` alignment (e.g. ``r.traceback.query``).
    """
    return parasail.nw_trace(
        s1=x,
        s2=y,
        open=open_penalty,
        extend=extend_penalty,
        matrix=matrix,
    )
def pairwise_nw_trace_align(ref_seq, query_seq, gap_open=10, gap_extension=1,
                            matrix=parasail.blosum62):
    """
    Globally align a query against a reference with ``parasail.nw_trace``.

    :param ref_seq: reference sequence; passed to parasail as the second sequence
    :param query_seq: query sequence; passed to parasail as the first sequence
    :param gap_open: Default 10
    :param gap_extension: Default 1
    :param matrix: Default BLOSUM62
    :return: parasail result (includes a cigar which can be decoded)
    """
    result = parasail.nw_trace(query_seq, ref_seq, gap_open, gap_extension, matrix)
    # Hand the argument to logging instead of pre-formatting, so the message
    # is only rendered when a handler actually emits the record.
    logging.debug("Parasail result %s", result.cigar.decode)
    return result
def hm_metric(s1, s2, matrix='blosum62', open=3, extend=3):
    """Count mismatched positions between two globally aligned sequences.

    Both sequences are aligned with parasail's Needleman-Wunsch routines;
    the returned distance is the length of the alignment minus the number of
    exactly matching positions.

    Parameters
    ----------
    s1 : str
        String containing amino acid letters.
    s2 : str
        String containing amino acid letters.
    matrix : str
        Name of a substitution matrix attribute on the ``parasail`` module.
    open : int
        Gap opening penalty used for the alignment.
    extend : int
        Gap extension penalty used for the alignment.

    Returns
    -------
    D
        Hamming distance: ``len(traceback.comp) - matches``.

    Raises
    ------
    AttributeError
        If ``matrix`` does not name an attribute of ``parasail``.
    """
    substitution = getattr(parasail, matrix)

    # Match count comes from the stats alignment.
    stats_result = parasail.nw_stats(
        s1, s2, open=open, extend=extend, matrix=substitution)
    # Alignment length comes from the traceback's comparison string.
    trace_result = parasail.nw_trace(
        s1, s2, open=open, extend=extend, matrix=substitution)

    aligned_columns = len(trace_result.traceback.comp)
    return aligned_columns - stats_result.matches
def hm_matches(s1, s2, matrix=parasail.blosum62, open=3, extend=3):
    """Return the number of matching positions in a global alignment.

    The sequences are aligned with ``parasail.nw_stats`` (Needleman-Wunsch
    with statistics) and the ``matches`` value of the result is returned.

    Parameters
    ----------
    s1 : str
        First sequence.
    s2 : str
        Second sequence.
    matrix : parasail substitution matrix
        Defaults to ``parasail.blosum62``.
    open : int
        Gap opening penalty.
    extend : int
        Gap extension penalty.

    Returns
    -------
    The ``matches`` attribute of the parasail stats result.
    """
    # Only the stats alignment is needed; a traceback alignment would be
    # computed and then thrown away.
    stats = parasail.nw_stats(s1, s2, open=open, extend=extend, matrix=matrix)
    return stats.matches