def compare(a, b):
    """
    Return cost of comparing a to b.

    Both arguments must have a ``sequence`` attribute. The sequences are
    compared over the length of the shorter one, once aligned at their starts
    and once aligned at their ends. The cost is five times the smaller of the
    two Hamming distances plus the difference in length.
    """
    len_a = len(a.sequence)
    len_b = len(b.sequence)
    overlap = min(len_a, len_b)
    length_diff = max(len_a, len_b) - overlap
    if overlap == 0:
        # At least one sequence is empty, so there is nothing to compare
        # position by position. This must be handled separately because
        # seq[-0:] is the *whole* sequence, not an empty one, which would hand
        # two strings of unequal length to hamming_distance.
        return length_diff
    dist_prefixes = hamming_distance(a.sequence[:overlap], b.sequence[:overlap])
    dist_suffixes = hamming_distance(a.sequence[-overlap:], b.sequence[-overlap:])
    return 5 * min(dist_prefixes, dist_suffixes) + length_diff
def naive_has_similar(self, s, distance):
    """
    Return whether any string in self.strings has the same length as s and a
    Hamming distance to s of at most the given distance.

    Strings of a different length are skipped without being compared.
    """
    return any(
        len(candidate) == len(s) and hamming_distance(candidate, s) <= distance
        for candidate in self.strings
    )
def is_similar_with_junction(s, t, mismatches, cdr3_core):
    """
    Return whether strings s and t have at most the given number of mismatches
    *and* have at least one identical junction.

    Strings of different lengths are never similar.

    If mismatches is strictly between 0 and 1, it is interpreted as a fraction:
    the allowed Hamming distance is then mismatches times the length of s,
    reduced by cdr3_core.start if cdr3_core is given. Otherwise, mismatches is
    used directly as the maximum allowed Hamming distance.

    cdr3_core is either None or an object with start and stop attributes
    (such as a slice). If it is None, only the distance is checked. Otherwise,
    the part before cdr3_core.start or the part from cdr3_core.stop onwards
    must additionally be identical in s and t.
    """
    # TODO see issue #81
    if len(s) != len(t):
        return False

    if 0 < mismatches < 1:
        offset = 0 if cdr3_core is None else cdr3_core.start
        allowed = (len(s) - offset) * mismatches
    else:
        allowed = mismatches

    within_budget = hamming_distance(s, t) <= allowed
    if not within_budget:
        return False
    if cdr3_core is None:
        return True
    return (
        s[:cdr3_core.start] == t[:cdr3_core.start]
        or s[cdr3_core.stop:] == t[cdr3_core.stop:]
    )
def is_similar(s, t):
    """
    Return whether s and t differ in at most max_hamming positions.

    If either string contains a '-' character, s is first trimmed of leading
    and trailing '-' characters and t is cut down to the matching region. The
    strings then count as dissimilar if fewer than min_overlap characters of s
    remain.

    min_overlap and max_hamming are not parameters; they are taken from the
    surrounding scope.
    """
    # m = max_hamming
    if '-' not in s and '-' not in t:
        return hamming_distance(s, t) <= max_hamming

    # Remove suffix and/or prefix where sequences do not overlap.
    # The order matters: t is cut from the right using the length of s after
    # only its leading '-' characters have been stripped.
    s = s.lstrip('-')
    t = t[-len(s):]
    s = s.rstrip('-')
    if len(s) < min_overlap:
        return False
    t = t[:len(s)]
    # TODO allowed Hamming distance should be reduced relative to the overlap length
    # m = max_hamming * len(s) / len(original_length_of_s)
    return hamming_distance(s, t) <= max_hamming
def print_similar(a, b, colored: bool):
    """
    Print a description of how the sequence of b differs from that of a.

    Both a and b must have ``sequence`` and ``name`` attributes. The sequences
    are compared over the length of the shorter one, once aligned at their
    starts and once aligned at their ends; the alignment with the smaller
    Hamming distance is shown (the start alignment wins ties). The part of the
    sequences outside the compared region is passed to format_indel, and each
    differing position within the region is shown as ``{x → y}``.

    If colored is true, the two characters of a substitution are passed
    through red() and green(), respectively.

    Three lines are printed: a header with both names, the description, and
    an empty line.
    """
    a_seq = a.sequence
    b_seq = b.sequence
    overlap = min(len(a_seq), len(b_seq))
    dist_prefixes = hamming_distance(a_seq[:overlap], b_seq[:overlap])
    # Use explicit start offsets instead of seq[-overlap:], which would give
    # the whole sequence (not an empty one) when overlap is 0.
    a_tail_start = len(a_seq) - overlap
    b_tail_start = len(b_seq) - overlap
    dist_suffixes = hamming_distance(a_seq[a_tail_start:], b_seq[b_tail_start:])

    if dist_prefixes <= dist_suffixes:
        a_prefix = ''
        b_prefix = ''
        a_common = a_seq[:overlap]
        b_common = b_seq[:overlap]
        a_suffix = a_seq[overlap:]
        b_suffix = b_seq[overlap:]
    else:
        a_prefix = a_seq[:a_tail_start]
        b_prefix = b_seq[:b_tail_start]
        a_common = a_seq[a_tail_start:]
        b_common = b_seq[b_tail_start:]
        a_suffix = ''
        b_suffix = ''

    # Collect all output pieces in a list. The pieces for individual
    # substitutions must not be stored in the same variable as the overall
    # result, or the formatted prefix would be overwritten.
    parts = [format_indel(a_prefix, b_prefix, colored)]
    for ac, bc in zip(a_common, b_common):
        if ac == bc:
            parts.append(ac)
        elif colored:
            parts.append('{' + red(ac) + ' → ' + green(bc) + '}')
        else:
            parts.append('{' + ac + ' → ' + bc + '}')
    parts.append(format_indel(a_suffix, b_suffix, colored))

    print('~', a.name, '--', b.name)
    print(''.join(parts))
    print()
def _aa_mutations(self):
    """
    Return the Hamming distance between self.aa_reference and self.aa_sequence.

    None is returned if either amino-acid string is empty or missing, if the
    nucleotide sequence and its reference differ in length, or if at least 80%
    of the reference positions are mutated.
    """
    # Earlier versions of this code used edit distance to compute the number of mutations,
    # but some FR1 alignments are reported with a frameshift by IgBLAST. By requiring that
    # reference and query lengths are identical, we can filter out these cases (and use
    # Hamming distance to get some speedup)
    if not self.aa_reference:
        return None
    if not self.aa_sequence:
        return None
    if len(self.nt_sequence) != len(self.nt_reference):
        return None

    n_mutations = hamming_distance(self.aa_reference, self.aa_sequence)

    # If the mutation rate is still obviously too high, assume something went
    # wrong and ignore the computed value
    mutation_rate = n_mutations / len(self.aa_reference)
    if mutation_rate >= 0.8:
        return None
    return n_mutations
def set_aa_mut_columns(record, database):
    """
    Compute amino acid mutation rate for all regions on V and also for V itself
    as the sum of the regions (that is, excluding the CDR3).

    The results are stored in the record as percentages under the keys
    "FR1_aa_mut", "CDR1_aa_mut", "FR2_aa_mut", "CDR2_aa_mut", "FR3_aa_mut" and
    "V_aa_mut". A region's value is None if its coordinates are missing, if the
    database has no germline sequence for it, if germline and query differ in
    length, or if the mutation rate is 80% or higher. "V_aa_mut" is None unless
    a rate could be computed for every region.
    """
    region_columns = (
        ("fwr1", "FR1"),
        ("cdr1", "CDR1"),
        ("fwr2", "FR2"),
        ("cdr2", "CDR2"),
        ("fwr3", "FR3"),
    )
    summed_dist = 0
    summed_length = 0
    usable_regions = 0

    for airr_col, region in region_columns:
        mut_column = region + "_aa_mut"
        record[mut_column] = None
        start = record[airr_col + "_start"]
        end = record[airr_col + "_end"]
        if start is None or end is None:
            continue

        # The start coordinate is converted from 1-based to a 0-based slice index
        sequence_aa = nt_to_aa(record["sequence"][start - 1 : end])
        germline_aa = database.v_regions_aa[record["v_call"]].get(region)

        # Some FR1 alignments are reported with a frameshift by IgBLAST. By requiring that
        # reference and query lengths are identical, we can filter out these cases (and use
        # Hamming distance to get some speedup)
        if germline_aa is None or len(germline_aa) != len(sequence_aa):
            continue

        dist = hamming_distance(germline_aa, sequence_aa)
        mut_aa = dist / len(germline_aa)
        if mut_aa >= 0.8:
            # assume something went wrong
            continue

        summed_dist += dist
        summed_length += len(germline_aa)
        usable_regions += 1
        record[mut_column] = 100.0 * mut_aa

    if usable_regions == len(region_columns):
        record["V_aa_mut"] = 100.0 * summed_dist / summed_length
    else:
        record["V_aa_mut"] = None
def naive_find_all_similar(self, s, distance):
    """
    Yield every string in self.strings that has the same length as s and a
    Hamming distance to s of at most the given distance.

    Strings of a different length are skipped without being compared.
    """
    for candidate in self.strings:
        if len(candidate) == len(s) and hamming_distance(candidate, s) <= distance:
            yield candidate
def naive_has_similar(t):
    """
    Return whether any of the strings is within Hamming distance dist of t.

    strings and dist are not parameters; they are taken from the surrounding
    scope.
    """
    return any(hamming_distance(s, t) <= dist for s in strings)
def linked(s, t):
    """
    Return whether the Hamming distance between s and t is at most mismatches.

    mismatches is not a parameter; it is taken from the surrounding scope.
    """
    n_differences = hamming_distance(s, t)
    return n_differences <= mismatches
def test_hamming_distance_incorrect_length():
    """hamming_distance must raise IndexError for strings of unequal length."""
    shorter, longer = 'A', 'BC'
    with pytest.raises(IndexError):
        hamming_distance(shorter, longer)
def test_hamming_distance():
    """Check hamming_distance on empty, identical and differing equal-length strings."""
    cases = [
        ('', '', 0),
        ('A', 'A', 0),
        ('HELLO', 'HELLO', 0),
        ('ABC', 'DEF', 3),
        ('ABCXDEF', 'ABCYDEF', 1),
    ]
    for s, t, expected in cases:
        assert hamming_distance(s, t) == expected