def set_fwr4_columns(record, database):
    """
    Fill in the FR4-related columns of *record* in place.

    Nothing is changed when the record has no J call, when its locus is not
    in ALLOWED_LOCI, when the database reports no CDR3 end for the J gene or
    when the record has no CDR3 end.

    Otherwise, the columns "fwr4_start", "fwr4_end", "fwr4" and "fwr4_aa" are
    overwritten, and "FR4_SHM" (nucleotide level) and "J_aa_mut" (amino acid
    level) are set to the edit distance between germline FR4 and query FR4
    as a percentage of the germline length. A rate is set to None when the
    corresponding germline sequence is empty.

    record -- dict-like alignment record, modified in place
    database -- object providing j_cdr3_end(name, locus) and a mapping j
        from J gene name to sequence
    """
    j_call = record["j_call"]
    if not j_call or record["locus"] not in ALLOWED_LOCI:
        return
    cdr3_ref_end = database.j_cdr3_end(j_call, record["locus"])
    cdr3_query_end = record["cdr3_end"]
    if cdr3_ref_end is None or not cdr3_query_end:
        return

    j_sequence_end = record["j_sequence_end"]
    fwr4_nt = record["sequence"][cdr3_query_end:j_sequence_end]

    # This overwrites some existing columns
    record["fwr4_start"] = cdr3_query_end + 1
    record["fwr4_end"] = j_sequence_end
    record["fwr4"] = fwr4_nt
    record["fwr4_aa"] = nt_to_aa(fwr4_nt)

    # Compute FR4 mutation rate on nucleotide level.
    # The query FR4 starts at the CDR3 end of the query, so the germline must
    # start at the CDR3 end of the reference as well. Starting at the
    # beginning of the aligned J region would count the CDR3-covering part of
    # the J gene as differences.
    germline = database.j[j_call][cdr3_ref_end:record["j_germline_end"]]
    if germline:
        dist = edit_distance(germline, fwr4_nt)
        record["FR4_SHM"] = 100.0 * dist / len(germline)
    else:
        # No germline FR4 bases available: a rate cannot be computed
        record["FR4_SHM"] = None

    # Compute FR4 amino acid mutation rate
    germline_aa = nt_to_aa(germline)
    if germline_aa:
        dist = edit_distance(germline_aa, record["fwr4_aa"])
        record["J_aa_mut"] = 100.0 * dist / len(germline_aa)
    else:
        record["J_aa_mut"] = None
def assert_banded(s, t, maxdiff):
    """
    Assert that the banded edit distance of s and t agrees with the
    unbanded one.

    If the true distance fits within maxdiff, both must be equal. Otherwise,
    the banded result only needs to exceed maxdiff.
    """
    banded_dist = edit_distance(s, t, maxdiff=maxdiff)
    true_dist = edit_distance(s, t)
    if true_dist <= maxdiff:
        assert banded_dist == true_dist
        return
    assert banded_dist > maxdiff
def distances(sequences, band=0.2):
    """
    Return a square matrix of all pairwise edit distances.

    Entry [i,j] of the returned matrix is the edit distance between
    sequences[i] and sequences[j]. Distances are computed with a band
    derived from the longest sequence and are capped at that band plus one.
    """
    # Each pair of distinct sequences is compared only once, no matter how
    # often the sequences are repeated in the input
    distinct = list(set(sequences))
    maxdiff = max((int(len(seq) * band) for seq in sequences), default=0)  # TODO double-check this
    cap = maxdiff + 1
    pair_distances = {}  # maps (seq1, seq2) tuples to edit distance
    for pos, first in enumerate(distinct):
        for second in distinct[pos + 1:]:
            capped = min(cap, edit_distance(first, second, maxdiff=maxdiff))
            pair_distances[(second, first)] = capped
            pair_distances[(first, second)] = capped

    # Fill the result matrix, mirroring the upper triangle
    size = len(sequences)
    matrix = np.zeros((size, size), dtype=float)
    for row, first in enumerate(sequences):
        for col, second in enumerate(sequences):
            if col <= row:
                continue
            value = 0 if first == second else pair_distances[(first, second)]
            matrix[col, row] = matrix[row, col] = value
    return matrix
def fr4_aa_mutation_rate(self):
    """
    Return the amino acid mutation rate of FR4 in percent.

    The rate is the edit distance between the translated reference and the
    translated query, both starting at the CDR3 end, relative to the length
    of the translated reference. None is returned if there is no J hit, the
    chain is unknown, the CDR3 end cannot be located, a translation fails
    with ValueError or the translated reference is empty.
    """
    if 'J' not in self.hits:
        return None
    j_hit = self.hits['J']
    gene_name = j_hit.subject_id
    if self.chain not in self.CHAINS:
        return None

    ref_cdr3_end = self._database.j_cdr3_end(gene_name, self.CHAINS[self.chain])
    if ref_cdr3_end is None:
        return None
    query_cdr3_end = j_hit.query_position(reference_position=ref_cdr3_end)
    if query_cdr3_end is None:
        return None

    query_nt = self.full_sequence[query_cdr3_end:j_hit.query_end]
    try:
        observed_aa = nt_to_aa(query_nt)
    except ValueError:
        return None

    ref_nt = self._database.j[gene_name][ref_cdr3_end:j_hit.subject_end]
    try:
        expected_aa = nt_to_aa(ref_nt)
    except ValueError:
        return None

    if not expected_aa:
        return None
    n_edits = edit_distance(expected_aa, observed_aa)
    return 100. * n_edits / len(expected_aa)
def augment_group(table, v_shm_threshold=5, suffix='_mindiffrate'):
    """
    Add difference-rate columns for VDJ_nt, VDJ_aa, CDR3_nt and CDR3_aa.

    Each new column (original name plus suffix) holds the percentage
    difference to the least mutated (in terms of V_SHM) sequence of the
    group. The columns are inserted before the 'barcode' column. They stay
    empty if the table is empty or if even the least mutated row has a V_SHM
    above v_shm_threshold. The table is modified in place and returned.
    """
    columns = ['CDR3_nt', 'CDR3_aa', 'VDJ_nt', 'VDJ_aa']
    position = table.columns.get_loc('barcode')
    # Inserting in reverse order at a fixed position keeps the forward order
    for name in reversed(columns):
        table.insert(position, name + suffix, None)
    if table.empty:
        return table

    # The row with the least mutated V serves as the reference
    root = table.loc[table['V_SHM'].idxmin()]
    if root['V_SHM'] > v_shm_threshold:
        return table

    for name in columns:
        reference = root[name]
        reference_length = len(reference)
        band = int(0.2 * reference_length)

        def percent_difference(other):
            n_edits = edit_distance(reference, other, maxdiff=band)
            return round(n_edits / reference_length * 100., 1)

        table[name + suffix] = table[name].apply(percent_difference)
    return table
def should_discard(self, reference: Candidate, candidate: Candidate, _same_gene: bool):
    """
    Decide whether a candidate is a cross-mapping artifact of a reference.

    :param reference: The reference candidate. The decision this function
        makes is not about that one.
    :param candidate: The candidate on which to decide.
    :param _same_gene: Not used by this filter.
    :return: A falsy value (None or False) if the candidate should be kept.
        Otherwise, the candidate should be discarded and a non-empty string
        with a reason describing why is returned.
    """
    # The bases in the 3' end that correspond to the CDR3 are left out of
    # the comparison
    ref_v = reference.sequence[:reference.cdr3_start]
    cand_v = candidate.sequence[:candidate.cdr3_start]
    ref_len = len(ref_v)
    if ref_len == len(cand_v):
        v_distance = edit_distance(ref_v, cand_v, 1)
    else:
        # Lengths differ: compare against both ends of the candidate and use
        # the better match
        head_distance = edit_distance(ref_v, cand_v[:ref_len], 1)
        tail_distance = edit_distance(ref_v, cand_v[-ref_len:], 1)  # TODO prefix and suffix?
        v_distance = min(head_distance, tail_distance)

    # Cross-mapping is unlikely if the edit distance is larger than 1, and
    # it can only occur if both sequences are in the database
    if v_distance > 1 or not (reference.is_database and candidate.is_database):
        return None

    total_count = reference.cluster_size + candidate.cluster_size
    if total_count == 0:
        return False
    ratio = candidate.cluster_size / total_count
    if not candidate.cluster_size_is_accurate or ratio >= self._ratio:
        return False
    # candidate is probably a cross-mapping artifact of the higher-expressed ref
    return f'xmap_ratio={ratio:.4f},other={reference.name}'
def merged(self, s, t):
    """
    Compare two candidates and return the one to keep.

    None is returned if neither the allele-ratio check nor the
    cross-mapping check applies, in which case the two are not merged.
    """
    # TODO copy-and-pasted from germlinefilter
    #
    # Allele ratio check. Somewhat similar to cross-mapping, but the
    # sequence names decide whether the two can be alleles of each other
    # and the ratio is between the unique_CDR3 values
    if self._allele_ratio and is_same_gene(s.name, t.name):
        for minor, major in ((s, t), (t, s)):
            if major.unique_CDR3 == 0:
                continue
            if minor.unique_CDR3 / major.unique_CDR3 < self._allele_ratio:
                return major

    if not self._cross_mapping_ratio:
        return None

    # When checking for cross mapping, ignore overhanging bases in the 5' end.
    # Example:
    # ---ACTACGACTA...
    # XXX|||||X||||
    # ATTACTACTACTA...
    if len(t.sequence) < len(s.sequence):
        shorter, longer = t, s
    else:
        shorter, longer = s, t
    overhang = len(longer.sequence) - len(shorter.sequence)
    if edit_distance(shorter.sequence, longer.sequence[overhang:], 1) > 1:
        return None

    total_occ = shorter.exact_occ + longer.exact_occ
    if total_occ == 0:
        return None
    for minor, major in ((shorter, longer), (longer, shorter)):
        ratio = minor.exact_occ / total_occ
        if ratio < self._cross_mapping_ratio:
            # minor is probably a cross-mapping artifact of the
            # higher-expressed major
            logger.info(
                '%r is a cross-mapping artifact of %r (ratio %.4f)',
                minor.name, major.name, ratio)
            return major
    return None
def closest(self, sequence):
    """
    Find the whitelist sequence with the smallest edit distance to the
    given sequence.

    Return tuple (distance, name).
    """
    known = self._sequences
    if sequence in known:
        return 0, known[sequence]

    bound = len(sequence)
    scored = []
    for whitelisted, name in known.items():
        n_edits = edit_distance(whitelisted, sequence, maxdiff=bound)
        scored.append((n_edits, name))
        if n_edits == 1:
            # Exact matches were handled above, so no entry can be closer
            return n_edits, name
        bound = min(bound, n_edits)
    return min(scored)
def set_shm_columns(record, database):
    """
    Compute SHM (actually mutation rate on nucleotide level) for all regions
    on V.

    For each of FR1, CDR1, FR2, CDR2 and FR3, the column "<region>_SHM" of
    *record* is set to the edit distance between the germline region and the
    query region as a percentage of the germline region length. The column
    is set to None if the region coordinates are missing from the record or
    if the database has no (or only an empty) germline sequence for that
    region of the called V gene.

    record -- dict-like alignment record, modified in place
    database -- object whose v_regions_nt maps a V gene name to a mapping
        from region name to nucleotide sequence
    """
    regions = (
        ("fwr1", "FR1"),
        ("cdr1", "CDR1"),
        ("fwr2", "FR2"),
        ("cdr2", "CDR2"),
        ("fwr3", "FR3"),
    )
    for airr_col, region in regions:
        shm_column = region + "_SHM"
        start = record[airr_col + "_start"]
        end = record[airr_col + "_end"]
        if start is None or end is None:
            record[shm_column] = None
            continue

        germline = database.v_regions_nt[record["v_call"]].get(region)
        if not germline:
            # .get() yields None for a region the database does not know;
            # an empty region would lead to a division by zero below
            record[shm_column] = None
            continue

        # start is decremented to obtain a zero-based slice start
        sequence = record["sequence"][start - 1:end]
        dist = edit_distance(germline, sequence)
        record[shm_column] = 100.0 * dist / len(germline)
def main(args):
    """
    Discover candidate sequences for the gene type given by args.gene.

    Candidates are collected from the table args.table, optionally extended
    by whitelisted database sequences, cleaned up, named, counted and
    filtered. The result is printed as a table and, if args.fasta is set,
    also written in FASTA format to that path.

    Unset options (args.j_coverage, args.merge, args.min_count) are replaced
    on *args* by gene-specific defaults.
    """
    database = None
    if args.database:
        with dnaio.open(args.database) as fr:
            database = list(fr)
        logger.info('Read %d sequences from %r', len(database), args.database)

    gene = args.gene
    column = {'V': 'V_nt', 'J': 'J_nt', 'D': 'D_region'}[gene]
    other = 'J' if gene not in ('D', 'J') else 'V'
    other_gene = other.lower() + '_call'
    other_errors = other + '_errors'

    table = read_table(args.table, usecols=[
        'count', 'v_call', 'd_call', 'j_call', 'V_errors', 'J_errors',
        'J_covered', column, 'cdr3',
    ])
    logger.info('Table with %s rows read', len(table))

    # Row filters
    if args.j_coverage is None and gene == 'J':
        args.j_coverage = 90
    if args.j_coverage:
        table = table[table['J_covered'] >= args.j_coverage]
        logger.info('Keeping %s rows that have J_covered >= %s', len(table), args.j_coverage)
    if args.perfect_matches:
        table = table[table[other_errors] == 0]
        logger.info('Keeping %s rows that have no %s mismatches', len(table), other)

    # Gene-specific defaults
    if args.merge is None:
        args.merge = gene == 'D'
    if args.min_count is None:
        # TODO J is fine, but are D and V?
        args.min_count = {'J': 1, 'D': 10, 'V': 100}[gene]

    if gene == 'D':
        found = sequence_candidates(
            table, column, minimum_length=args.d_core_length, core=args.d_core)
    else:
        found = sequence_candidates(
            table, column, minimum_length=MINIMUM_CANDIDATE_LENGTH)
    candidates = list(found)
    logger.info('Collected %s unique %s sequences', len(candidates), gene)

    # Add whitelisted sequences
    if database:
        whitelist = make_whitelist(table, database, gene, args.allele_ratio)
        missing_whitelisted = set(whitelist) - {c.sequence for c in candidates}
        candidates.extend(Candidate(None, sequence) for sequence in missing_whitelisted)
        logger.info('Added %d whitelisted sequence%s', len(missing_whitelisted),
            's' if len(missing_whitelisted) != 1 else '')

    candidates = list(discard_substring_occurrences(candidates))
    logger.info(
        'Removing candidate sequences that occur within others results in %s candidates',
        len(candidates))
    candidates = [c for c in candidates if 'N' not in c.sequence]
    logger.info('Removing candidates containing "N" results in %s candidates',
        len(candidates))

    if args.merge:
        logger.info('Merging overlapping sequences ...')
        # Merge candidate sequences that overlap. If one candidate is longer than
        # another, this is typically a sign that IgBLAST has not extended the
        # alignment long enough.
        merger = OverlappingSequenceMerger()
        for candidate in candidates:
            merger.add(candidate)
        logger.info('After merging overlapping %s sequences, %s remain', gene, len(merger))
        candidates = list(merger)
    logger.info('%d candidates', len(candidates))
    del table

    # Assign names etc.
    if database:
        for candidate in candidates:
            scored = [
                (edit_distance(record.sequence, candidate.sequence), record)
                for record in database
            ]
            candidate.db_distance, nearest = min(scored, key=lambda pair: pair[0])
            candidate.db_name = nearest.name
            if candidate.db_distance == 0:
                candidate.name = nearest.name
                continue

            # Exact db sequence not found, is there one that contains
            # this candidate as a substring?
            for db_record in database:
                start = db_record.sequence.find(candidate.sequence)
                if start == -1:
                    continue
                if gene == 'D':
                    stop = start + len(candidate.sequence)
                    prefix = db_record.sequence[:start]
                    suffix = db_record.sequence[stop:]
                    candidate.missing = '{}...{}'.format(prefix, suffix)
                else:
                    # Replace this record with the full-length version
                    candidate.sequence = db_record.sequence
                    candidate.db_distance = 0
                candidate.name = db_record.name
                break
            else:
                candidate.name = unique_name(nearest.name, candidate.sequence)
    else:
        for candidate in candidates:
            candidate.name = unique_name(gene, candidate.sequence)

    logger.info('Counting occurrences ...')
    search_columns = {
        'D': ['np1', 'D_region', 'np2'],
        'J': ['np2', 'J_nt'],
    }.get(gene, ['sequence'])
    candidates = count_occurrences(
        candidates, args.table, search_columns, other_gene, other_errors,
        args.merge, args.perfect_matches)

    # Filter by allele ratio
    if args.allele_ratio or args.cross_mapping_ratio:
        arm = AlleleRatioMerger(args.allele_ratio, args.cross_mapping_ratio)
        arm.extend(candidates)
        candidates = list(arm)
        logger.info(
            'After filtering by allele ratio and/or cross-mapping ratio, %d candidates remain',
            len(candidates))

    # Sort by name, then drop rare candidates unless they match the database
    candidates = [
        c for c in sorted(candidates, key=lambda c: c.name)
        if c.exact_occ >= args.min_count or c.db_distance == 0
    ]
    print_table(candidates, other_gene, missing=gene == 'D')

    if args.fasta:
        with open(args.fasta, 'w') as f:
            # The candidates are already sorted by name at this point
            for candidate in candidates:
                print('>{}\n{}'.format(candidate.name, candidate.sequence), file=f)
    logger.info('Wrote %d genes', len(candidates))
def _percent_identity(self): # FIXME This is not quite how IgBLAST computes percent identity if not self.nt_reference or not self.nt_sequence: return None dist = edit_distance(self.nt_reference, self.nt_sequence) return 100. - 100. * dist / len(self.nt_reference)
def test_edit_distance():
    """
    Check edit_distance against hand-computed values, for symmetry and
    against the pure-Python reference implementation.
    """
    expected_distances = (
        ('', '', 0),
        ('', 'A', 1),
        ('A', 'B', 1),
        ('A', 'A', 0),
        ('A', 'AB', 1),
        ('BA', 'AB', 2),
    )
    for left, right, expected in expected_distances:
        assert edit_distance(left, right) == expected

    for s, t in STRING_PAIRS + RANDOM_STRING_PAIRS:
        # Distance to the empty string is the length of the string
        assert edit_distance(s, '') == len(s)
        assert edit_distance('', s) == len(s)
        # Argument order must not matter
        assert edit_distance(s, t) == edit_distance(t, s)
        assert edit_distance(s, t) == py_edit_distance(s, t)
def __call__(self, args):
    """
    Discover new V genes.

    args is a tuple (gene, assignments)
    gene -- name of the gene
    assignments -- a pandas DataFrame with the assignments to the gene

    Return a list of Candidate objects, one for each sibling sequence that
    passes the filters: siblings with more than self.max_n_bases 'N' bases
    are skipped, as are sequences that are not identical to the database
    sequence of the gene and have fewer than two exact occurrences.
    """
    gene, assignments = args
    self.set_random_seed(gene)

    siblings = SiblingMerger()
    for sibling in self._collect_siblings(gene, assignments):
        siblings.add(sibling)

    candidates = []
    for sibling_info in siblings:
        sibling = sibling_info.sequence
        n_bases = sibling.count('N')
        if n_bases > self.max_n_bases:
            logger.debug('Sibling %s has too many N bases', sibling_info.name)
            continue

        # Sequence without the CDR3-covering part
        sibling_no_cdr3 = sibling[:self._guess_cdr3_start(assignments)]
        group_exact_v = assignments[assignments.V_no_CDR3 == sibling_no_cdr3]
        group_full_exact_v = assignments[
            assignments['VDJ_nt'].str.startswith(sibling, na=False)]
        groups = (
            ('window', sibling_info.group),
            ('exact', group_exact_v),
            ('full_exact', group_full_exact_v),
        )
        del sibling_no_cdr3

        # self.cdr3_counts are CDR3 counts of all CDR3s in the entire table.
        # We restrict this here to the counts for CDR3s belonging to
        # clusters other than the current one.
        other_cdr3_counts = self.cdr3_counts - Counter(
            s for s in sibling_info.group.cdr3 if s)

        info = {}
        for key, group in groups:
            cdr3_counts = Counter(s for s in group.cdr3 if s)
            unique_cdr3 = len(cdr3_counts)
            shared_cdr3_ratio = safe_divide(
                len(other_cdr3_counts & cdr3_counts), unique_cdr3)
            unique_j = len(set(s for s in group.j_call if s))
            clonotypes = self.count_clonotypes(group)
            unique_d = self.count_unique_d(group)
            unique_barcodes = self.count_unique_barcodes(group)
            count = len(group.index)
            read_names = list(group.sequence_id)
            info[key] = Groupinfo(
                count=count,
                unique_D=unique_d,
                unique_J=unique_j,
                unique_CDR3=unique_cdr3,
                shared_CDR3_ratio=shared_cdr3_ratio,
                clonotypes=clonotypes,
                read_names=read_names,
                unique_barcodes=unique_barcodes,
            )

        if gene in self.database:
            database_diff = edit_distance(sibling, self.database[gene])
            database_changes = describe_nt_change(self.database[gene], sibling)
        else:
            database_diff = None
            database_changes = None

        # Build the Candidate
        sequence_id = gene if database_diff == 0 else unique_name(gene, sibling)
        chain = self._guess_chain(sibling_info.group)
        cdr3_start = self._guess_cdr3_start(sibling_info.group)
        ratio = safe_divide(info['exact'].count, info['exact'].unique_CDR3)

        # Apply some very light filtering on non-database sequences.
        # database_diff is None if the gene is not in the database, and
        # "None > 0" raises a TypeError. Anything that is not an exact
        # database match is therefore treated as a non-database sequence,
        # in line with how sequence_id is chosen above.
        if database_diff != 0 and info['exact'].count < 2:
            continue

        candidate = Candidate(
            name=sequence_id,
            source=gene,
            chain=chain,
            cluster=sibling_info.name,
            cluster_size=info['window'].count,
            Js=info['window'].unique_J,
            CDR3s=info['window'].unique_CDR3,
            exact=info['exact'].count,
            full_exact=info['full_exact'].count,
            barcodes_exact=info['exact'].unique_barcodes,
            Ds_exact=info['exact'].unique_D,
            Js_exact=info['exact'].unique_J,
            CDR3s_exact=info['exact'].unique_CDR3,
            clonotypes=info['exact'].clonotypes,
            CDR3_exact_ratio=ratio,
            CDR3_shared_ratio=info['exact'].shared_CDR3_ratio,
            N_bases=n_bases,
            database_diff=database_diff,
            database_changes=database_changes,
            has_stop=has_stop(sibling),
            CDR3_start=cdr3_start,
            consensus=sibling,
            read_names=info['exact'].read_names,
        )
        candidates.append(candidate)
    return candidates
def linked(s, t):
    """
    Return whether s and t are at most `distance` edits apart.

    `distance` is taken from the enclosing scope and is also used as the
    limit passed to edit_distance.
    """
    n_edits = edit_distance(s, t, distance)
    return n_edits <= distance