def get_clusters_to_merge():
    """
    Choose the next pair of clusters to merge: the pair containing the two sequences that are closest in hamming fraction.

    Reads <clusters>, <max_per_cluster>, <naive_seqs>, <distances>, <debug> and <glomerate.merge_whatever_you_got> from the
    enclosing scope, and caches every newly computed hamming fraction in <distances> under both key orders.

    Returns the tuple (clust_a, clust_b), or None if no pair was eligible.
    """
    best_pair, best_distance = None, None
    n_too_big = 0
    for clust_a, clust_b in itertools.combinations(clusters, 2):
        # a merged cluster that would be too big is passed over, so smaller (albeit further-apart) pairs get merged instead
        if len(clust_a) + len(clust_b) > max_per_cluster and not glomerate.merge_whatever_you_got:
            n_too_big += 1
            continue

        # smallest hamming fraction between any sequence in <clust_a> and any sequence in <clust_b>
        closest = None
        for query_a, query_b in itertools.product(clust_a, clust_b):
            forward_key = query_a + ';' + query_b
            if forward_key not in distances:
                distances[forward_key] = utils.hamming_fraction(naive_seqs[query_a], naive_seqs[query_b])
                distances[query_b + ';' + query_a] = distances[forward_key]  # reversed key too, in case we meet the pair that way later
            if closest is None or distances[forward_key] < closest:
                closest = distances[forward_key]

        if best_distance is None or closest < best_distance:
            best_distance = closest
            best_pair = (clust_a, clust_b)

    if debug and n_too_big > 0:
        print(' skipped: %d ' % n_too_big)

    return best_pair
def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None, debug=False):
    """
    Cluster the sequences in <seqfos> with k-means, optionally after embedding them with sklearn's MDS.

    The dissimilarity matrix is the pairwise hamming fraction between (aligned) sequences.
    NOTE set <n_components> to None to run plain kmeans, without mds (in which case kmeans is fit directly on the distance matrix) TODO clean this up

    <seqfos>: list of dicts with 'name' and 'seq' keys; names must be unique (raises Exception otherwise).
    <aligned>: if False, the sequences are first passed through utils.align_many_seqs().
    <plotdir>: if set, the clusters are plotted there, and if <reco_info> is also set a second plot is labeled by the <region> gene.

    Returns the partition from utils.group_seqs_by_value(), i.e. the sequence names grouped by kmeans label.
    """
    start = time.time()
    assert n_clusters is not None

    # These are both slow af to import, even on local ssd, so they're imported here rather than at module level.
    # They're imported unconditionally (repeat imports are just a sys.modules lookup): looking them up as attributes
    # of sys.modules['sklearn'] fails if something else only did a bare 'import sklearn'.
    from sklearn import manifold
    from sklearn.cluster import KMeans

    if len(set(sfo['name'] for sfo in seqfos)) != len(seqfos):
        raise Exception('duplicate sequence ids in <seqfos>')

    if not aligned:  # NOTE unlike the bios2mds version above, this modifies <seqfos>
        if debug:
            print('align')
        seqfos = utils.align_many_seqs(seqfos)

    if debug:
        print(' distances')
    # condensed upper-triangle list (i < j) expanded to a square matrix; an alternative using scipy's pdist with the
    # 'hamming' metric on integer-translated sequences was left disabled in the original
    similarities = scipy.spatial.distance.squareform([utils.hamming_fraction(seqfos[i]['seq'], seqfos[j]['seq']) for i in range(len(seqfos)) for j in range(i + 1, len(seqfos))])
    random_state = numpy.random.RandomState(seed=seed)

    pos = None
    if n_components is not None:
        if debug:
            print(' mds')
        mds = manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
        pos = mds.fit_transform(similarities)

    if debug:
        print(' kmeans clustering with %d clusters' % n_clusters)
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos if pos is not None else similarities)
    pcvals = {seqfos[iseq]['name'] : pos[iseq] if pos is not None else None for iseq in range(len(seqfos))}  # embedded coordinates for each sequence (None if mds wasn't run)
    labels = {seqfos[iseq]['name'] : kmeans.labels_[iseq] for iseq in range(len(seqfos))}
    partition = utils.group_seqs_by_value(pcvals.keys(), lambda q: labels[q])

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        if debug:
            print(' plot')
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)

        if reco_info is not None:
            labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels)

    if debug:
        print(' kmeans time %.1f' % (time.time() - start))

    return partition
def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
    """
    Hamming distance between the inferred naive sequence and the true naive sequence.
    <restrict_to_region> if set, restrict the comparison to the section of the *true* sequence assigned to the given region.
    NOTE this will not in general correspond to the similarly-assigned region in the inferred naive sequence.
    <padfo> if set, dict with 'padleft' and 'padright' giving the number of padding characters to strip from the inferred sequence.
    if <normalize> divide by sequence length (returned as an integer percentage)

    Raises Exception if the two sequences still differ in length after the adjustments below.
    """
    true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
    inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

    if len(true_line['seq']) > len(line['seq']):  # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
        start = true_line['seq'].find(line['seq'])
        assert start >= 0
        end = len(line['seq']) + start
        left_hack_add_on = true_line['seq'][: start]
        right_hack_add_on = true_line['seq'][end :]
        # pad with Ns so the positions line up with the true sequence
        inferred_naive_seq = 'N' * len(left_hack_add_on) + inferred_naive_seq + 'N' * len(right_hack_add_on)
        if debug:
            print(' adding to inferred naive seq')

    if padfo is not None:  # remove N padding from the inferred sequence
        inferred_naive_seq = inferred_naive_seq[padfo['padleft'] : ]
        if padfo['padright'] > 0:  # a [: -0] slice would give the empty string
            inferred_naive_seq = inferred_naive_seq[ : -padfo['padright']]

    bounds = None
    if restrict_to_region != '':
        bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
        true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
        inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

    if debug:
        print(' '.join([str(restrict_to_region), 'region, bounds', str(bounds)]))
        print(' '.join([' true ', str(true_naive_seq)]))
        print(' '.join([' infer', str(inferred_naive_seq)]))

    if len(true_naive_seq) != len(inferred_naive_seq):
        raise Exception('still not the same lengths for %s\n %s\n %s' % (query_name, true_naive_seq, inferred_naive_seq))
    fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
    # round before converting: fraction * length can land just below the integer distance (e.g. (1 / 49.) * 49), which int() alone would truncate
    total_distance = int(round(fraction * len_excluding_ambig))
    if len(true_naive_seq) == 0:
        print('WARNING zero length sequence in hamming_distance_to_true_naive')
        return 0

    if normalize:
        return int(100 * (float(total_distance) / len(true_naive_seq)))
    else:
        return total_distance
def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None):
    """
    Embed the sequences in <seqfos> with sklearn's MDS (on pairwise hamming fractions), then cluster the embedding with k-means.

    <seqfos>: list of dicts with 'name' and 'seq' keys; names must be unique (raises Exception otherwise).
    <aligned>: if False, the sequences are first passed through utils.align_many_seqs().
    <plotdir>: if set, the clusters are plotted there, and if <reco_info> is also set a second plot is labeled by the <region> gene.

    Returns the partition, i.e. a list of lists of sequence names, one list per kmeans label.
    """
    print('%s not testing this after moving these imports down here' % utils.color('red', 'hey'))
    from sklearn import manifold  # these are both slow af to import, even on local ssd
    from sklearn.cluster import KMeans

    if len(seqfos) != len(set(sfo['name'] for sfo in seqfos)):
        raise Exception('duplicate sequence ids in <seqfos>')

    print('align')
    if not aligned:  # NOTE unlike the bios2mds version above, this modifies <seqfos>
        seqfos = utils.align_many_seqs(seqfos)

    print(' distances')
    # condensed list of distances for each pair (i < j), expanded to a square matrix
    pairwise_hfracs = []
    for sfo_a, sfo_b in itertools.combinations(seqfos, 2):
        pairwise_hfracs.append(utils.hamming_fraction(sfo_a['seq'], sfo_b['seq']))
    similarities = scipy.spatial.distance.squareform(pairwise_hfracs)

    print(' mds')
    random_state = numpy.random.RandomState(seed=seed)
    mds = manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
    pos = mds.fit_transform(similarities)

    print(' kmeans')
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos)
    pcvals, labels = {}, {}
    for iseq, sfo in enumerate(seqfos):
        pcvals[sfo['name']] = pos[iseq]
        labels[sfo['name']] = kmeans.labels_[iseq]

    # group the names by kmeans label (should really integrate this with utils.collapse_naive_seqs()/utils.split_partition_with_criterion())
    uids_by_label = sorted(pcvals, key=lambda uid: labels[uid])
    partition = [list(members) for _, members in itertools.groupby(uids_by_label, key=lambda uid: labels[uid])]

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        print(' plot')
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)

        if reco_info is not None:
            labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels)

    return partition
def plot_within_vs_between_hists(self, partition, annotations, base_plotdir):
    """
    Plot within-cluster distances (each annotation's 'mut_freqs') against between-cluster distances (hamming fraction between
    the naive sequences of each pair of clusters with the same cdr3 length), both overall and separately for each cdr3 length.
    <annotations> is keyed by the ':'-joined uids of each cluster.
    """
    def naive_seq_of(cluster):
        return annotations[':'.join(cluster)]['naive_seq']

    classes = self.get_cdr3_length_classes(partition, annotations)

    all_within = []
    for info in annotations.values():
        all_within.extend(info['mut_freqs'])
    overall_distances = {'within' : all_within, 'between' : []}

    sub_distances = {}
    for cdr3_length, clusters in classes.items():  # for each cdr3 length, loop over each pair of clusters that have that cdr3 length
        # NOTE/TODO I'm extremely unhappy that I have to put the naive seq length check here. But we pad cdr3 length subclasses to the same length during smith waterman, and by the time we get to here, in very rare cases the cdr3 length has changed.
        hfracs = []  # hamming fractions for each pair of clusters with this cdr3 length
        for cl_a, cl_b in itertools.combinations(clusters, 2):
            seq_a, seq_b = naive_seq_of(cl_a), naive_seq_of(cl_b)
            if len(seq_a) != len(seq_b):
                continue
            hfracs.append(utils.hamming_fraction(seq_a, seq_b))

        within = []
        for cluster in clusters:
            within.extend(annotations[':'.join(cluster)]['mut_freqs'])

        sub_distances[cdr3_length] = {'within' : within, 'between' : hfracs}
        overall_distances['between'].extend(hfracs)

    self.plot_each_within_vs_between_hist(overall_distances, base_plotdir + '/overall', 'within-vs-between', '')
    for cdr3_length, subd in sub_distances.items():
        self.plot_each_within_vs_between_hist(subd, base_plotdir + '/within-vs-between', 'cdr3-length-%d' % cdr3_length, 'CDR3 %d' % cdr3_length)
def from_same_lineage(cluster_id, uid):
    """
    Return True if <uid> matches at least one sequence already in cluster <cluster_id>, otherwise False.

    A match requires the same cdr3 length, v gene and j gene, equal-length "d plus insertions" sequences, and a hamming
    fraction between those sequences that does not exceed 1 - <threshold>.
    Reads <id_clusters>, <info>, <threshold> and get_d_plus_insertions() from the enclosing scope.
    """
    required_keys = ('cdr3_length', 'v_gene', 'j_gene')
    for member in id_clusters[cluster_id]:  # it only has to match one of the seqs already in the cluster
        if any(info[member][key] != info[uid][key] for key in required_keys):
            continue

        member_seq = get_d_plus_insertions(member)
        query_seq = get_d_plus_insertions(uid)
        if len(member_seq) != len(query_seq):
            continue

        too_far = utils.hamming_fraction(member_seq, query_seq) > 1. - threshold
        if not too_far:
            return True

    return False
def get_gene_pair_matrix(genelist, difftype):
    """
    Return matrix comparing all pairs of genes in <genelist>.

    Only entries above the diagonal are filled in; the diagonal and everything below it are 0.
    <difftype> is 'hamming', 'indels' or 'subs' (anything else raises Exception as soon as a pair is compared).
    Sequences are taken from glfo['aligned-genes']['v'].
    """
    def difference_fraction(s1, s2):
        if difftype == 'hamming':
            fraction, _ = utils.hamming_fraction(s1, s2, return_len_excluding_ambig=True, extra_bases='.')
            return fraction
        if difftype == 'indels':
            return indel_difference_fraction(s1, s2)
        if difftype == 'subs':
            return substitution_difference_fraction(s1, s2)
        raise Exception('unexpected difftype %s' % difftype)

    n_genes = len(genelist)
    smatrix = []
    for iv in range(n_genes):
        row = [0.] * (iv + 1)  # diagonal and below
        for jv in range(iv + 1, n_genes):
            s1 = glfo['aligned-genes']['v'][genelist[iv]]
            s2 = glfo['aligned-genes']['v'][genelist[jv]]
            row.append(difference_fraction(s1, s2))
        smatrix.append(row)
    return smatrix
def plot_within_vs_between_hists(self, partition, annotations, base_plotdir):
    """
    Make histograms comparing within-cluster distances (the 'mut_freqs' of each annotation) to between-cluster distances
    (hamming fraction between the naive sequences of each pair of clusters sharing a cdr3 length).
    One overall plot is made, plus one for each cdr3 length. <annotations> is keyed by the ':'-joined uids of each cluster.
    """
    def annotation_for(cluster):
        return annotations[':'.join(cluster)]

    classes = self.get_cdr3_length_classes(partition, annotations)

    overall_within = []
    for info in annotations.values():
        overall_within.extend(info['mut_freqs'])
    overall_distances = {'within': overall_within, 'between': []}

    sub_distances = {}
    for cdr3_length, clusters in classes.items():  # for each cdr3 length, loop over each pair of clusters that have that cdr3 length
        # NOTE/TODO I'm extremely unhappy that I have to put the naive seq length check here. But we pad cdr3 length subclasses to the same length during smith waterman, and by the time we get to here, in very rare cases the cdr3 length has changed.
        between = []  # hamming fractions for each pair of clusters with this cdr3 length
        for cl_a, cl_b in itertools.combinations(clusters, 2):
            naive_a = annotation_for(cl_a)['naive_seq']
            naive_b = annotation_for(cl_b)['naive_seq']
            if len(naive_a) == len(naive_b):
                between.append(utils.hamming_fraction(naive_a, naive_b))

        within = []
        for cluster in clusters:
            within.extend(annotation_for(cluster)['mut_freqs'])

        sub_distances[cdr3_length] = {'within': within, 'between': between}
        overall_distances['between'].extend(between)

    self.plot_each_within_vs_between_hist(overall_distances, base_plotdir + '/overall', 'within-vs-between', '')
    for cdr3_length, subd in sub_distances.items():
        self.plot_each_within_vs_between_hist(subd, base_plotdir + '/within-vs-between', 'cdr3-length-%d' % cdr3_length, 'CDR3 %d' % cdr3_length)
def get_gene_set_mean_matrix(genesets, difftype):
    """
    Return matrix comparing the sets of genes in <genesets>, i.e. each entry is the average over all pairs of sequences in set 1 and set 2.

    Only entries above the diagonal are filled in; the diagonal and everything below it are 0 (as is the mean for an empty set).
    <difftype> is 'hamming', 'indels' or 'subs' (anything else raises Exception as soon as a pair is compared).
    Sequences are taken from glfo['aligned-genes']['v'].
    """
    def difference_fraction(s1, s2):
        if difftype == 'hamming':
            fraction, _ = utils.hamming_fraction(s1, s2, return_len_excluding_ambig=True, extra_bases='.')
            return fraction
        if difftype == 'indels':
            return indel_difference_fraction(s1, s2)
        if difftype == 'subs':
            return substitution_difference_fraction(s1, s2)
        raise Exception('unexpected difftype %s' % difftype)

    genenames = list(genesets.values())
    n_sets = len(genenames)
    smatrix = []
    for iv in range(n_sets):
        row = [0.] * (iv + 1)  # diagonal and below
        for jv in range(iv + 1, n_sets):
            seqs1 = [glfo['aligned-genes']['v'][g] for g in genenames[iv]]
            seqs2 = [glfo['aligned-genes']['v'][g] for g in genenames[jv]]

            total, nfractions = 0., 0
            for s1 in seqs1:
                for s2 in seqs2:
                    total += difference_fraction(s1, s2)
                    nfractions += 1

            if nfractions == 0:
                row.append(0.)
            else:
                row.append(float(total) / nfractions)
        smatrix.append(row)
    return smatrix
def from_same_lineage(cluster_id, uid):
    """
    Decide whether <uid> belongs with cluster <cluster_id>: True as soon as it matches any one sequence already in the cluster.

    Matching means identical cdr3 length, v gene and j gene, "d plus insertions" sequences of equal length, and a hamming
    fraction between them that is not greater than 1 - <threshold>.
    Uses <id_clusters>, <info>, <threshold> and get_d_plus_insertions() from the enclosing scope.
    """
    def same_rearrangement_class(clid):
        # same cdr3 length, v gene, and j gene
        return all(info[clid][key] == info[uid][key] for key in ('cdr3_length', 'v_gene', 'j_gene'))

    for clid in id_clusters[cluster_id]:  # loop over seqs already in the cluster (it only has to match one of 'em)
        if not same_rearrangement_class(clid):
            continue
        cl_seq = get_d_plus_insertions(clid)
        u_seq = get_d_plus_insertions(uid)
        if len(cl_seq) == len(u_seq):
            hamming_frac = utils.hamming_fraction(cl_seq, u_seq)
            if not hamming_frac > 1. - threshold:
                return True  # if we get to here, it's a match
    return False
def get_clusters_to_merge():
    """
    Find the two clusters which contain the pair of sequences which are closest in hamming fraction, skipping cluster pairs
    that would make a cluster that's too big (unless <glomerate.merge_whatever_you_got> is set).

    Uses <clusters>, <max_per_cluster>, <naive_seqs>, <distances> and <debug> from the enclosing scope; hamming fractions
    that aren't yet in <distances> are computed and stored there under both 'a;b' and 'b;a'.

    Returns (clust_a, clust_b), or None if every pair was skipped (or there were fewer than two clusters).
    """
    def pair_distance(query_a, query_b):
        joint_key = query_a + ';' + query_b
        if joint_key not in distances:
            distances[joint_key] = utils.hamming_fraction(naive_seqs[query_a], naive_seqs[query_b])
            distances[query_b + ';' + query_a] = distances[joint_key]  # also add with key in reverse order, in case we run into the pair that way later on
        return distances[joint_key]

    winner, winning_distance = None, None
    n_skipped = 0
    for clust_a, clust_b in itertools.combinations(clusters, 2):
        merged_size = len(clust_a) + len(clust_b)
        if merged_size > max_per_cluster and not glomerate.merge_whatever_you_got:  # merged cluster would be too big, so look for smaller (albeit further-apart) things to merge
            n_skipped += 1
            continue

        # find the smallest hamming distance between any two sequences in the two clusters
        min_distance = None
        for query_a in clust_a:
            for query_b in clust_b:
                this_distance = pair_distance(query_a, query_b)
                if min_distance is None or this_distance < min_distance:
                    min_distance = this_distance

        if winning_distance is None or min_distance < winning_distance:
            winner, winning_distance = (clust_a, clust_b), min_distance

    if debug and n_skipped > 0:
        print(' skipped: %d ' % n_skipped)

    return winner
def compare_partition_cachefiles(self, input_stype):
    """
    Compare the partition cache files written by the reference and new versions, and print a summary:
    which keys are only in one of them, and how many naive sequences/log probs differ (and by how much on average) among shared keys.
    NOTE only writing this for the ref input_stype a.t.m.
    """
    ptest = 'partition-' + input_stype + '-simu'
    if args.quick and ptest not in self.quick_tests:
        return

    def print_key_differences(vtype, refkeys, newkeys):
        only_in_ref, only_in_new = refkeys - newkeys, newkeys - refkeys
        print(' %s keys' % vtype)
        if len(only_in_ref) == 0 and len(only_in_new) == 0:
            print(' %d identical keys in new and ref cache' % len(refkeys))
            return
        if len(only_in_ref) > 0:
            print(utils.color('red', ' %d only in ref version' % len(only_in_ref)))
        if len(only_in_new) > 0:
            print(utils.color('red', ' %d only in new version' % len(only_in_new)))
        print(' %d in common' % len(refkeys & newkeys))

    def readcache(fname):
        # empty cells are left out, so each sub-dict only has the keys for which that value was actually written
        naive_seqs, logprobs = {}, {}
        with open(fname) as cachefile:
            for line in csv.DictReader(cachefile):
                if line['naive_seq'] != '':
                    naive_seqs[line['unique_ids']] = line['naive_seq']
                if line['logprob'] != '':
                    logprobs[line['unique_ids']] = float(line['logprob'])
        return {'naive_seqs' : naive_seqs, 'logprobs' : logprobs}

    def redden_if(condition, *strs):
        return [utils.color('red', s) if condition else s for s in strs]

    print(' %s input partition cache file' % input_stype)

    refcache = readcache(self.dirs['ref'] + '/' + self.cachefnames[input_stype])
    newcache = readcache(self.dirs['new'] + '/' + self.cachefnames[input_stype])

    # work out intersection and complement
    refkeys, newkeys = {}, {}
    for vtype in ['naive_seqs', 'logprobs']:
        refkeys[vtype] = set(refcache[vtype].keys())
        newkeys[vtype] = set(newcache[vtype].keys())
        print_key_differences(vtype, refkeys[vtype], newkeys[vtype])

    # naive sequences: hamming fraction between ref and new for each shared key
    hamming_eps = 0.
    big_hfracs = []  # only the ones that differ
    n_hammings, n_different_length = 0, 0
    for uids in refkeys['naive_seqs'] & newkeys['naive_seqs']:
        refseq = refcache['naive_seqs'][uids]
        newseq = newcache['naive_seqs'][uids]
        n_hammings += 1
        if len(refseq) != len(newseq):
            n_different_length += 1
            continue
        hfrac = utils.hamming_fraction(refseq, newseq)
        if hfrac > hamming_eps:
            big_hfracs.append(hfrac)

    diff_hfracs_str = '%3d / %4d' % (len(big_hfracs), n_hammings)
    mean_hfrac_str = '%.3f' % (numpy.average(big_hfracs) if len(big_hfracs) > 0 else 0.)
    diff_hfracs_str, mean_hfrac_str = redden_if(len(big_hfracs) > 0, diff_hfracs_str, mean_hfrac_str)

    # log probs: absolute difference between ref and new for each shared key
    logprob_eps = 1e-5
    big_deltas = []  # only the ones that differ
    n_delta_logprobs = 0
    for uids in refkeys['logprobs'] & newkeys['logprobs']:
        n_delta_logprobs += 1
        abs_delta = abs(refcache['logprobs'][uids] - newcache['logprobs'][uids])
        if abs_delta > logprob_eps:
            big_deltas.append(abs_delta)

    diff_logprob_str = '%3d / %4d' % (len(big_deltas), n_delta_logprobs)
    mean_logprob_str = '%.3f' % (numpy.average(big_deltas) if len(big_deltas) > 0 else 0.)
    diff_logprob_str, mean_logprob_str = redden_if(len(big_deltas) > 0, diff_logprob_str, mean_logprob_str)

    print(' fraction different mean abs difference among differents')
    print(' naive seqs %s %s (hamming fraction)' % (diff_hfracs_str, mean_hfrac_str))
    print(' log probs %s %s' % (diff_logprob_str, mean_logprob_str))
    if n_different_length > 0:
        print(utils.color('red', ' %d different length' % n_different_length))
def compare_partition_cachefiles(self, input_stype):
    """
    Compare the partition cache files written by the reference and new versions, and print a summary:
    which keys are only in one of them, and how many naive sequences/log probs differ (and by how much on average) among shared keys.
    NOTE only writing this for the ref input_stype a.t.m.
    """
    ptest = 'partition-' + input_stype + '-simu'
    if args.quick and ptest not in self.quick_tests:
        return

    print('%s input partition cache file' % input_stype)

    def readcache(fname):
        cache = {}
        with open(fname) as cachefile:
            reader = csv.DictReader(cachefile)
            for line in reader:
                # an empty logprob cell is stored as None: calling float() on it unconditionally would raise ValueError
                logprob = float(line['logprob']) if line['logprob'] != '' else None
                cache[line['unique_ids']] = {'naive_seq' : line['naive_seq'], 'logprob' : logprob}
        return cache

    refcache = readcache(self.dirs['ref'] + '/' + self.cachefnames[input_stype])
    newcache = readcache(self.dirs['new'] + '/' + self.cachefnames[input_stype])

    # work out intersection and complement
    refkeys = set(refcache.keys())
    newkeys = set(newcache.keys())
    if len(refkeys - newkeys) > 0 or len(newkeys - refkeys) > 0:
        if len(refkeys - newkeys) > 0:
            print(utils.color('red', ' %d only in ref version' % len(refkeys - newkeys)))
        if len(newkeys - refkeys) > 0:
            print(utils.color('red', ' %d only in new version' % len(newkeys - refkeys)))
        print(' %d in common' % len(refkeys & newkeys))
    else:
        print(' %d identical keys in new and ref cache' % len(refkeys))

    hammings, delta_logprobs = [], []  # only the values that differ by more than the corresponding eps
    n_hammings, n_delta_logprobs = 0, 0
    n_different_length, n_big_hammings, n_big_delta_logprobs = 0, 0, 0
    hamming_eps = 0.
    logprob_eps = 1e-5
    for uids in refkeys & newkeys:
        refline = refcache[uids]
        newline = newcache[uids]
        if refline['naive_seq'] != '':
            n_hammings += 1
            if len(refline['naive_seq']) == len(newline['naive_seq']):
                hamming_fraction = utils.hamming_fraction(refline['naive_seq'], newline['naive_seq'])
                if hamming_fraction > hamming_eps:
                    n_big_hammings += 1
                    hammings.append(hamming_fraction)
            else:
                n_different_length += 1
        if refline['logprob'] is not None and newline['logprob'] is not None:  # can only compare if both versions wrote a logprob
            n_delta_logprobs += 1
            delta_logprob = abs(refline['logprob'] - newline['logprob'])
            if delta_logprob > logprob_eps:
                n_big_delta_logprobs += 1
                delta_logprobs.append(delta_logprob)

    diff_hfracs_str = '%3d / %4d' % (n_big_hammings, n_hammings)
    mean_hfrac_str = '%.3f' % (numpy.average(hammings) if len(hammings) > 0 else 0.)
    if n_big_hammings > 0:
        diff_hfracs_str = utils.color('red', diff_hfracs_str)
        mean_hfrac_str = utils.color('red', mean_hfrac_str)

    diff_logprob_str = '%3d / %4d' % (n_big_delta_logprobs, n_delta_logprobs)
    mean_logprob_str = '%.6f' % (numpy.average(delta_logprobs) if len(delta_logprobs) > 0 else 0.)
    if n_big_delta_logprobs > 0:
        diff_logprob_str = utils.color('red', diff_logprob_str)
        mean_logprob_str = utils.color('red', mean_logprob_str)

    print(' fraction different mean difference among differents')
    print(' naive seqs %s %s (hamming fraction)' % (diff_hfracs_str, mean_hfrac_str))
    print(' log probs %s %s' % (diff_logprob_str, mean_logprob_str))
    if n_different_length > 0:
        print(utils.color('red', ' %d different length' % n_different_length))
def compare_partition_cachefiles(self, input_stype):
    """
    Compare the partition cache files written by the reference and new versions, and print a summary:
    which keys are only in one of them, and how many naive sequences/log probs differ (and by how much on average) among shared keys.
    NOTE only writing this for the ref input_stype a.t.m.
    """
    ptest = "partition-" + input_stype + "-simu"
    if args.quick and ptest not in self.quick_tests:
        return

    print("%s partition cache file" % input_stype)

    def readcache(fname):
        cache = {}
        with open(fname) as cachefile:
            reader = csv.DictReader(cachefile)
            for line in reader:
                # an empty logprob cell is stored as None: calling float() on it unconditionally would raise ValueError
                logprob = float(line["logprob"]) if line["logprob"] != "" else None
                cache[line["unique_ids"]] = {"naive_seq": line["naive_seq"], "logprob": logprob}
        return cache

    refcache = readcache(self.dirs["ref"] + "/" + self.cachefnames[input_stype])
    newcache = readcache(self.dirs["new"] + "/" + self.cachefnames[input_stype])

    # work out intersection and complement
    refkeys = set(refcache.keys())
    newkeys = set(newcache.keys())
    if len(refkeys - newkeys) > 0 or len(newkeys - refkeys) > 0:
        if len(refkeys - newkeys) > 0:
            print(utils.color("red", " %d only in ref" % len(refkeys - newkeys)))
        if len(newkeys - refkeys) > 0:
            print(utils.color("red", " %d only in new" % len(newkeys - refkeys)))
        print(" %d in common" % len(refkeys & newkeys))
    else:
        print(" %d identical keys in new and ref cache" % len(refkeys))

    hammings, delta_logprobs = [], []  # only the values that differ by more than the corresponding eps
    n_hammings, n_delta_logprobs = 0, 0
    n_different_length, n_big_hammings, n_big_delta_logprobs = 0, 0, 0
    hamming_eps = 0.0
    logprob_eps = 1e-5
    for uids in refkeys & newkeys:
        refline = refcache[uids]
        newline = newcache[uids]
        if refline["naive_seq"] != "":
            n_hammings += 1
            if len(refline["naive_seq"]) == len(newline["naive_seq"]):
                hamming_fraction = utils.hamming_fraction(refline["naive_seq"], newline["naive_seq"])
                if hamming_fraction > hamming_eps:
                    n_big_hammings += 1
                    hammings.append(hamming_fraction)
            else:
                n_different_length += 1
        if refline["logprob"] is not None and newline["logprob"] is not None:  # can only compare if both versions wrote a logprob
            n_delta_logprobs += 1
            delta_logprob = abs(refline["logprob"] - newline["logprob"])
            if delta_logprob > logprob_eps:
                n_big_delta_logprobs += 1
                delta_logprobs.append(delta_logprob)

    diff_hfracs_str = "%3d / %4d" % (n_big_hammings, n_hammings)
    mean_hfrac_str = "%.3f" % (numpy.average(hammings) if len(hammings) > 0 else 0.0)
    if n_big_hammings > 0:
        diff_hfracs_str = utils.color("red", diff_hfracs_str)
        mean_hfrac_str = utils.color("red", mean_hfrac_str)

    diff_logprob_str = "%3d / %4d" % (n_big_delta_logprobs, n_delta_logprobs)
    mean_logprob_str = "%.6f" % (numpy.average(delta_logprobs) if len(delta_logprobs) > 0 else 0.0)
    if n_big_delta_logprobs > 0:
        diff_logprob_str = utils.color("red", diff_logprob_str)
        mean_logprob_str = utils.color("red", mean_logprob_str)

    print(" fraction different mean difference among differents")
    print(" naive seqs %s %s (hamming fraction)" % (diff_hfracs_str, mean_hfrac_str))
    print(" log probs %s %s" % (diff_logprob_str, mean_logprob_str))
    if n_different_length > 0:
        print(utils.color("red", " %d different length" % n_different_length))
def compare_partition_cachefiles(self, input_stype):
    """
    Print a comparison of the reference and new partition cache files: the keys that are only in one of them, and for the
    shared keys how many naive sequences (by hamming fraction) and log probs (by absolute difference) differ, with the mean among those that do.
    NOTE only writing this for the ref input_stype a.t.m.
    """
    ptest = 'partition-' + input_stype + '-simu'
    if args.quick and ptest not in self.quick_tests:
        return

    # ----------------------------------------------------------------------------------------
    def print_key_differences(vtype, refkeys, newkeys):
        n_only_ref = len(refkeys - newkeys)
        n_only_new = len(newkeys - refkeys)
        print(' %s keys' % vtype)
        if n_only_ref > 0 or n_only_new > 0:
            if n_only_ref > 0:
                print(utils.color('red', ' %d only in ref version' % n_only_ref))
            if n_only_new > 0:
                print(utils.color('red', ' %d only in new version' % n_only_new))
            print(' %d in common' % len(refkeys & newkeys))
        else:
            print(' %d identical keys in new and ref cache' % len(refkeys))

    # ----------------------------------------------------------------------------------------
    def readcache(fname):
        # empty cells are left out, so each sub-dict only has the keys for which that value was actually written
        cache = {'naive_seqs' : {}, 'logprobs' : {}}
        with open(fname) as cachefile:
            for line in csv.DictReader(cachefile):
                if line['naive_seq'] != '':
                    cache['naive_seqs'][line['unique_ids']] = line['naive_seq']
                if line['logprob'] != '':
                    cache['logprobs'][line['unique_ids']] = float(line['logprob'])
        return cache

    # ----------------------------------------------------------------------------------------
    def summary_strs(n_big, n_total, big_vals):
        # "N different / N total" and the mean over the different ones, both in red if anything differed
        diff_str = '%3d / %4d' % (n_big, n_total)
        mean_str = '%.3f' % (numpy.average(big_vals) if len(big_vals) > 0 else 0.)
        if n_big > 0:
            diff_str = utils.color('red', diff_str)
            mean_str = utils.color('red', mean_str)
        return diff_str, mean_str

    print(' %s input partition cache file' % input_stype)

    cachefname = self.cachefnames[input_stype]
    refcache = readcache(self.dirs['ref'] + '/' + cachefname)
    newcache = readcache(self.dirs['new'] + '/' + self.cachefnames[input_stype])

    # work out intersection and complement
    refkeys, newkeys = {}, {}
    for vtype in ['naive_seqs', 'logprobs']:
        refkeys[vtype] = set(refcache[vtype].keys())
        newkeys[vtype] = set(newcache[vtype].keys())
        print_key_differences(vtype, refkeys[vtype], newkeys[vtype])

    hamming_eps = 0.
    hammings = []  # only the ones that differ
    n_hammings, n_different_length = 0, 0
    vtype = 'naive_seqs'
    for uids in refkeys[vtype] & newkeys[vtype]:
        n_hammings += 1
        refseq, newseq = refcache[vtype][uids], newcache[vtype][uids]
        if len(refseq) != len(newseq):
            n_different_length += 1
            continue
        hamming_fraction = utils.hamming_fraction(refseq, newseq)
        if hamming_fraction > hamming_eps:
            hammings.append(hamming_fraction)
    diff_hfracs_str, mean_hfrac_str = summary_strs(len(hammings), n_hammings, hammings)

    logprob_eps = 1e-5
    abs_delta_logprobs = []  # only the ones that differ
    n_delta_logprobs = 0
    vtype = 'logprobs'
    for uids in refkeys[vtype] & newkeys[vtype]:
        n_delta_logprobs += 1
        refval, newval = refcache[vtype][uids], newcache[vtype][uids]
        abs_delta_logprob = abs(refval - newval)
        if abs_delta_logprob > logprob_eps:
            # print '%s %s ref %f new %f' % (vtype, uids, refval, newval)
            abs_delta_logprobs.append(abs_delta_logprob)
    diff_logprob_str, mean_logprob_str = summary_strs(len(abs_delta_logprobs), n_delta_logprobs, abs_delta_logprobs)

    print(' fraction different mean abs difference among differents')
    print(' naive seqs %s %s (hamming fraction)' % (diff_hfracs_str, mean_hfrac_str))
    print(' log probs %s %s' % (diff_logprob_str, mean_logprob_str))
    if n_different_length > 0:
        print(utils.color('red', ' %d different length' % n_different_length))
def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
    """
    Hamming distance between the inferred naive sequence and the true naive sequence (both read from the 'naive_seq' key).
    <restrict_to_region> if set, restrict the comparison to the section of the *true* sequence assigned to the given region.
    NOTE this will not in general correspond to the similarly-assigned region in the inferred naive sequence.
    <padfo> if set, dict with 'padleft' and 'padright' giving the amount of N padding to try to strip from the inferred sequence.
    if <normalize> divide by sequence length (returned as an integer percentage)

    Raises Exception if the two naive sequences differ in length, either on entry or after the adjustments below.
    """
    true_naive_seq = true_line['naive_seq']
    inferred_naive_seq = line['naive_seq']

    if len(true_naive_seq) != len(inferred_naive_seq):
        # dump both lines side by side before bailing out
        print('%20s true inf' % '')
        for k in true_line:
            inferred_str = ' %s' % line[k] if k in line else ' NOPE'
            print('%20s %s' % (k, true_line[k]) + ' ' + inferred_str)
        for k in line:
            if k not in true_line:
                print(' not in true line %20s %s' % (k, line[k]))
        raise Exception('%s true and inferred sequences not the same length\n %s\n %s\n' % (line['unique_id'], true_naive_seq, inferred_naive_seq))

    # NOTE an older hack lived here that added Ns to the inferred naive seq when true_line['seq'] was longer than line['seq']
    # (ihhhmmm doesn't report the bits of the sequence it erodes off the ends); it is disabled.

    if padfo is not None:  # remove N padding from the inferred sequence
        if debug:
            print('removing padfo')
            print(inferred_naive_seq)
        # NOTE(review): this counts Ns *after* the first <padleft> characters rather than within them, which looks unintended.
        # Left as is: if the padding really were stripped here the lengths (equal at this point) would no longer match below -- confirm intent before changing.
        if inferred_naive_seq[padfo['padleft'] : ].count('N') == padfo['padleft']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
            inferred_naive_seq = inferred_naive_seq[padfo['padleft'] : ]
        elif debug:  # NOTE if no debug, we just fall through, which is ok
            print('tried to remove non Ns!\n %s\n padleft %d\n' % (inferred_naive_seq, padfo['padleft']))
        if padfo['padright'] > 0:
            # NOTE(review): this inspects the first <padright> characters, not the last -- same caveat as for padleft
            if inferred_naive_seq[ : padfo['padright']].count('N') == padfo['padright']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
                inferred_naive_seq = inferred_naive_seq[ : -padfo['padright']]
            elif debug:  # NOTE if no debug, we just fall through, which is ok
                print('tried to remove non Ns!\n %s\n padright %d\n' % (inferred_naive_seq, padfo['padright']))
        if debug:
            print(padfo['padleft'] * ' ' + inferred_naive_seq + padfo['padleft'] * ' ')

    bounds = None
    if restrict_to_region != '':
        bounds = true_line['regional_bounds'][restrict_to_region]  # bounds of this region in the *true* line
        if debug:
            print('restrict to %s' % restrict_to_region)
            utils.color_mutants(true_naive_seq, inferred_naive_seq, print_result=True, extra_str=' ')
            utils.color_mutants(true_naive_seq[bounds[0] : bounds[1]], inferred_naive_seq[bounds[0] : bounds[1]], print_result=True, extra_str=' ' + bounds[0]*' ')
        true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
        inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

    if len(true_naive_seq) != len(inferred_naive_seq):
        raise Exception('still not the same lengths for %s\n %s\n %s' % (query_name, true_naive_seq, inferred_naive_seq))
    fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
    # round before converting: fraction * length can land just below the integer distance (e.g. (1 / 49.) * 49), which int() alone would truncate
    total_distance = int(round(fraction * len_excluding_ambig))
    if len(true_naive_seq) == 0:
        print('WARNING zero length sequence in hamming_distance_to_true_naive')
        return 0

    if normalize:
        return int(100 * (float(total_distance) / len(true_naive_seq)))
    else:
        return total_distance
def resolve_discordant_clusters(single_cluster, single_annotation, cluster_list, annotation_list, tdbg=False): # NOTE single_cluster and cluster_list in general have quite different sets of uids, and that's fine. All that matters here is we're trying to find all the clusters that should be split from one another (without doing some all against all horror) if len(cluster_list) == 1: # nothing to do return [single_cluster ] # NOTE <single_cluster> doesn't get used after here adict = utils.get_annotation_dict(annotation_list) cdr3_groups = utils.group_seqs_by_value( cluster_list, lambda c: adict[akey(c)]['cdr3_length'] ) # group the together clusters in <cluster_list> that have the same cdr3 (there's already utils.split_clusters_by_cdr3(), but it uses different inputs (e.g. sw_info) so i think it makes sense to not use it here) if tdbg: print ' %s one cluster vs %d clusters' % (utils.color( 'blue', 'syncing'), len(cluster_list)) print ' split into %d cdr3 groups' % len(cdr3_groups) lo_hbound, hi_hbound = utils.get_naive_hamming_bounds( 'likelihood', overall_mute_freq=numpy.mean( [f for l in annotation_list for f in l['mut_freqs']]) ) # these are the wider bounds, so < lo is almost certainly clonal, > hi is almost certainly not return_clusts = [] for icdr, cdrgroup in enumerate( cdr3_groups ): # within each cdr3 group, split (i.e. 
use the cluster boundaries from cluster_list rather than single_cluster) if naive hfrac is > hi_hbound (but then there's shenanigans to adjudicate between different possibilities) if tdbg: print ' %s hfrac bound %.2f' % (utils.color( 'purple', 'icdr %d' % icdr), hi_hbound) # first figure out who needs to be split from whom clusters_to_split = { akey(c): [] for c in cdrgroup } # map from each cluster ('s key) to a list of clusters from which it should be split for c1, c2 in itertools.combinations( cdrgroup, 2 ): # we could take account of the hfrac of both chains at this point, but looking at only the "split" one rather than the "merged" one, as we do here, is i think equivalent to assuming the merged one has zero hfrac, which is probably fine, since we only split if the split chain is very strongly suggesting we split hfrac = utils.hamming_fraction( adict[akey(c1)]['naive_seq'], adict[akey(c2)]['naive_seq'] ) # all clusters with the same cdr3 len have been padded in waterer so their naive seqs are the same length if hfrac > hi_hbound: clusters_to_split[akey(c1)].append(c2) clusters_to_split[akey(c2)].append(c1) # then do the splitting, which is accomplished by merging each cluster in <cdrgroup> with every other cluster in <cdrgroup> from which we aren't supposed to split it (i.e. that aren't in its <clusters_to_split>) if tdbg: print ' N to new' print ' size split cluster?' tmpclusts_for_return = [ ] # final (return) clusters for this cdr3 class for cclust in cdrgroup: split_clusts = clusters_to_split[akey(cclust)] if tdbg: print ' %4d %3d' % (len(cclust), len(split_clusts)), found_one = False for rclust in tmpclusts_for_return: # look for an existing return cluster to which we can merge cclust, i.e. that doesn't have any uids from which we want to split if any_in_common( [rclust], split_clusts ): # if any uid in rclust is in a cluster from which we want to be split, skip it, i.e. 
don't merge with that cluster (note that we have to do it by uid because the rclusts are already merged so don't necessarily correspond to any existing cluster) continue # if found_one: print 'it happened!' # can't happen any more since I switched to 'break' (although see note below) if tdbg: print ' merging with size %d' % len(rclust) rclust += cclust found_one = True break # i.e. we just merge with the first one we find and stop looking; if there's more than one, it means we could merge all three together if we wanted (triangle inequality-ish, see diagram linked at top of fcn), but i doubt it'll matter either way, and this is easier if not found_one: if tdbg: print ' y' tmpclusts_for_return.append( cclust ) # if we didn't find an existing cluster that we can add it to, add it as a new cluster return_clusts += tmpclusts_for_return if debug: print ' returning: %s' % ' '.join( [str(len(c)) for c in return_clusts]) # ptnprint(return_clusts) return return_clusts