def reassign_template_counts(self, msa_info, new_alleles, debug=False): # XXX need to update family_groups here if len(new_alleles) == 0: return if debug: print ' template new' print ' size snps snps assigned', if self.reco_info is not None: print ' true', print '' dbg_print = debug # don't print all the tiny clusters templates = {newfo['template-gene'] : newfo['gene'] for newfo in new_alleles.values()} self.adjusted_glcounts = {} for clusterfo in sorted(msa_info, key=lambda cfo: len(cfo['seqfos']), reverse=True): sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo) # it would be nice to not re-call this for the clusters we already called it on above for gene, counts in sorted_glcounts: # <gene> is the one assigned by sw before allele clustering if debug and len(clusterfo['seqfos']) < 5: if dbg_print: print ' not printing clusters smaller than 5' dbg_print = False if gene not in self.adjusted_glcounts: # add it before we decide whether to switch it, so a template gene with zero counts will be in there with zero counts self.adjusted_glcounts[gene] = 0 if gene in templates: # if this was a template for a new allele, we have to decide whether to apportion some or all of the sequences in this cluster to that new allele template_gene = gene template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene) cons_seq = clusterfo['cons_seq'] template_seq = self.glfo['seqs'][self.region][template_gene] new_allele_seq = new_alleles[templates[template_gene]]['seq'] compare_len = min([template_cpos, len(cons_seq), len(template_seq), len(new_allele_seq)]) # NOTE this doesn't account for indels, i.e. the template and consensus sequences are in general different lengths, but that's ok, it'll just inflate the hamming distance for sequences that differ from consensus by indels, and all we care is finding the one that doesn't have any indels n_template_snps = utils.hamming_distance(cons_seq[:compare_len], template_seq[:compare_len]) n_new_snps = utils.hamming_distance(cons_seq[:compare_len], new_allele_seq[:compare_len]) if debug and dbg_print: print ' %5d %3d %3d' % (len(clusterfo['seqfos']), n_template_snps, n_new_snps), if n_new_snps < n_template_snps: # reassign to the new allele gene = templates[template_gene] if gene not in self.adjusted_glcounts: # add it before we decide whether to switch it, so a template gene with zero counts will be in there with zero counts self.adjusted_glcounts[gene] = 0 if debug and dbg_print: print ' %s' % utils.color_gene(gene, width=15), if self.reco_info is not None: true_gene = true_sorted_glcounts[0][0] # NOTE this is the most *common* simulated gene in the cluster, not necessarily the one corresponding to these particular sequences... but clusters with new alleles should generally be dominated by one gene, so oh, well if true_gene == gene: print ' %s' % utils.color('green', 'ok'), else: print ' %s' % utils.color_gene(true_gene, width=15), print '' self.adjusted_glcounts[gene] += counts if debug: print ' final counts:' for gene, counts in sorted(self.adjusted_glcounts.items(), key=operator.itemgetter(1), reverse=True): print ' %4d %s' % (counts, utils.color_gene(gene))
def try_scratch_erode_insert(self, tmpline, debug=False): utils.remove_all_implicit_info(tmpline) for erosion in utils.real_erosions: # includes various contortions to avoid eroding the entire gene region = erosion[0] gene_length = len(self.glfo['seqs'][region][tmpline[region + '_gene']]) if region == 'd' and not utils.has_d_gene(self.args.locus): # dummy d genes: always erode the whole thing from the left assert gene_length == 1 and tmpline['d_gene'] == glutils.dummy_d_genes[self.args.locus] tmpline[erosion + '_del'] = 1 if '5p' in erosion else 0 else: max_erosion = max(0, gene_length/2 - 2) # heuristic if region in utils.conserved_codons[self.args.locus]: # make sure not to erode a conserved codon codon_pos = utils.cdn_pos(self.glfo, region, tmpline[region + '_gene']) if '3p' in erosion: n_bases_to_codon = gene_length - codon_pos - 3 elif '5p' in erosion: n_bases_to_codon = codon_pos max_erosion = min(max_erosion, n_bases_to_codon) tmpline[erosion + '_del'] = min(max_erosion, numpy.random.geometric(1. / utils.scratch_mean_erosion_lengths[erosion]) - 1) for bound in utils.boundaries: mean_length = utils.scratch_mean_insertion_lengths[self.args.locus][bound] length = 0 if mean_length == 0 else numpy.random.geometric(1. / mean_length) - 1 probs = [self.insertion_content_probs[bound][n] for n in utils.nukes] tmpline[bound + '_insertion'] = ''.join(numpy.random.choice(utils.nukes, size=length, p=probs)) if debug: print ' erosions: %s' % (' '.join([('%s %d' % (e, tmpline[e + '_del'])) for e in utils.real_erosions])) print ' insertions: %s' % (' '.join([('%s %s' % (b, tmpline[b + '_insertion'])) for b in utils.boundaries])) # have to add some things by hand so utils.add_implicit_info() doesn't barf (this duplicates code later on in recombinator) gl_seqs = {r : self.glfo['seqs'][r][tmpline[r + '_gene']] for r in utils.regions} for erosion in utils.real_erosions: region = erosion[0] e_length = tmpline[erosion + '_del'] if '5p' in erosion: gl_seqs[region] = gl_seqs[region][e_length:] elif '3p' in erosion: gl_seqs[region] = gl_seqs[region][:len(gl_seqs[region]) - e_length] tmpline['seqs'] = [gl_seqs['v'] + tmpline['vd_insertion'] + gl_seqs['d'] + tmpline['dj_insertion'] + gl_seqs['j'], ] tmpline['unique_ids'] = [None] # this is kind of hackey, but some things in the implicit info adder use it to get the number of sequences tmpline['input_seqs'] = copy.deepcopy(tmpline['seqs']) # NOTE has to be updated _immediately_ so seqs and input_seqs don't get out of sync tmpline['indelfos'] = [indelutils.get_empty_indel(), ] utils.add_implicit_info(self.glfo, tmpline) assert len(tmpline['in_frames']) == 1
def get_alleles(self, swfo, plotdir=None, debug=False): print 'clustering for new alleles' # NOTE do *not* modify <self.glfo> (in the future it would be nice to just modify <self.glfo>, but for now we need it to be super clear in partitiondriver what is happening to <self.glfo>) default_initial_glfo = self.glfo if self.args.default_initial_germline_dir is not None: # if this is set, we want to take any new allele names from this directory's glfo if they're in there default_initial_glfo = glutils.read_glfo(self.args.default_initial_germline_dir, self.glfo['locus']) glfo_to_modify = copy.deepcopy(default_initial_glfo) # so we can add new genes to it, so we can check for equivalency more easily TODO fix that shit, obviously else: print ' %s --default-initial-germline-dir isn\'t set, so new allele names won\'t correspond to existing names' % utils.color('yellow', 'warning') qr_seqs, threshold = self.choose_clonal_representatives(swfo, debug=debug) if qr_seqs is None: return {} # self.check_for_donuts(debug=debug) if not self.args.kmeans_allele_cluster: clusterfos, msa_info = self.vsearch_cluster_v_seqs(qr_seqs, threshold, debug=debug) else: clusterfos = self.kmeans_cluster_v_seqs(qr_seqs, swfo, plotdir=plotdir, debug=debug) msa_info = clusterfos # and finally loop over each cluster, deciding if it corresponds to a new allele if debug: print ' looping over %d clusters with %d sequences' % (len(clusterfos), sum([len(cfo['seqfos']) for cfo in clusterfos])) print ' rank seqs v/j mfreq seqs snps (%s)' % utils.color('blue', 'indels') new_alleles = {} n_existing_gene_clusters = 0 for iclust in range(len(clusterfos)): clusterfo = clusterfos[iclust] # dot_products = [utils.dot_product(clusterfo['cons_seq'], seq1, seq2) for seq1, seq2 in itertools.combinations([seqfo['seq'] for seqfo in clusterfo['seqfos']], 2)] # mean_dot_product = numpy.average(dot_products) # choose the most common existing gene to use as a template (the most similar gene might be a better choice, but deciding on "most similar" would involve adjudicating between snps and indels, and it shouldn't really matter) sorted_glcounts, true_sorted_glcounts = self.get_glcounts(clusterfo) template_gene, template_counts = sorted_glcounts[0] template_seq = self.glfo['seqs'][self.region][template_gene] template_cpos = utils.cdn_pos(self.glfo, self.region, template_gene) assert '.' not in clusterfo['cons_seq'] # make sure you haven't switched to something that doesn't use '-' for gap chars new_seq = clusterfo['cons_seq'].replace('-', '') # I'm not sure that I completely understand the dashes in this sequence, but it seems to be right to just remove 'em aligned_template_seq, aligned_new_seq = utils.align_seqs(template_seq, clusterfo['cons_seq']) has_indels = '-' in aligned_template_seq.strip('-') or '-' in aligned_new_seq.strip('-') # only counts internal indels cluster_mfreqs = {r : [self.mfreqs[r][seqfo['name']] for seqfo in clusterfo['seqfos']] for r in self.mfreqs} # regional mfreqs for each sequence in the cluster corresponding to the initially-assigned existing gene mean_cluster_mfreqs = {r : numpy.mean(cluster_mfreqs[r]) for r in cluster_mfreqs} equiv_name, equiv_seq = glutils.find_equivalent_gene_in_glfo(glfo_to_modify, new_seq, template_cpos) if equiv_name is not None: new_name = equiv_name new_seq = equiv_seq else: new_name, _ = glutils.choose_new_allele_name(template_gene, new_seq, indelfo={'indels' : ['xxx', 'xxx', 'xxx']} if has_indels else None) # the fcn just checks to see if it's non-None and of length greater than zero...TODO it would be nice to figure out actual snp and indel info if debug: self.print_cluster(iclust, clusterfo, sorted_glcounts, new_seq, true_sorted_glcounts, mean_cluster_mfreqs, has_indels) if new_name in self.glfo['seqs'][self.region]: # note that this only looks in <self.glfo>, not in <new_alleles> n_existing_gene_clusters += 1 if debug: print 'existing %s' % utils.color_gene(new_name) continue if new_name in new_alleles: # already added it NOTE might make more sense to use <glfo_to_modify> here instead of <new_alleles> (or just not have freaking both of them) if debug: print '%s (%s)' % (utils.color_gene(new_name), utils.color('red', 'new')) continue assert new_seq not in new_alleles.values() # if it's the same seq, it should've got the same damn name if not has_indels: # we assume that the presence of indels somewhat precludes false positives, which is equivalent to an assumption about the rarity of shm indels if self.too_close_to_existing_glfo_gene(clusterfo, new_seq, template_seq, template_cpos, template_gene, debug=debug): # presumably if it were really close to another (non-template) existing glfo gene, that one would've been the template continue if mean_cluster_mfreqs['j'] > 0. and self.mean_mfreqs['j'] > 0.: this_cluster_ratio = mean_cluster_mfreqs['v'] / mean_cluster_mfreqs['j'] overall_ratio = self.mean_mfreqs['v'] / self.mean_mfreqs['j'] if this_cluster_ratio / overall_ratio < self.mfreq_ratio_threshold: if debug: print 'v / j cluster mfreqs too small %6.3f / %6.3f = %6.3f < %6.3f' % (this_cluster_ratio, overall_ratio, this_cluster_ratio / overall_ratio, self.mfreq_ratio_threshold) continue if self.too_close_to_already_added_gene(new_seq, new_alleles, debug=debug): # this needs to be applied even if there are indels, since the indels are with respect to the (existing glfo) template gene, not to the [potentially] previously-added gene continue print '%s %s%s' % (utils.color('red', 'new'), utils.color_gene(new_name), ' (exists in default germline dir)' if new_name in default_initial_glfo['seqs'][self.region] else '') new_alleles[new_name] = {'template-gene' : template_gene, 'gene' : new_name, 'seq' : new_seq} if new_alleles[new_name]['gene'] not in glfo_to_modify['seqs'][self.region]: # if it's in <default_initial_glfo> it'll already be in there glutils.add_new_allele(glfo_to_modify, new_alleles[new_name]) # just so we can check for equivalency if debug: print ' %d / %d clusters consensed to existing genes' % (n_existing_gene_clusters, len(msa_info)) self.reassign_template_counts(msa_info, new_alleles, debug=False) for new_name, newfo in new_alleles.items(): # print '%s %s %.1f / %.1f = %.4f' % (new_name, newfo['template-gene'], self.adjusted_glcounts[newfo['template-gene']], float(sum(self.adjusted_glcounts.values())), self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values()))) if self.adjusted_glcounts[newfo['template-gene']] / float(sum(self.adjusted_glcounts.values())) < self.args.min_allele_prevalence_fraction: # NOTE self.adjusted_glcounts only includes large clusters, and the constituents of those clusters are clonal representatives, so this isn't quite the same as in alleleremover newfo['remove-template-gene'] = True return new_alleles
print '%s:' % utils.color('yellow', name) glfos.append(glutils.read_glfo(gldir, args.locus, debug=True)) for region in [r for r in utils.regions if r in glfos[0]['seqs']]: aset, bset = [set(g['seqs'][region]) for g in glfos] tmpfo = glutils.get_empty_glfo( args.locus) # make a new glfo that will only have non-shared genes for glabel, gset, gfo in zip( args.names, [aset - bset, bset - aset], glfos): # <gset> is the genes that're only in <glabel> for ogene in gset: glutils.add_new_allele(tmpfo, { 'gene': '+'.join([ogene, glabel]), 'seq': gfo['seqs'][region][ogene], 'cpos': utils.cdn_pos(gfo, region, ogene) }, use_template_for_codon_info=False) # eh, maybe this doesn't really add anything? # # add the nearest genes that they both have for comparison NOTE this gives one comparison gene for *each* gene, so usually you get a bunch of comparison/'both' genes in each block in the ascii output # for bgene in aset & bset: # _, nearest_gene, _ = glutils.find_nearest_gene_with_same_cpos(glfos[0], glfos[0]['seqs'][region][bgene], new_cpos=utils.cdn_pos(glfos[0], region, bgene)) # i think it doesn't matter which glfo we get it from, so arbitrarily choose the first one # glutils.add_new_allele(tmpfo, {'gene' : '+'.join([nearest_gene, 'both']), 'seq' : glfos[0]['seqs'][region][nearest_gene], 'cpos' : utils.cdn_pos(glfos[0], region, bgene)}, use_template_for_codon_info=False) print '%s: only in:\n %12s: %2d %s\n %12s: %2d %s' % ( utils.color('green', region), args.names[0], len(aset - bset), utils.color_genes(sorted(aset - bset)), args.names[1], len(bset - aset), utils.color_genes(sorted(bset - aset))) if len(tmpfo['seqs'][region]) > 0: print ' comparing to nearest genes that were in both (labeled \'both\'):'
def plot(self, plotdir, only_csv=False, only_overall=False): import plotting if not self.finalized: self.finalize() overall_plotdir = plotdir + '/overall' for gene in self.freqs: if only_overall: continue freqs = self.freqs[gene] if len(freqs) == 0: if gene not in glutils.dummy_d_genes.values(): print ' %s no mutefreqer obs for %s' % (utils.color( 'red', 'warning'), utils.color_gene(gene)) continue sorted_positions = sorted(freqs.keys()) genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='position', ytitle='mut freq', title=gene) for position in sorted_positions: hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err']) lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err']) err = 0.5 * (hi_diff + lo_diff) genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err) xline = None figsize = [7, 4] if utils.get_region(gene) in utils.conserved_codons[ self.glfo['locus']]: xline = utils.cdn_pos(self.glfo, utils.get_region(gene), gene) if utils.get_region(gene) == 'v': figsize[0] *= 3.5 elif utils.get_region(gene) == 'j': figsize[0] *= 2 plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True) # per-position plots: plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True) # # per-position, per-base plots: # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info) # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment # make mean mute freq hists for rstr in ['all', 'cdr3'] + utils.regions: if rstr == 'all': bounds = (0.0, 0.4) else: bounds = (0.0, 0.6 if rstr == 'd' else 0.4) plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr + '_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True) plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr + '_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True) if not only_csv: # write html file and fix permissiions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr)
# then add any that are only in the second one for gene, seq in glfos[1]['seqs'][region].items(): if gene not in glfos[0]['seqs'][region]: cpos = glfos[1][utils.cdn(glfos[1], region) + '-positions'][gene] if utils.cdn( glfos[1], region) is not None else None glutils.add_new_allele( tmp_glfo, { 'gene': '+'.join([gene, args.names[1]]), 'seq': seq, 'cpos': cpos }, use_template_for_codon_info=False ) # can't use template cause we might've deleted it in the first loop # then add the nearest genes that they both have for comparison for gene, seq in tmp_glfo['seqs'][region].items(): _, nearest_gene, _ = glutils.find_nearest_gene_with_same_cpos( glfos[0], seq, new_cpos=utils.cdn_pos(tmp_glfo, region, gene) ) # i think it doesn't matter which glfo we get it from, so arbitrarily choose the first one glutils.add_new_allele( tmp_glfo, { 'gene': '+'.join([nearest_gene, 'both']), 'seq': glfos[0]['seqs'][region][nearest_gene], 'template-gene': gene }) if len(tmp_glfo['seqs'][region]) > 0: print ' comparing to nearest genes that were in both (labeled \'both\'):' glutils.print_glfo(tmp_glfo, only_region=region)