class ParameterCounter(object): """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """ def __init__(self, glfo, args): self.glfo = glfo self.args = args self.mfreqer = MuteFreqer(self.glfo, exclusions=args.region_end_exclusions) self.reco_total = 0 # total number of recombination events self.mute_total = 0 # total number of sequences self.counts = {} self.counts['all'] = {} for column in utils.column_dependencies: self.counts[column] = {} self.string_columns = set([r + '_gene' for r in utils.regions]) for bound in utils.boundaries: self.counts[bound + '_insertion_content'] = { n: 0 for n in utils.nukes } # base content of each insertion self.string_columns.add(bound + '_insertion_content') self.counts['aa_cdr3_length'] = {} self.counts['non_vj_length'] = {} self.counts['seq_content'] = { n: 0 for n in utils.nukes } # now I'm adding the aa content, I wish this had nucleotide in the name, but I don't want to change it since it corresponds to a million existing file paths self.counts['cluster_size'] = {} self.init_aa_stuff() self.counts['seq_aa_content'] = {a: 0 for a in self.all_aa} self.string_columns.add('seq_content') self.string_columns.add('seq_aa_content') self.no_write_columns = [ 'aa_cdr3_length', 'non_vj_length', 'seq_aa_content' ] # don't write these to the parameter dir, since a) cdr3 length is better viewed as an output of more fundamental parameters (gene choice, insertion + deletion lengths) and b) I"m adding them waaay long after the others, and I don't want to add a new file to the established parameter directory structure. 
(I'm adding these because I want them plotted) self.columns_to_subset_by_gene = [ e + '_del' for e in utils.all_erosions ] + [b + '_insertion' for b in utils.boundaries] self.mean_columns = ['aa_cdr3_length', 'non_vj_length'] # ---------------------------------------------------------------------------------------- def init_aa_stuff(self): codons = itertools.product( utils.nukes + ['N'], repeat=3 ) # I cannot for the life of me find anything in Bio that will give me the list of amino acids, wtf, but I'm tired of googling, this will be fine self.all_aa = set([utils.ltranslate(''.join(c)) for c in codons]) # ---------------------------------------------------------------------------------------- def get_index(self, info, deps): index = [] for ic in deps: if ic[2:] == '_insertion': # insertion length index.append(len(info[ic])) else: assert 'insertion' not in ic assert 'content' not in ic index.append(info[ic]) return tuple(index) # ---------------------------------------------------------------------------------------- def increment(self, info): self.increment_per_family_params(info) for iseq in range(len(info['seqs'])): self.increment_per_sequence_params(info, iseq) # ---------------------------------------------------------------------------------------- def increment_per_sequence_params(self, info, iseq): """ increment parameters that differ for each sequence within the clonal family """ self.mute_total += 1 self.mfreqer.increment(info, iseq) for nuke in utils.nukes: self.counts['seq_content'][nuke] += info['seqs'][iseq].count(nuke) # aa seq content stuff nseq = info['seqs'][iseq] if info['v_5p_del'] > 0: nseq = info['v_5p_del'] * utils.ambig_base + nseq if len(info['fv_insertion']) > 0: nseq = nseq[len(info['fv_insertion']):] if len(nseq) % 3 != 0: nseq += utils.ambig_base * ( 3 - (len(nseq) % 3) ) # I think I could replace this with the new utils.pad_nuc_seq() aaseq = utils.ltranslate(nseq) for aa in self.all_aa: self.counts['seq_aa_content'][aa] += 
aaseq.count(aa) # ---------------------------------------------------------------------------------------- def increment_per_family_params(self, info): """ increment parameters that are the same for the entire clonal family """ def sub_increment(column, index): if index not in self.counts[column]: self.counts[column][index] = 0 self.counts[column][index] += 1 self.reco_total += 1 all_index = self.get_index( info, tuple(list(utils.index_columns) + [ 'cdr3_length', ]) ) # NOTE this cdr3_length is for getting a unique index for the rearrangement event parameters, and is thus unrelated to the key aa_cdr3_length for plotting if all_index not in self.counts['all']: self.counts['all'][all_index] = 0 self.counts['all'][all_index] += 1 for deps in utils.column_dependency_tuples: column = deps[0] index = self.get_index(info, deps) sub_increment(column, index) # have to be done separately, since they're not index columns (and we don't want them to be, since they're better viewed as derivative -- see note in self.write()) sub_increment( 'aa_cdr3_length', (info['cdr3_length'] / 3, ) ) # oh, jeez, this has to be a tuple to match the index columns, that's ugly sub_increment('non_vj_length', (utils.get_non_vj_len(info), )) sub_increment('cluster_size', (len(info['unique_ids']), )) for bound in utils.boundaries: for nuke in info[bound + '_insertion']: if nuke == utils.ambig_base: continue self.counts[bound + '_insertion_content'][nuke] += 1 # ---------------------------------------------------------------------------------------- def clean_plots(self, plotdir): self.mfreqer.clean_plots(plotdir + '/mute-freqs') utils.prep_dir(plotdir + '/overall', wildlings=('*.csv', '*.svg')) for column in self.counts: if column in self.columns_to_subset_by_gene: thisplotdir = plotdir + '/' + column utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg']) # ---------------------------------------------------------------------------------------- def plot( self, plotdir, only_csv=False, 
only_overall=False, make_per_base_plots=False ): # NOTE most of the time in here is taken up by mutefrequer.finalize() (if it write() wasn't called first, that is) import plotting print ' plotting parameters in %s' % plotdir, sys.stdout.flush() start = time.time() self.clean_plots(plotdir) self.mfreqer.plot(plotdir + '/mute-freqs', only_csv=only_csv, only_overall=only_overall, make_per_base_plots=make_per_base_plots) overall_plotdir = plotdir + '/overall' for column in self.counts: if column == 'all': continue values, gene_values = {}, {} for index, count in self.counts[column].iteritems(): column_val = index[0] if column_val not in values: values[column_val] = 0.0 values[column_val] += count if column in self.columns_to_subset_by_gene: gene = index[ 1] # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it utils.split_gene(gene) # checks validity of gene if gene not in gene_values: gene_values[gene] = {} if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count var_type = 'string' if column in self.string_columns else 'int' hist = hutils.make_hist_from_dict_of_counts( values, var_type, column) plotting.draw_no_root( hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv, stats='mean' if column in self.mean_columns else None, normalize=True) if column in self.columns_to_subset_by_gene and not only_overall: thisplotdir = plotdir + '/' + column for gene in gene_values: plotname = utils.sanitize_name(gene) + '-' + column hist = hutils.make_hist_from_dict_of_counts( gene_values[gene], var_type, plotname) plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get( column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv) if not 
only_csv: plotting.make_html(thisplotdir) if not only_csv: plotting.make_html(overall_plotdir) print '(%.1f sec)' % (time.time() - start) # ---------------------------------------------------------------------------------------- def write( self, base_outdir ): # NOTE most of the time in here is taken up by mutefrequer.finalize() (if it plot() wasn't called first, that is) print ' writing parameters to %s' % base_outdir, sys.stdout.flush() start = time.time() if os.path.exists(base_outdir + '/' + glutils.glfo_dir): for tmploc in [ l for l in utils.loci if os.path.exists(base_outdir + '/' + glutils.glfo_dir + '/' + l) ]: glutils.remove_glfo_files(base_outdir + '/' + glutils.glfo_dir, tmploc, print_warning=False) utils.prep_dir( base_outdir, subdirs=('hmms', 'mute-freqs', glutils.glfo_dir), wildlings=('*.csv', '*.yaml', '*.fasta') ) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write( base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv' ) # REGION is replace by each region in the three output files) genes_with_counts = [ g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys() ] glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False) for column in self.counts: index = None outfname = None if column in self.no_write_columns: continue elif column == 'all': index = tuple(list(utils.index_columns) + [ 'cdr3_length', ]) outfname = base_outdir + '/' + utils.get_parameter_fname( column='all') elif '_content' in column or column == 'cluster_size': index = [ column, ] outfname = base_outdir + '/' + column + '.csv' else: index = [ column, ] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname( column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): 
os.makedirs(base_outdir) with open(outfname, 'w') as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print '(%.1f sec)' % (time.time() - start)
class AlleleFinder(object): def __init__(self, glfo, args): self.glfo = glfo self.args = args self.new_allele_info = [] self.small_number = 1e-5 self.n_max_mutations_per_segment = 20 # don't look at sequences whose v segments have more than this many mutations self.n_max_snps = self.n_max_mutations_per_segment - 9 # try excluding up to this many bins (on the left) when doing the fit (leaves at least 9 points for fit) self.max_fit_length = 10 # don't fit more than this many bins for each istart (the first few positions in the fit are the most important, and if we fit too far to the right these important positions get diluted) self.n_muted_min = 15 # don't fit positions that have fewer mutations than this self.n_total_min = 15 # ...or fewer total observations than this self.n_five_prime_positions_to_exclude = 5 # skip positions that are too close to the 5' end of V (misassigned insertions look like snps) self.min_non_candidate_positions_to_fit = 10 # always fit at least a few non-candidate positions self.min_y_intercept = 0.15 # corresponds, roughly, to the expression level of the least common allele to which we have sensitivity self.default_slope_bounds = (-0.2, 0.2) # fitting function needs some reasonable bounds from which to start self.big_y_icpt_bounds = (self.min_y_intercept, 1.5) # snp-candidate positions should fit well when forced to use these bounds, but non-snp positions should fit like &*@!* # self.min_score = 2 # (mean ratio over snp candidates) - (first non-candidate ratio) must be greater than this self.min_min_candidate_ratio = 2.25 # every candidate ratio must be greater than this # self.max_non_candidate_ratio = 2. # first non-candidate has to be smaller than this self.min_snp_big_icpt_residual = 2. 
# snp candidates must have a better (smaller residual) big-intercept fit than this self.fitted_positions = {} # positions that, for any <istart>, we have fit info self.mfreqer = MuteFreqer(glfo) self.gene_obs_counts = {} # only used for allele-finding self.counts = {} self.plotvals = {} self.finalized = False # ---------------------------------------------------------------------------------------- def increment(self, info): self.mfreqer.increment(info) for region in utils.regions: regional_freq, len_excluding_ambig = utils.get_mutation_rate(info, restrict_to_region=region, return_len_excluding_ambig=True) n_mutes = regional_freq * len_excluding_ambig # total number of mutations in the region (for allele finding stuff) if abs(n_mutes - int(n_mutes)) > 1e6: raise Exception('n mutated %f not an integer' % n_mutes) n_mutes = int(n_mutes) gene = info[region + '_gene'] if gene not in self.counts: self.counts[gene] = {} self.gene_obs_counts[gene] = 0 self.gene_obs_counts[gene] += 1 gcts = self.counts[gene] # shorthand name germline_seq = info[region + '_gl_seq'] query_seq = info[region + '_qr_seq'] assert len(germline_seq) == len(query_seq) for ipos in range(len(germline_seq)): igl = ipos + int(info[region + '_5p_del']) # account for left-side deletions in the indexing if germline_seq[ipos] in utils.ambiguous_bases or query_seq[ipos] in utils.ambiguous_bases: # skip if either germline or query sequence is ambiguous at this position continue if igl not in gcts: # if we have not yet observed this position in a query sequence, initialize it gcts[igl] = {} if igl not in gcts: gcts[igl] = {} if utils.get_region(gene) == 'v': if n_mutes not in gcts[igl]: gcts[igl][n_mutes] = {n : 0 for n in ['muted', 'total'] + utils.nukes} gcts[igl][n_mutes]['total'] += 1 if query_seq[ipos] != germline_seq[ipos]: # if this position is mutated gcts[igl][n_mutes]['muted'] += 1 # mark that we saw this germline position mutated once in a sequence with <n_mutes> regional mutation frequency 
gcts[igl][n_mutes][query_seq[ipos]] += 1 # only used to work out what the snp'd base is if there's a new allele # ---------------------------------------------------------------------------------------- def get_residual_sum(self, xvals, yvals, errs, slope, intercept): def expected(x): return slope * x + intercept residual_sum = sum([(y - expected(x))**2 / err**2 for x, y, err in zip(xvals, yvals, errs)]) return residual_sum # ---------------------------------------------------------------------------------------- def get_curvefit(self, n_mutelist, freqs, errs, y_icpt_bounds=None): def func(x, slope, y_icpt): return slope*x + y_icpt bounds = (-float('inf'), float('inf')) if y_icpt_bounds is not None: bounds = [[s, y] for s, y in zip(self.default_slope_bounds, y_icpt_bounds)] params, cov = scipy.optimize.curve_fit(func, n_mutelist, freqs, sigma=errs, bounds=bounds) slope, slope_err = params[0], math.sqrt(cov[0][0]) y_icpt, y_icpt_err = params[1], math.sqrt(cov[1][1]) residual_sum = self.get_residual_sum(n_mutelist, freqs, errs, slope, y_icpt) ndof = len(n_mutelist) - 1 fitfo = { 'slope' : slope, 'y_icpt' : y_icpt, 'slope_err' : slope_err, 'y_icpt_err' : y_icpt_err, 'residuals_over_ndof' : float(residual_sum) / ndof, 'print_str' : ' %9.3f +/- %-9.3f %7.4f +/- %7.4f %7.4f' % (y_icpt, y_icpt_err, slope, slope_err, float(residual_sum) / ndof), 'n_mutelist' : n_mutelist, 'freqs' : freqs, 'errs' : errs } return fitfo # ---------------------------------------------------------------------------------------- def get_allele_finding_xyvals(self, gene, position): gpcounts = self.counts[gene][position] iterinfo = gpcounts.items() obs = [d['muted'] for nm, d in iterinfo if nm < self.n_max_mutations_per_segment] lohis = [fraction_uncertainty.err(d['muted'], d['total'], use_beta=True) for nm, d in iterinfo if nm < self.n_max_mutations_per_segment] errs = [(hi - lo) / 2 for lo, hi, _ in lohis] weights = [1./(e*e) for e in errs] freqs = [float(d['muted']) / d['total'] if d['total'] > 
0 else 0. for nm, d in iterinfo if nm < self.n_max_mutations_per_segment] total = [d['total'] for nm, d in iterinfo if nm < self.n_max_mutations_per_segment] n_mutelist = [nm for nm in gpcounts.keys() if nm < self.n_max_mutations_per_segment] return {'obs' : obs, 'total' : total, 'n_mutelist' : n_mutelist, 'freqs' : freqs, 'errs' : errs, 'weights' : weights} # ---------------------------------------------------------------------------------------- def is_a_candidate(self, gene, fitfo, istart, debug=False): # NOTE I've tried adding a requirement on the actual value of the big-icpt (or, equivalently, small-icpt) fit, but it seems to be better to just use the ratio (probably because stuff is correlated) if fitfo['min_snp_ratios'][istart] < self.min_min_candidate_ratio: # worst snp candidate has to be pretty good on its own if debug: print ' min snp ratio %s too small (less than %s)' % (fstr(fitfo['min_snp_ratios'][istart]), fstr(self.min_min_candidate_ratio)) return False for candidate_pos in fitfo['candidates'][istart]: # return false if any of the candidate positions don't have enough counts with <istart> mutations (probably a homozygous new allele with more than <istart> snps) if istart not in self.counts[gene][candidate_pos] or self.counts[gene][candidate_pos][istart]['total'] < self.n_total_min: if debug: print ' not enough counts at this position with %d mutations (%s < %s)' % (istart, fstr(self.counts[gene][candidate_pos][istart]['total']), fstr(self.n_total_min)) return False if debug: print ' candidate' return True # ---------------------------------------------------------------------------------------- def get_positions_to_fit(self, gene, gene_results, debug=False): self.fitted_positions[gene] = set() positions = sorted(self.mfreqer.counts[gene].keys()) xyvals = {pos : self.get_allele_finding_xyvals(gene, pos) for pos in positions} positions_to_try_to_fit = [pos for pos in positions if sum(xyvals[pos]['obs']) > self.n_muted_min or sum(xyvals[pos]['total']) 
> self.n_total_min] # ignore positions with neither enough mutations or total observations if len(positions_to_try_to_fit) < self.n_max_snps - 1 + self.min_non_candidate_positions_to_fit: gene_results['not_enough_obs_to_fit'].add(gene) if debug: print ' not enough positions with enough observations to fit %s' % utils.color_gene(gene) return None, None if debug and len(positions) > len(positions_to_try_to_fit): print ' skipping %d / %d positions (with fewer than %d mutations and %d observations)' % (len(positions) - len(positions_to_try_to_fit), len(positions), self.n_muted_min, self.n_total_min) self.plotvals[gene] = {} for pos in positions_to_try_to_fit: self.plotvals[gene][pos] = xyvals[pos] return positions_to_try_to_fit, xyvals # ---------------------------------------------------------------------------------------- def fit_istart(self, gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=False): residuals = {} for pos in positions_to_try_to_fit: # skip positions that are too close to the 5' end of V (misassigned insertions look like snps) if pos > len(self.glfo['seqs'][utils.get_region(gene)][gene]) - self.n_five_prime_positions_to_exclude - 1: continue # as long as we already have a few non-candidate positions, skip positions that have no frequencies greater than the min y intercept (note that they could in principle still have a large y intercept, but we don't really care) if len(residuals) > istart + self.min_non_candidate_positions_to_fit and len([f for f in subxyvals[pos]['freqs'] if f > self.min_y_intercept]) == 0: continue if sum(subxyvals[pos]['total']) < self.n_total_min: continue # also skip positions that only have a few points to fit (i.e. genes that were very rare, or I guess maybe if they were always eroded past this position) if len(subxyvals[pos]['n_mutelist']) < 3: continue zero_icpt_fit = self.get_curvefit(subxyvals[pos]['n_mutelist'], subxyvals[pos]['freqs'], subxyvals[pos]['errs'], y_icpt_bounds=(0. - self.small_number, 0. 
+ self.small_number)) big_icpt_fit = self.get_curvefit(subxyvals[pos]['n_mutelist'], subxyvals[pos]['freqs'], subxyvals[pos]['errs'], y_icpt_bounds=self.big_y_icpt_bounds) residuals[pos] = {'zero_icpt' : zero_icpt_fit['residuals_over_ndof'], 'big_icpt' : big_icpt_fit['residuals_over_ndof']} self.fitted_positions[gene].add(pos) # if we already did the fit for another <istart>, it'll already be in there if len(residuals) <= istart: # needs to be at least one longer, so we have the first-non-snp if debug: print ' not enough observations to fit more than %d snps' % (istart - 1) return residual_ratios = {pos : float('inf') if r['big_icpt'] == 0. else r['zero_icpt'] / r['big_icpt'] for pos, r in residuals.items()} sorted_ratios = sorted(residual_ratios.items(), key=operator.itemgetter(1), reverse=True) # sort the positions in decreasing order of residual ratio candidate_snps = [pos for pos, _ in sorted_ratios[:istart]] # the first <istart> positions are the "candidate snps" max_non_snp, max_non_snp_ratio = sorted_ratios[istart] # position and ratio for largest non-candidate min_candidate_ratio = min([residual_ratios[cs] for cs in candidate_snps]) # fitfo['scores'][istart] = (min_candidate_ratio - max_non_snp_ratio) / max(self.small_number, max_non_snp_ratio) fitfo['min_snp_ratios'][istart] = min([residual_ratios[cs] for cs in candidate_snps]) fitfo['candidates'][istart] = {cp : residual_ratios[cp] for cp in candidate_snps} if debug: # if debug > 1: # print '%70s %s' % ('', ''.join(['%11d' % nm for nm in subxyvals[max_non_snp]['n_mutelist']])) for pos in candidate_snps + [max_non_snp, ]: xtrastrs = ('[', ']') if pos == max_non_snp else (' ', ' ') pos_str = '%3s' % str(pos) if residual_ratios[pos] > self.min_min_candidate_ratio: pos_str = utils.color('yellow', pos_str) print ' %s %s %5s (%5s / %-5s) %4d / %-4d %s' % (xtrastrs[0], pos_str, fstr(residual_ratios[pos]), fstr(residuals[pos]['zero_icpt']), fstr(residuals[pos]['big_icpt']), sum(subxyvals[pos]['obs']), 
sum(subxyvals[pos]['total']), xtrastrs[1]), # if debug > 1: # print ' ', ''.join(['%4d / %-4d' % (subxyvals[pos]['obs'][inm], subxyvals[pos]['total'][inm]) for inm in range(len(subxyvals[pos]['n_mutelist']))]) print '' # ---------------------------------------------------------------------------------------- def add_new_allele(self, gene, fitfo, n_candidate_snps, debug=False): # figure out what the new nukes are old_seq = self.glfo['seqs'][utils.get_region(gene)][gene] new_seq = old_seq mutfo = {} for pos in sorted(fitfo['candidates'][n_candidate_snps]): obs_counts = {nuke : self.counts[gene][pos][n_candidate_snps][nuke] for nuke in utils.nukes} # NOTE it's super important to only use the counts from sequences with <n_candidate_snps> total mutations sorted_obs_counts = sorted(obs_counts.items(), key=operator.itemgetter(1), reverse=True) original_nuke = self.mfreqer.counts[gene][pos]['gl_nuke'] new_nuke = None for nuke, _ in sorted_obs_counts: # take the most common one that isn't the existing gl nuke if nuke != original_nuke: new_nuke = nuke break print ' %3d (%s --> %s)' % (pos, original_nuke, new_nuke), assert old_seq[pos] == original_nuke mutfo[pos] = {'original' : original_nuke, 'new' : new_nuke} new_seq = new_seq[:pos] + new_nuke + new_seq[pos+1:] new_name, mutfo = glutils.get_new_allele_name_and_change_mutfo(gene, mutfo) print '' print ' %s %s' % (old_seq, utils.color_gene(gene)) print ' %s %s' % (utils.color_mutants(old_seq, new_seq), utils.color_gene(new_name)) # and add it to the set of new alleles for this gene self.new_allele_info.append({ 'template-gene' : gene, 'gene' : new_name, 'seq' : new_seq, 'aligned-seq' : None }) # ---------------------------------------------------------------------------------------- def finalize(self, debug=False): assert not self.finalized self.mfreqer.finalize() start = time.time() gene_results = {'not_enough_obs_to_fit' : set(), 'didnt_find_anything_with_fit' : set(), 'new_allele' : set()} if debug: print '\nlooking for 
new alleles:' for gene in sorted(self.mfreqer.counts): if utils.get_region(gene) != 'v': continue if debug: print '\n%s (observed %d %s)' % (utils.color_gene(gene), self.gene_obs_counts[gene], utils.plural_str('time', self.gene_obs_counts[gene])) positions_to_try_to_fit, xyvals = self.get_positions_to_fit(gene, gene_results, debug=debug) if positions_to_try_to_fit is None: continue fitfo = {n : {} for n in ('min_snp_ratios', 'candidates')} for istart in range(1, self.n_max_snps): if debug: if istart == 1: print ' resid. / ndof' print ' position ratio (m=0 / m>%5.2f) muted / obs ' % self.big_y_icpt_bounds[0] print ' %d %s' % (istart, utils.plural_str('snp', istart)) subxyvals = {pos : {k : v[istart : istart + self.max_fit_length] for k, v in xyvals[pos].items()} for pos in positions_to_try_to_fit} self.fit_istart(gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=debug) if istart not in fitfo['candidates']: # if it didn't get filled, we didn't have enough observations to do the fit break istart_candidates = [] if debug: print ' evaluating each snp hypothesis' print ' snps min ratio' for istart in fitfo['candidates']: if debug: print ' %2d %9s' % (istart, fstr(fitfo['min_snp_ratios'][istart])), if self.is_a_candidate(gene, fitfo, istart, debug=debug): istart_candidates.append(istart) if len(istart_candidates) > 0: n_candidate_snps = min(istart_candidates) # add the candidate with the smallest number of snps to the germline set, and run again (if the firs gene_results['new_allele'].add(gene) print '\n found a new allele candidate separated from %s by %d %s at %s:' % (utils.color_gene(gene), n_candidate_snps, utils.plural_str('snp', n_candidate_snps), utils.plural_str('position', n_candidate_snps)), self.add_new_allele(gene, fitfo, n_candidate_snps, debug=debug) else: gene_results['didnt_find_anything_with_fit'].add(gene) if debug: print ' no new alleles' if debug: print 'found new alleles for %d %s (there were also %d without new alleles, and %d without 
enough observations to fit)' % (len(gene_results['new_allele']), utils.plural_str('gene', len(gene_results['new_allele'])), len(gene_results['didnt_find_anything_with_fit']), len(gene_results['not_enough_obs_to_fit'])) print ' allele finding time: %.1f' % (time.time()-start) self.finalized = True # ---------------------------------------------------------------------------------------- def plot(self, base_plotdir, only_csv=False): if not self.finalized: self.finalize(debug=debug) plotdir = base_plotdir + '/allele-finding' for old_gene_dir in glob.glob(plotdir + '/*'): # has to be a bit more hackey than elsewhere, since we have no way of knowing what genes might have had their own directories written last time we wrote to this dir if not os.path.isdir(old_gene_dir): raise Exception('not a directory: %s' % old_gene_dir) utils.prep_dir(old_gene_dir, wildlings=('*.csv', '*.svg')) os.rmdir(old_gene_dir) utils.prep_dir(plotdir, wildlings=('*.csv', '*.svg')) if only_csv: # not implemented return start = time.time() for gene in self.plotvals: if utils.get_region(gene) != 'v': continue for position in self.plotvals[gene]: if position not in self.fitted_positions[gene]: # we can make plots for the positions we didn't fit, but there's a *lot* of them and they're slow continue # if 'allele-finding' not in self.TMPxyvals[gene][position] or self.TMPxyvals[gene][position]['allele-finding'] is None: # continue plotting.make_allele_finding_plot(plotdir + '/' + utils.sanitize_name(gene), gene, position, self.plotvals[gene][position]) print ' allele finding plot time: %.1f' % (time.time()-start)
class ParameterCounter(object): """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """ def __init__(self, glfo, args): self.glfo = glfo self.args = args self.mfreqer = MuteFreqer(self.glfo) self.reco_total = 0 # total number of recombination events self.mute_total = 0 # total number of sequences self.counts = {} self.counts['all'] = {} for column in utils.column_dependencies: self.counts[column] = {} for bound in utils.boundaries: self.counts[bound + '_insertion_content'] = {n : 0 for n in utils.nukes} # base content of each insertion self.counts['seq_content'] = {n : 0 for n in utils.nukes} # ---------------------------------------------------------------------------------------- def get_index(self, info, deps): index = [] for ic in deps: if ic[2:] == '_insertion': # insertion length index.append(len(info[ic])) else: assert 'insertion' not in ic assert 'content' not in ic index.append(info[ic]) return tuple(index) # ---------------------------------------------------------------------------------------- def increment_all_params(self, info): self.increment_per_sequence_params(info) self.increment_per_family_params(info) # ---------------------------------------------------------------------------------------- def increment_per_sequence_params(self, info): """ increment parameters that differ for each sequence within the clonal family """ self.mute_total += 1 self.mfreqer.increment(info) seq = info['seq'] for nuke in seq: if nuke in utils.ambiguous_bases: continue self.counts['seq_content'][nuke] += 1 # ---------------------------------------------------------------------------------------- def increment_per_family_params(self, info): """ increment parameters that are the same for the entire clonal family """ self.reco_total += 1 all_index = self.get_index(info, tuple(list(utils.index_columns) + ['cdr3_length', ])) if all_index not in self.counts['all']: 
self.counts['all'][all_index] = 0 self.counts['all'][all_index] += 1 for deps in utils.column_dependency_tuples: column = deps[0] index = self.get_index(info, deps) if index not in self.counts[column]: self.counts[column][index] = 0 self.counts[column][index] += 1 for bound in utils.boundaries: for nuke in info[bound + '_insertion']: if nuke in utils.ambiguous_bases: continue self.counts[bound + '_insertion_content'][nuke] += 1 # ---------------------------------------------------------------------------------------- def clean_plots(self, plotdir, subset_by_gene): self.mfreqer.clean_plots(plotdir + '/mute-freqs') utils.prep_dir(plotdir + '/overall') #, multilings=('*.csv', '*.svg')) for column in self.counts: if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene thisplotdir = plotdir + '/' + column utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg']) # ---------------------------------------------------------------------------------------- def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None, only_csv=False): print ' plotting parameters', sys.stdout.flush() start = time.time() self.clean_plots(plotdir, subset_by_gene) self.mfreqer.plot(plotdir + '/mute-freqs', cyst_positions, tryp_positions, only_csv=only_csv) #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files overall_plotdir = plotdir + '/overall' for column in self.counts: if column == 'all': continue values, gene_values = {}, {} if len(self.counts[column]) == 0: raise Exception('no counts in %s' % column) for index, count in self.counts[column].iteritems(): gene = None if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene if '_del' in column: region = column[0] else: region = column[1] 
assert region in utils.regions assert 'IGH' + region.upper() in index[1] # NOTE this is hackey, but it works find now and will fail obviously gene = index[1] # if I ever change the correlations to be incompatible. so screw it if gene not in gene_values: gene_values[gene] = {} column_val = index[0] if gene is not None: if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count if column_val not in values: values[column_val] = 0.0 values[column_val] += count try: # figure out whether this is an integer or string (only used outside this loop when we make the plots) int(column_val) var_type = 'int' except: var_type = 'string' if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene thisplotdir = plotdir + '/' + column for gene in gene_values: plotname = utils.sanitize_name(gene) + '-' + column hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True) plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True, only_csv=only_csv) if not only_csv: plotting.make_html(thisplotdir) plotname = column hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True) plotting.draw_no_root(hist, plotname=plotname, plotdir=overall_plotdir, errors=True, write_csv=True, only_csv=only_csv) if not only_csv: plotting.make_html(overall_plotdir) print '(%.1f sec)' % (time.time()-start) # ---------------------------------------------------------------------------------------- def write(self, base_outdir, my_datadir=None): print ' writing parameters', sys.stdout.flush() start = time.time() utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta')) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, 
so it's pretty much necessary self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files) genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()] glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=True) for column in self.counts: index = None outfname = None if column == 'all': index = tuple(list(utils.index_columns) + ['cdr3_length', ]) outfname = base_outdir + '/' + utils.get_parameter_fname(column='all') elif '_content' in column: index = [column,] outfname = base_outdir + '/' + column + '.csv' else: index = [column,] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with opener('w')(outfname) as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print '(%.1f sec)' % (time.time()-start)
class ParameterCounter(object):
    """
    Keep track of how many times we've seen each gene version, erosion length,
    insertion (length and base content), and mutation.

    Everything is stored in self.counts, a dict keyed by column name:
      - 'all': maps the tuple returned by get_index() for utils.index_columns to a count
      - each column in utils.column_dependencies: maps the get_index() tuple for that
        column's entry in utils.column_dependency_tuples to a count
      - '<bound>_insertion_content' and 'seq_content': map each of A, C, G, T to a count
    """
    def __init__(self, germline_seqs):
        self.total = 0  # number of times increment() has been called
        self.counts = {}
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}  # base content of each insertion
        self.counts['seq_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
        self.mutefreqer = MuteFreqer(germline_seqs)

    # ----------------------------------------------------------------------------------------
    def _index_and_fname(self, base_outdir, column):
        """ Return (csv header columns, output file path) for <column>, so that clean() and write() always agree on file names. """
        if column == 'all':
            return utils.index_columns, base_outdir + '/' + utils.get_parameter_fname(column='all')
        if '_content' in column:  # base content columns have no dependencies, and get their own simple file name
            return [column, ], base_outdir + '/' + column + '.csv'
        index = [column, ] + utils.column_dependencies[column]
        return index, base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)

    # ----------------------------------------------------------------------------------------
    def clean(self, base_outdir=None):
        """
        Remove all the parameter files, i.e. the per-column csv files that write() creates.

        base_outdir: directory the files were written to. If None, self.base_outdir is used;
            note that this class never sets that attribute itself, so in that case the caller
            must have assigned it (otherwise this raises AttributeError, as it always did).
        """
        self.mutefreqer.clean()
        if base_outdir is None:
            base_outdir = self.base_outdir
        for column in self.counts:
            os.remove(self._index_and_fname(base_outdir, column)[1])

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """
        Return the tuple of values in <info> for the columns in <deps>, for use as a key in self.counts.
        Columns named '??_insertion' contribute the *length* of the insertion rather than the insertion itself.
        """
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        """ Add the single annotation <info> to all of the counters (including the mutation frequency counter). """
        self.total += 1

        all_index = self.get_index(info, utils.index_columns)
        if all_index not in self.counts['all']:
            self.counts['all'][all_index] = 0
        self.counts['all'][all_index] += 1

        for deps in utils.column_dependency_tuples:
            column = deps[0]  # first entry is the column itself, the rest are the columns it depends on
            index = self.get_index(info, deps)
            if index not in self.counts[column]:
                self.counts[column][index] = 0
            self.counts[column][index] += 1

        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                self.counts[bound + '_insertion_content'][nuke] += 1
        for nuke in info['seq']:
            self.counts['seq_content'][nuke] += 1

        self.mutefreqer.increment(info)

    # ----------------------------------------------------------------------------------------
    def __str__(self):
        # NOTE(review): this looks up utils.column_dependencies[column] for every key of self.counts, so it
        # presumably raises KeyError for 'all' and the '*_content' columns (which is what the printed
        # message below is alluding to) -- confirm against utils before relying on str() of this object
        return_str = []
        print('hm I think I was too lazy to put \'all\' in this string')
        print(' or [vdj]_insertion_content or seq_content')
        for column in self.counts:
            return_str.append('%s\n' % column)
            return_str.append('%20s' % column)
            for dep in utils.column_dependencies[column]:
                return_str.append('%20s' % dep)
            return_str.append('\n')
            for index, count in self.counts[column].items():
                for val in index:
                    return_str.append('%20s' % str(val))
                return_str.append(' %d / %d = %f\n' % (count, self.total, float(count) / self.total))
        return ''.join(return_str)

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
        """
        Make a histogram (and csv) in <plotdir> for each column in self.counts except 'all', then plot mutation frequencies.

        subset_by_gene: if set, deletion and (real) insertion columns are also plotted separately for each gene,
            in the subdirectory <plotdir>/<column>
        cyst_positions, tryp_positions: passed straight through to the mutation frequency plotter
        Raises AssertionError if any column has no counts.
        """
        print(' plotting parameters')
        start = time.time()
        utils.prep_dir(plotdir + '/plots')
        for column in self.counts:
            if column == 'all':
                continue
            values, gene_values = {}, {}
            if len(self.counts[column]) == 0:
                print('ERROR no counts in %s' % column)
                raise AssertionError('no counts in %s' % column)  # explicit raise, so that the check survives python -O (var_type would be unbound below)
            # option to subset deletion and (real) insertion plots by gene
            subset_this_column = subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion')
            for index, count in self.counts[column].items():
                gene = None
                if subset_this_column:
                    if '_del' in column:
                        region = column[0]
                    else:
                        region = column[1]
                    assert region in utils.regions
                    # NOTE this is hackey, but it works fine now and will fail obviously if I ever change the correlations to be incompatible
                    assert 'IGH' + region.upper() in index[1]
                    gene = index[1]
                    if gene not in gene_values:
                        gene_values[gene] = {}

                column_val = index[0]
                if gene is not None:
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                try:
                    int(column_val)
                    var_type = 'int'
                except (TypeError, ValueError):  # only the failures int() itself signals, so e.g. KeyboardInterrupt still propagates
                    var_type = 'string'

            if subset_this_column:
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
                check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
                check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)

        self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)

        if has_root:  # NOTE(review): not defined in this class, presumably a module-level flag -- confirm
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up

        print(' parameter plot time: %.3f' % (time.time() - start))

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):
        """
        Write the mutation frequencies, and then one csv file for each column in self.counts, to <base_outdir>.
        Each csv has one row for each key in that column's counts, with a final 'count' column.
        """
        print(' writing parameters')
        start = time.time()

        utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))
        mute_start = time.time()
        self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files
        print(' mut freq write time: %.3f' % (time.time() - mute_start))
        for column in self.counts:
            index, outfname = self._index_and_fname(base_outdir, column)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].items():
                    line = {}
                    for ic, val in enumerate(key):  # for the base content columns <key> is a single-character string
                        line[index[ic]] = val
                    line['count'] = count
                    out_data.writerow(line)

        print(' parameter write time: %.3f' % (time.time() - start))
class ParameterCounter(object): """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """ def __init__(self, glfo, args): self.glfo = glfo self.args = args self.mfreqer = MuteFreqer(self.glfo, exclusions=args.region_end_exclusions) self.reco_total = 0 # total number of recombination events self.mute_total = 0 # total number of sequences self.counts = {} self.counts['all'] = {} for column in utils.column_dependencies: self.counts[column] = {} self.string_columns = set([r + '_gene' for r in utils.regions]) for bound in utils.boundaries: self.counts[bound + '_insertion_content'] = { n: 0 for n in utils.nukes } # base content of each insertion self.string_columns.add(bound + '_insertion_content') self.counts['seq_content'] = {n: 0 for n in utils.nukes} self.string_columns.add('seq_content') self.columns_to_subset_by_gene = [ e + '_del' for e in utils.all_erosions ] + [b + '_insertion' for b in utils.boundaries] # ---------------------------------------------------------------------------------------- def get_index(self, info, deps): index = [] for ic in deps: if ic[2:] == '_insertion': # insertion length index.append(len(info[ic])) else: assert 'insertion' not in ic assert 'content' not in ic index.append(info[ic]) return tuple(index) # ---------------------------------------------------------------------------------------- def increment(self, info): self.increment_per_family_params(info) for iseq in range(len(info['seqs'])): self.increment_per_sequence_params(info, iseq) # ---------------------------------------------------------------------------------------- def increment_per_sequence_params(self, info, iseq): """ increment parameters that differ for each sequence within the clonal family """ self.mute_total += 1 self.mfreqer.increment(info, iseq) for nuke in info['seqs'][iseq]: if nuke in utils.ambiguous_bases: continue self.counts['seq_content'][nuke] += 1 # 
---------------------------------------------------------------------------------------- def increment_per_family_params(self, info): """ increment parameters that are the same for the entire clonal family """ self.reco_total += 1 all_index = self.get_index( info, tuple(list(utils.index_columns) + [ 'cdr3_length', ])) if all_index not in self.counts['all']: self.counts['all'][all_index] = 0 self.counts['all'][all_index] += 1 for deps in utils.column_dependency_tuples: column = deps[0] index = self.get_index(info, deps) if index not in self.counts[column]: self.counts[column][index] = 0 self.counts[column][index] += 1 for bound in utils.boundaries: for nuke in info[bound + '_insertion']: if nuke in utils.ambiguous_bases: continue self.counts[bound + '_insertion_content'][nuke] += 1 # ---------------------------------------------------------------------------------------- def clean_plots(self, plotdir): self.mfreqer.clean_plots(plotdir + '/mute-freqs') utils.prep_dir(plotdir + '/overall', wildlings=('*.csv', '*.svg')) for column in self.counts: if column in self.columns_to_subset_by_gene: thisplotdir = plotdir + '/' + column utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg']) # ---------------------------------------------------------------------------------------- def plot(self, plotdir, only_csv=False, only_overall=False): import plotting print ' plotting parameters', sys.stdout.flush() start = time.time() self.clean_plots(plotdir) self.mfreqer.plot(plotdir + '/mute-freqs', only_csv=only_csv, only_overall=only_overall) overall_plotdir = plotdir + '/overall' for column in self.counts: if column == 'all': continue values, gene_values = {}, {} for index, count in self.counts[column].iteritems(): column_val = index[0] if column_val not in values: values[column_val] = 0.0 values[column_val] += count if column in self.columns_to_subset_by_gene: gene = index[ 1] # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to 
be incompatible. so screw it utils.split_gene(gene) # checks validity of gene if gene not in gene_values: gene_values[gene] = {} if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count var_type = 'string' if column in self.string_columns else 'int' hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True) plotting.draw_no_root( hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv) if column in self.columns_to_subset_by_gene and not only_overall: thisplotdir = plotdir + '/' + column for gene in gene_values: plotname = utils.sanitize_name(gene) + '-' + column hist = plotting.make_hist_from_dict_of_counts( gene_values[gene], var_type, plotname, sort=True) plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get( column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv) if not only_csv: plotting.make_html(thisplotdir) if not only_csv: plotting.make_html(overall_plotdir) print '(%.1f sec)' % (time.time() - start) # ---------------------------------------------------------------------------------------- def write(self, base_outdir): print ' writing parameters', sys.stdout.flush() start = time.time() if os.path.exists(base_outdir + '/' + glutils.glfo_dir): glutils.remove_glfo_files( base_outdir + '/' + glutils.glfo_dir, self.glfo['locus'] ) # NOTE I think this will fail if I ever start having multiple loci in one dir utils.prep_dir( base_outdir, subdirs=('hmms', 'mute-freqs', glutils.glfo_dir), wildlings=('*.csv', '*.yaml', '*.fasta') ) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write( base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + 
'/REGION-mean-mute-freqs.csv' ) # REGION is replace by each region in the three output files) genes_with_counts = [ g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys() ] glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False) for column in self.counts: index = None outfname = None if column == 'all': index = tuple(list(utils.index_columns) + [ 'cdr3_length', ]) outfname = base_outdir + '/' + utils.get_parameter_fname( column='all') elif '_content' in column: index = [ column, ] outfname = base_outdir + '/' + column + '.csv' else: index = [ column, ] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname( column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with open(outfname, 'w') as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print '(%.1f sec)' % (time.time() - start)
class ParameterCounter(object): """ class to keep track of how many times we've seen each gene version, erosion length, insertion (length and base content), and mutation """ def __init__(self, glfo, args): self.glfo = glfo self.args = args self.mfreqer = MuteFreqer(self.glfo) self.reco_total = 0 # total number of recombination events self.mute_total = 0 # total number of sequences self.counts = {} self.counts["all"] = {} for column in utils.column_dependencies: self.counts[column] = {} self.string_columns = set([r + "_gene" for r in utils.regions]) for bound in utils.boundaries: self.counts[bound + "_insertion_content"] = {n: 0 for n in utils.nukes} # base content of each insertion self.string_columns.add(bound + "_insertion_content") self.counts["seq_content"] = {n: 0 for n in utils.nukes} self.string_columns.add("seq_content") self.columns_to_subset_by_gene = [e + "_del" for e in utils.real_erosions + utils.effective_erosions] + [ b + "_insertion" for b in utils.boundaries ] # ---------------------------------------------------------------------------------------- def get_index(self, info, deps): index = [] for ic in deps: if ic[2:] == "_insertion": # insertion length index.append(len(info[ic])) else: assert "insertion" not in ic assert "content" not in ic index.append(info[ic]) return tuple(index) # ---------------------------------------------------------------------------------------- def increment(self, info): self.increment_per_family_params(info) for iseq in range(len(info["seqs"])): self.increment_per_sequence_params(info, iseq) # ---------------------------------------------------------------------------------------- def increment_per_sequence_params(self, info, iseq): """ increment parameters that differ for each sequence within the clonal family """ self.mute_total += 1 self.mfreqer.increment(info, iseq) for nuke in info["seqs"][iseq]: if nuke in utils.ambiguous_bases: continue self.counts["seq_content"][nuke] += 1 # 
---------------------------------------------------------------------------------------- def increment_per_family_params(self, info): """ increment parameters that are the same for the entire clonal family """ self.reco_total += 1 all_index = self.get_index(info, tuple(list(utils.index_columns) + ["cdr3_length"])) if all_index not in self.counts["all"]: self.counts["all"][all_index] = 0 self.counts["all"][all_index] += 1 for deps in utils.column_dependency_tuples: column = deps[0] index = self.get_index(info, deps) if index not in self.counts[column]: self.counts[column][index] = 0 self.counts[column][index] += 1 for bound in utils.boundaries: for nuke in info[bound + "_insertion"]: if nuke in utils.ambiguous_bases: continue self.counts[bound + "_insertion_content"][nuke] += 1 # ---------------------------------------------------------------------------------------- def clean_plots(self, plotdir): self.mfreqer.clean_plots(plotdir + "/mute-freqs") utils.prep_dir(plotdir + "/overall") # , multilings=('*.csv', '*.svg')) for column in self.counts: if column in self.columns_to_subset_by_gene: thisplotdir = plotdir + "/" + column utils.prep_dir(thisplotdir, wildlings=["*.csv", "*.svg"]) # ---------------------------------------------------------------------------------------- def plot(self, plotdir, only_csv=False, only_overall=False): print " plotting parameters", sys.stdout.flush() start = time.time() self.clean_plots(plotdir) self.mfreqer.plot(plotdir + "/mute-freqs", only_csv=only_csv, only_overall=only_overall) overall_plotdir = plotdir + "/overall" for column in self.counts: if column == "all": continue values, gene_values = {}, {} for index, count in self.counts[column].iteritems(): column_val = index[0] if column_val not in values: values[column_val] = 0.0 values[column_val] += count if column in self.columns_to_subset_by_gene: gene = index[ 1 ] # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be 
incompatible. so screw it utils.split_gene(gene) # checks validity of gene if gene not in gene_values: gene_values[gene] = {} if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count var_type = "string" if column in self.string_columns else "int" hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True) plotting.draw_no_root( hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv, ) if column in self.columns_to_subset_by_gene and not only_overall: thisplotdir = plotdir + "/" + column for gene in gene_values: plotname = utils.sanitize_name(gene) + "-" + column hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True) plotting.draw_no_root( hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get(column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv, ) if not only_csv: plotting.make_html(thisplotdir) if not only_csv: plotting.make_html(overall_plotdir) print "(%.1f sec)" % (time.time() - start) # ---------------------------------------------------------------------------------------- def write(self, base_outdir): print " writing parameters", sys.stdout.flush() start = time.time() utils.prep_dir( base_outdir, subdirs=("hmms", "mute-freqs", "germline-sets"), wildlings=("*.csv", "*.yaml", "*.fasta") ) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write( base_outdir + "/mute-freqs", mean_freq_outfname=base_outdir + "/REGION-mean-mute-freqs.csv" ) # REGION is replace by each region in the three output files) genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + "_gene"].keys()] glutils.write_glfo(base_outdir + "/" + 
glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False) for column in self.counts: index = None outfname = None if column == "all": index = tuple(list(utils.index_columns) + ["cdr3_length"]) outfname = base_outdir + "/" + utils.get_parameter_fname(column="all") elif "_content" in column: index = [column] outfname = base_outdir + "/" + column + ".csv" else: index = [column] + utils.column_dependencies[column] outfname = base_outdir + "/" + utils.get_parameter_fname(column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with opener("w")(outfname) as outfile: out_fieldnames = list(index) out_fieldnames.append("count") out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line["count"] = count out_data.writerow(line) print "(%.1f sec)" % (time.time() - start)
class ParameterCounter(object):
    """
    Keep track of how many times we've seen each gene version, erosion length,
    insertion (length and base content), and mutation.

    Everything is stored in self.counts, a dict keyed by column name:
      - 'all': maps the tuple returned by get_index() for utils.index_columns to a count
      - each column in utils.column_dependencies: maps the get_index() tuple for that
        column's entry in utils.column_dependency_tuples to a count
      - '<bound>_insertion_content' and 'seq_content': map each of A, C, G, T to a count
    """
    def __init__(self, germline_seqs):
        self.total = 0  # number of times increment() has been called
        self.counts = {}
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}  # base content of each insertion
        self.counts['seq_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
        self.mutefreqer = MuteFreqer(germline_seqs)

    # ----------------------------------------------------------------------------------------
    def _index_and_fname(self, base_outdir, column):
        """ Return (csv header columns, output file path) for <column>, so that clean() and write() always agree on file names. """
        if column == 'all':
            return utils.index_columns, base_outdir + '/' + utils.get_parameter_fname(column='all')
        if '_content' in column:  # base content columns have no dependencies, and get their own simple file name
            return [column, ], base_outdir + '/' + column + '.csv'
        index = [column, ] + utils.column_dependencies[column]
        return index, base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)

    # ----------------------------------------------------------------------------------------
    def clean(self, base_outdir=None):
        """
        Remove all the parameter files, i.e. the per-column csv files that write() creates.

        base_outdir: directory the files were written to. If None, self.base_outdir is used;
            note that this class never sets that attribute itself, so in that case the caller
            must have assigned it (otherwise this raises AttributeError, as it always did).
        """
        self.mutefreqer.clean()
        if base_outdir is None:
            base_outdir = self.base_outdir
        for column in self.counts:
            os.remove(self._index_and_fname(base_outdir, column)[1])

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """
        Return the tuple of values in <info> for the columns in <deps>, for use as a key in self.counts.
        Columns named '??_insertion' contribute the *length* of the insertion rather than the insertion itself.
        """
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        """ Add the single annotation <info> to all of the counters (including the mutation frequency counter). """
        self.total += 1

        all_index = self.get_index(info, utils.index_columns)
        if all_index not in self.counts['all']:
            self.counts['all'][all_index] = 0
        self.counts['all'][all_index] += 1

        for deps in utils.column_dependency_tuples:
            column = deps[0]  # first entry is the column itself, the rest are the columns it depends on
            index = self.get_index(info, deps)
            if index not in self.counts[column]:
                self.counts[column][index] = 0
            self.counts[column][index] += 1

        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                self.counts[bound + '_insertion_content'][nuke] += 1
        for nuke in info['seq']:
            self.counts['seq_content'][nuke] += 1

        self.mutefreqer.increment(info)

    # ----------------------------------------------------------------------------------------
    def __str__(self):
        # NOTE(review): this looks up utils.column_dependencies[column] for every key of self.counts, so it
        # presumably raises KeyError for 'all' and the '*_content' columns (which is what the printed
        # message below is alluding to) -- confirm against utils before relying on str() of this object
        return_str = []
        print('hm I think I was too lazy to put \'all\' in this string')
        print(' or [vdj]_insertion_content or seq_content')
        for column in self.counts:
            return_str.append('%s\n' % column)
            return_str.append('%20s' % column)
            for dep in utils.column_dependencies[column]:
                return_str.append('%20s' % dep)
            return_str.append('\n')
            for index, count in self.counts[column].items():
                for val in index:
                    return_str.append('%20s' % str(val))
                return_str.append(' %d / %d = %f\n' % (count, self.total, float(count) / self.total))
        return ''.join(return_str)

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
        """
        Make a histogram (and csv) in <plotdir> for each column in self.counts except 'all', then plot mutation frequencies.

        subset_by_gene: if set, deletion and (real) insertion columns are also plotted separately for each gene,
            in the subdirectory <plotdir>/<column>
        cyst_positions, tryp_positions: passed straight through to the mutation frequency plotter
        Raises AssertionError if any column has no counts.
        """
        print(' plotting parameters')
        start = time.time()
        utils.prep_dir(plotdir + '/plots')
        for column in self.counts:
            if column == 'all':
                continue
            values, gene_values = {}, {}
            if len(self.counts[column]) == 0:
                print('ERROR no counts in %s' % column)
                raise AssertionError('no counts in %s' % column)  # explicit raise, so that the check survives python -O (var_type would be unbound below)
            # option to subset deletion and (real) insertion plots by gene
            subset_this_column = subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion')
            for index, count in self.counts[column].items():
                gene = None
                if subset_this_column:
                    if '_del' in column:
                        region = column[0]
                    else:
                        region = column[1]
                    assert region in utils.regions
                    # NOTE this is hackey, but it works fine now and will fail obviously if I ever change the correlations to be incompatible
                    assert 'IGH' + region.upper() in index[1]
                    gene = index[1]
                    if gene not in gene_values:
                        gene_values[gene] = {}

                column_val = index[0]
                if gene is not None:
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                try:
                    int(column_val)
                    var_type = 'int'
                except (TypeError, ValueError):  # only the failures int() itself signals, so e.g. KeyboardInterrupt still propagates
                    var_type = 'string'

            if subset_this_column:
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
                check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
                check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)

        self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)

        if has_root:  # NOTE(review): not defined in this class, presumably a module-level flag -- confirm
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up

        print(' parameter plot time: %.3f' % (time.time() - start))

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):
        """
        Write the mutation frequencies, and then one csv file for each column in self.counts, to <base_outdir>.
        Each csv has one row for each key in that column's counts, with a final 'count' column.
        """
        print(' writing parameters')
        start = time.time()

        utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))
        mute_start = time.time()
        self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replaced by each region in the three output files
        print(' mut freq write time: %.3f' % (time.time() - mute_start))
        for column in self.counts:
            index, outfname = self._index_and_fname(base_outdir, column)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].items():
                    line = {}
                    for ic, val in enumerate(key):  # for the base content columns <key> is a single-character string
                        line[index[ic]] = val
                    line['count'] = count
                    out_data.writerow(line)

        print(' parameter write time: %.3f' % (time.time() - start))