Example #1
0
    def __init__(self, glfo, args):
        """Store the inputs and build the (empty) count tables.

        <glfo> and <args> are kept as attributes, and <args.region_end_exclusions> is
        handed to the MuteFreqer as its <exclusions> keyword.
        """
        self.glfo = glfo
        self.args = args
        self.mfreqer = MuteFreqer(self.glfo, exclusions=args.region_end_exclusions)

        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences

        # an empty table for 'all', then one for each column in utils.column_dependencies
        self.counts = {'all': {}}
        for column in utils.column_dependencies:
            self.counts[column] = {}

        self.string_columns = {region + '_gene' for region in utils.regions}
        for bound in utils.boundaries:
            content_column = bound + '_insertion_content'
            self.counts[content_column] = dict.fromkeys(utils.nukes, 0)  # base content of each insertion
            self.string_columns.add(content_column)

        self.counts['cdr3_length'] = {}
        # NOTE this one is the *nucleotide* content: the name predates the aa version below, and isn't changed since it corresponds to a lot of existing file paths
        self.counts['seq_content'] = dict.fromkeys(utils.nukes, 0)
        self.init_aa_stuff()  # presumably what sets <self.all_aa>, which is read on the next line
        self.counts['seq_aa_content'] = dict.fromkeys(self.all_aa, 0)
        self.string_columns.update(('seq_content', 'seq_aa_content'))

        # Columns that get counted (so they can be plotted), but not written to the parameter dir, since
        #   a) cdr3_length is better viewed as an output of more fundamental parameters (gene choice, insertion + deletion lengths), and
        #   b) they were added long after the others, and shouldn't add new files to the established parameter directory structure
        self.no_write_columns = ['cdr3_length', 'seq_aa_content']

        deletion_columns = [erosion + '_del' for erosion in utils.all_erosions]
        insertion_columns = [bound + '_insertion' for bound in utils.boundaries]
        self.columns_to_subset_by_gene = deletion_columns + insertion_columns
Example #2
0
 def __init__(self, germline_seqs):
     """Zero the totals, build the empty count tables, and make a MuteFreqer from <germline_seqs>."""
     self.reco_total = 0  # total number of recombination events
     self.mute_total = 0  # total number of sequences

     # an empty table for 'all', then one for each column in utils.column_dependencies
     self.counts = {'all': {}}
     for column in utils.column_dependencies:
         self.counts[column] = {}

     # per-base tallies start at zero for every base in utils.nukes
     for bound in utils.boundaries:
         self.counts[bound + '_insertion_content'] = dict.fromkeys(utils.nukes, 0)  # base content of each insertion
     self.counts['seq_content'] = dict.fromkeys(utils.nukes, 0)

     self.mutefreqer = MuteFreqer(germline_seqs)
Example #3
0
    def __init__(self, glfo, args):
        """Store <glfo> and <args>, set the tuning constants used by the fits, and create the empty accumulators."""
        self.glfo = glfo
        self.args = args
        self.finalized = False
        self.new_allele_info = []

        # ----- which sequences and positions get used
        self.n_max_mutations_per_segment = 20  # sequences whose v segments have more mutations than this are ignored
        self.n_muted_min = 15  # positions with fewer mutations than this don't get fit
        self.n_total_min = 15  # ...nor do positions with fewer total observations than this
        self.n_five_prime_positions_to_exclude = 5  # skip positions that are too close to the 5' end of V (misassigned insertions look like snps)
        self.min_non_candidate_positions_to_fit = 10  # always fit at least a few non-candidate positions

        # ----- how the fits are set up
        self.small_number = 1e-5
        self.n_max_snps = self.n_max_mutations_per_segment - 9  # try excluding up to this many bins (on the left) when doing the fit (leaves at least 9 points for fit)
        self.max_fit_length = 10  # for each istart, fit at most this many bins (the first few positions matter most, and fitting too far to the right dilutes them)
        self.min_y_intercept = 0.15  # roughly the expression level of the least common allele to which we have sensitivity
        self.default_slope_bounds = (-0.2, 0.2)  # the fitting function needs some reasonable bounds from which to start
        self.big_y_icpt_bounds = (self.min_y_intercept, 1.5)  # snp-candidate positions should fit well when forced to use these bounds, non-snp positions should fit terribly

        # ----- what it takes to call a candidate
        # self.min_score = 2  # (mean ratio over snp candidates) - (first non-candidate ratio) must be greater than this
        self.min_min_candidate_ratio = 2.25  # every candidate ratio must be greater than this
        # self.max_non_candidate_ratio = 2.  # first non-candidate has to be smaller than this
        self.min_snp_big_icpt_residual = 2.  # snp candidates must have a better (smaller residual) big-intercept fit than this

        # ----- accumulators
        self.fitted_positions = {}  # positions for which, for any <istart>, we have fit info
        self.mfreqer = MuteFreqer(glfo)
        self.gene_obs_counts = {}  # only used for allele-finding
        self.counts = {}
        self.plotvals = {}
Example #4
0
    def __init__(self, glfo, args):
        """Store <glfo> and <args>, make the MuteFreqer, and build the empty count tables."""
        self.glfo = glfo
        self.args = args
        self.mfreqer = MuteFreqer(self.glfo)

        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences

        # an empty table for 'all', then one for each column in utils.column_dependencies
        self.counts = {'all': {}}
        for column in utils.column_dependencies:
            self.counts[column] = {}

        # columns listed here are the gene names plus the per-base content tallies
        self.string_columns = {region + '_gene' for region in utils.regions}
        for bound in utils.boundaries:
            content_column = bound + '_insertion_content'
            self.counts[content_column] = dict.fromkeys(utils.nukes, 0)  # base content of each insertion
            self.string_columns.add(content_column)
        self.counts['seq_content'] = dict.fromkeys(utils.nukes, 0)
        self.string_columns.add('seq_content')

        deletion_columns = [erosion + '_del' for erosion in utils.all_erosions]
        insertion_columns = [bound + '_insertion' for bound in utils.boundaries]
        self.columns_to_subset_by_gene = deletion_columns + insertion_columns
 def __init__(self, germline_seqs):
     """Zero the total, build the empty count tables, and make a MuteFreqer from <germline_seqs>."""
     self.total = 0

     # an empty table for 'all', then one for each column in utils.column_dependencies
     self.counts = {'all': {}}
     for column in utils.column_dependencies:
         self.counts[column] = {}

     # per-base tallies start at zero for each of A, C, G, T
     for bound in utils.boundaries:
         self.counts[bound + '_insertion_content'] = dict.fromkeys('ACGT', 0)  # base content of each insertion
     self.counts['seq_content'] = dict.fromkeys('ACGT', 0)

     self.mutefreqer = MuteFreqer(germline_seqs)
 def __init__(self, germline_seqs):
     """Initialize the counter: a zeroed total, empty count tables, and a MuteFreqer built from <germline_seqs>."""
     self.total = 0

     tables = {'all': {}}
     for column in utils.column_dependencies:
         tables[column] = {}
     for bound in utils.boundaries:
         tables[bound + '_insertion_content'] = {base: 0 for base in 'ACGT'}  # base content of each insertion
     tables['seq_content'] = {base: 0 for base in 'ACGT'}
     self.counts = tables

     self.mutefreqer = MuteFreqer(germline_seqs)
Example #7
0
 def __init__(self, germline_seqs):
     """Initialize the counter: zeroed totals, empty count tables, and a MuteFreqer built from <germline_seqs>."""
     self.reco_total = 0  # total number of recombination events
     self.mute_total = 0  # total number of sequences

     tables = {'all': {}}
     for column in utils.column_dependencies:
         tables[column] = {}
     for bound in utils.boundaries:
         tables[bound + '_insertion_content'] = {nuke: 0 for nuke in utils.nukes}  # base content of each insertion
     tables['seq_content'] = {nuke: 0 for nuke in utils.nukes}
     self.counts = tables

     self.mutefreqer = MuteFreqer(germline_seqs)
Example #8
0
 def __init__(self, glfo, args):
     """Store <glfo> and <args>, make the MuteFreqer, and build the empty count tables."""
     self.glfo = glfo
     self.args = args
     self.mfreqer = MuteFreqer(self.glfo)

     self.reco_total = 0  # total number of recombination events
     self.mute_total = 0  # total number of sequences

     # an empty table for 'all', then one for each column in utils.column_dependencies
     self.counts = {'all': {}}
     for column in utils.column_dependencies:
         self.counts[column] = {}

     # per-base tallies start at zero for every base in utils.nukes
     for bound in utils.boundaries:
         self.counts[bound + '_insertion_content'] = dict.fromkeys(utils.nukes, 0)  # base content of each insertion
     self.counts['seq_content'] = dict.fromkeys(utils.nukes, 0)
Example #9
0
    def __init__(self, glfo, args):
        """Store <glfo> and <args>, make the MuteFreqer, and build the empty count tables."""
        self.glfo = glfo
        self.args = args
        self.mfreqer = MuteFreqer(self.glfo)

        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences

        # an empty table for "all", then one for each column in utils.column_dependencies
        self.counts = {"all": {}}
        for column in utils.column_dependencies:
            self.counts[column] = {}

        # columns listed here are the gene names plus the per-base content tallies
        self.string_columns = {region + "_gene" for region in utils.regions}
        for bound in utils.boundaries:
            content_column = bound + "_insertion_content"
            self.counts[content_column] = dict.fromkeys(utils.nukes, 0)  # base content of each insertion
            self.string_columns.add(content_column)
        self.counts["seq_content"] = dict.fromkeys(utils.nukes, 0)
        self.string_columns.add("seq_content")

        # both the real and the effective erosions get a deletion column
        erosions = utils.real_erosions + utils.effective_erosions
        deletion_columns = [erosion + "_del" for erosion in erosions]
        insertion_columns = [bound + "_insertion" for bound in utils.boundaries]
        self.columns_to_subset_by_gene = deletion_columns + insertion_columns
Example #10
0
class AlleleFinder(object):
    def __init__(self, glfo, args):
        self.glfo = glfo
        self.args = args

        self.new_allele_info = []

        self.small_number = 1e-5
        self.n_max_mutations_per_segment = 20  # don't look at sequences whose v segments have more than this many mutations
        self.n_max_snps = self.n_max_mutations_per_segment - 9  # try excluding up to this many bins (on the left) when doing the fit (leaves at least 9 points for fit)
        self.max_fit_length = 10  # don't fit more than this many bins for each istart (the first few positions in the fit are the most important, and if we fit too far to the right these important positions get diluted)
        self.n_muted_min = 15  # don't fit positions that have fewer mutations than this
        self.n_total_min = 15  # ...or fewer total observations than this
        self.n_five_prime_positions_to_exclude = 5  # skip positions that are too close to the 5' end of V (misassigned insertions look like snps)
        self.min_non_candidate_positions_to_fit = 10  # always fit at least a few non-candidate positions
        self.min_y_intercept = 0.15  # corresponds, roughly, to the expression level of the least common allele to which we have sensitivity
        self.default_slope_bounds = (-0.2, 0.2)  # fitting function needs some reasonable bounds from which to start
        self.big_y_icpt_bounds = (self.min_y_intercept, 1.5)  # snp-candidate positions should fit well when forced to use these bounds, but non-snp positions should fit like &*@!*
        # self.min_score = 2  # (mean ratio over snp candidates) - (first non-candidate ratio) must be greater than this
        self.min_min_candidate_ratio = 2.25  # every candidate ratio must be greater than this
        # self.max_non_candidate_ratio = 2.  # first non-candidate has to be smaller than this
        self.min_snp_big_icpt_residual = 2.  # snp candidates must have a better (smaller residual) big-intercept fit than this
        self.fitted_positions = {}  # positions that, for any <istart>, we have fit info

        self.mfreqer = MuteFreqer(glfo)
        self.gene_obs_counts = {}  # only used for allele-finding
        self.counts = {}
        self.plotvals = {}

        self.finalized = False

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        self.mfreqer.increment(info)

        for region in utils.regions:
            regional_freq, len_excluding_ambig = utils.get_mutation_rate(info, restrict_to_region=region, return_len_excluding_ambig=True)
            n_mutes = regional_freq * len_excluding_ambig  # total number of mutations in the region (for allele finding stuff)
            if abs(n_mutes - int(n_mutes)) > 1e6:
                raise Exception('n mutated %f not an integer' % n_mutes)
            n_mutes = int(n_mutes)

            gene = info[region + '_gene']
            if gene not in self.counts:
                self.counts[gene] = {}
                self.gene_obs_counts[gene] = 0
            self.gene_obs_counts[gene] += 1

            gcts = self.counts[gene]  # shorthand name

            germline_seq = info[region + '_gl_seq']
            query_seq = info[region + '_qr_seq']
            assert len(germline_seq) == len(query_seq)

            for ipos in range(len(germline_seq)):
                igl = ipos + int(info[region + '_5p_del'])  # account for left-side deletions in the indexing

                if germline_seq[ipos] in utils.ambiguous_bases or query_seq[ipos] in utils.ambiguous_bases:  # skip if either germline or query sequence is ambiguous at this position
                    continue

                if igl not in gcts:  # if we have not yet observed this position in a query sequence, initialize it
                    gcts[igl] = {}

                if igl not in gcts:
                    gcts[igl] = {}
                if utils.get_region(gene) == 'v':
                    if n_mutes not in gcts[igl]:
                        gcts[igl][n_mutes] = {n : 0 for n in ['muted', 'total'] + utils.nukes}
                    gcts[igl][n_mutes]['total'] += 1
                    if query_seq[ipos] != germline_seq[ipos]:  # if this position is mutated
                        gcts[igl][n_mutes]['muted'] += 1  # mark that we saw this germline position mutated once in a sequence with <n_mutes> regional mutation frequency
                    gcts[igl][n_mutes][query_seq[ipos]] += 1  # only used to work out what the snp'd base is if there's a new allele

    # ----------------------------------------------------------------------------------------
    def get_residual_sum(self, xvals, yvals, errs, slope, intercept):
        def expected(x):
            return slope * x + intercept
        residual_sum = sum([(y - expected(x))**2 / err**2 for x, y, err in zip(xvals, yvals, errs)])
        return residual_sum

    # ----------------------------------------------------------------------------------------
    def get_curvefit(self, n_mutelist, freqs, errs, y_icpt_bounds=None):
        def func(x, slope, y_icpt):
            return slope*x + y_icpt

        bounds = (-float('inf'), float('inf'))
        if y_icpt_bounds is not None:
            bounds = [[s, y] for s, y in zip(self.default_slope_bounds, y_icpt_bounds)]
        params, cov = scipy.optimize.curve_fit(func, n_mutelist, freqs, sigma=errs, bounds=bounds)
        slope, slope_err = params[0], math.sqrt(cov[0][0])
        y_icpt, y_icpt_err = params[1], math.sqrt(cov[1][1])
        residual_sum = self.get_residual_sum(n_mutelist, freqs, errs, slope, y_icpt)
        ndof = len(n_mutelist) - 1
        fitfo = {
            'slope'  : slope,
            'y_icpt' : y_icpt,
            'slope_err'  : slope_err,
            'y_icpt_err' : y_icpt_err,
            'residuals_over_ndof' : float(residual_sum) / ndof,
            'print_str' : '    %9.3f +/- %-9.3f   %7.4f +/- %7.4f    %7.4f' % (y_icpt, y_icpt_err, slope, slope_err, float(residual_sum) / ndof),
            'n_mutelist' : n_mutelist,
            'freqs' : freqs,
            'errs' : errs
        }

        return fitfo

    # ----------------------------------------------------------------------------------------
    def get_allele_finding_xyvals(self, gene, position):
        gpcounts = self.counts[gene][position]
        iterinfo = gpcounts.items()

        obs = [d['muted'] for nm, d in iterinfo if nm < self.n_max_mutations_per_segment]

        lohis = [fraction_uncertainty.err(d['muted'], d['total'], use_beta=True) for nm, d in iterinfo if nm < self.n_max_mutations_per_segment]
        errs = [(hi - lo) / 2 for lo, hi, _ in lohis]
        weights = [1./(e*e) for e in errs]

        freqs = [float(d['muted']) / d['total'] if d['total'] > 0 else 0. for nm, d in iterinfo if nm < self.n_max_mutations_per_segment]
        total = [d['total'] for nm, d in iterinfo if nm < self.n_max_mutations_per_segment]

        n_mutelist = [nm for nm in gpcounts.keys() if nm < self.n_max_mutations_per_segment]

        return {'obs' : obs, 'total' : total, 'n_mutelist' : n_mutelist, 'freqs' : freqs, 'errs' : errs, 'weights' : weights}

    # ----------------------------------------------------------------------------------------
    def is_a_candidate(self, gene, fitfo, istart, debug=False):
        # NOTE I've tried adding a requirement on the actual value of the big-icpt (or, equivalently, small-icpt) fit, but it seems to be better to just use the ratio (probably because stuff is correlated)
        if fitfo['min_snp_ratios'][istart] < self.min_min_candidate_ratio:  # worst snp candidate has to be pretty good on its own
            if debug:
                print '    min snp ratio %s too small (less than %s)' % (fstr(fitfo['min_snp_ratios'][istart]), fstr(self.min_min_candidate_ratio))
            return False
        for candidate_pos in fitfo['candidates'][istart]:  # return false if any of the candidate positions don't have enough counts with <istart> mutations (probably a homozygous new allele with more than <istart> snps)
            if istart not in self.counts[gene][candidate_pos] or self.counts[gene][candidate_pos][istart]['total'] < self.n_total_min:
                if debug:
                    print '    not enough counts at this position with %d mutations (%s < %s)' % (istart, fstr(self.counts[gene][candidate_pos][istart]['total']), fstr(self.n_total_min))
                return False

        if debug:
            print '    candidate'
        return True

    # ----------------------------------------------------------------------------------------
    def get_positions_to_fit(self, gene, gene_results, debug=False):
        self.fitted_positions[gene] = set()

        positions = sorted(self.mfreqer.counts[gene].keys())
        xyvals = {pos : self.get_allele_finding_xyvals(gene, pos) for pos in positions}
        positions_to_try_to_fit = [pos for pos in positions if sum(xyvals[pos]['obs']) > self.n_muted_min or sum(xyvals[pos]['total']) > self.n_total_min]  # ignore positions with neither enough mutations or total observations
        if len(positions_to_try_to_fit) < self.n_max_snps - 1 + self.min_non_candidate_positions_to_fit:
            gene_results['not_enough_obs_to_fit'].add(gene)
            if debug:
                print '          not enough positions with enough observations to fit %s' % utils.color_gene(gene)
                return None, None
        if debug and len(positions) > len(positions_to_try_to_fit):
            print '          skipping %d / %d positions (with fewer than %d mutations and %d observations)' % (len(positions) - len(positions_to_try_to_fit), len(positions), self.n_muted_min, self.n_total_min)

        self.plotvals[gene] = {}
        for pos in positions_to_try_to_fit:
            self.plotvals[gene][pos] = xyvals[pos]

        return positions_to_try_to_fit, xyvals

    # ----------------------------------------------------------------------------------------
    def fit_istart(self, gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=False):
        residuals = {}
        for pos in positions_to_try_to_fit:
            # skip positions that are too close to the 5' end of V (misassigned insertions look like snps)
            if pos > len(self.glfo['seqs'][utils.get_region(gene)][gene]) - self.n_five_prime_positions_to_exclude - 1:
                continue

            # as long as we already have a few non-candidate positions, skip positions that have no frequencies greater than the min y intercept (note that they could in principle still have a large y intercept, but we don't really care)
            if len(residuals) > istart + self.min_non_candidate_positions_to_fit and len([f for f in subxyvals[pos]['freqs'] if f > self.min_y_intercept]) == 0:
                continue

            if sum(subxyvals[pos]['total']) < self.n_total_min:
                continue

            # also skip positions that only have a few points to fit (i.e. genes that were very rare, or I guess maybe if they were always eroded past this position)
            if len(subxyvals[pos]['n_mutelist']) < 3:
                continue

            zero_icpt_fit = self.get_curvefit(subxyvals[pos]['n_mutelist'], subxyvals[pos]['freqs'], subxyvals[pos]['errs'], y_icpt_bounds=(0. - self.small_number, 0. + self.small_number))
            big_icpt_fit = self.get_curvefit(subxyvals[pos]['n_mutelist'], subxyvals[pos]['freqs'], subxyvals[pos]['errs'], y_icpt_bounds=self.big_y_icpt_bounds)

            residuals[pos] = {'zero_icpt' : zero_icpt_fit['residuals_over_ndof'], 'big_icpt' : big_icpt_fit['residuals_over_ndof']}

            self.fitted_positions[gene].add(pos)  # if we already did the fit for another <istart>, it'll already be in there

        if len(residuals) <= istart:  # needs to be at least one longer, so we have the first-non-snp
            if debug:
                print '      not enough observations to fit more than %d snps' % (istart - 1)
            return

        residual_ratios = {pos : float('inf') if r['big_icpt'] == 0. else r['zero_icpt'] / r['big_icpt'] for pos, r in residuals.items()}
        sorted_ratios = sorted(residual_ratios.items(), key=operator.itemgetter(1), reverse=True)  # sort the positions in decreasing order of residual ratio
        candidate_snps = [pos for pos, _ in sorted_ratios[:istart]]  # the first <istart> positions are the "candidate snps"
        max_non_snp, max_non_snp_ratio = sorted_ratios[istart]  # position and ratio for largest non-candidate
        min_candidate_ratio = min([residual_ratios[cs] for cs in candidate_snps])

        # fitfo['scores'][istart] = (min_candidate_ratio - max_non_snp_ratio) / max(self.small_number, max_non_snp_ratio)
        fitfo['min_snp_ratios'][istart] = min([residual_ratios[cs] for cs in candidate_snps])
        fitfo['candidates'][istart] = {cp : residual_ratios[cp] for cp in candidate_snps}

        if debug:
            # if debug > 1:
            #     print '%70s %s' % ('', ''.join(['%11d' % nm for nm in subxyvals[max_non_snp]['n_mutelist']]))
            for pos in candidate_snps + [max_non_snp, ]:
                xtrastrs = ('[', ']') if pos == max_non_snp else (' ', ' ')
                pos_str = '%3s' % str(pos)
                if residual_ratios[pos] > self.min_min_candidate_ratio:
                    pos_str = utils.color('yellow', pos_str)
                print '               %s %s    %5s   (%5s / %-5s)       %4d / %-4d %s' % (xtrastrs[0], pos_str, fstr(residual_ratios[pos]),
                                                                                       fstr(residuals[pos]['zero_icpt']), fstr(residuals[pos]['big_icpt']),
                                                                                       sum(subxyvals[pos]['obs']), sum(subxyvals[pos]['total']), xtrastrs[1]),
                # if debug > 1:
                #     print '      ', ''.join(['%4d / %-4d' % (subxyvals[pos]['obs'][inm], subxyvals[pos]['total'][inm]) for inm in range(len(subxyvals[pos]['n_mutelist']))])
                print ''

    # ----------------------------------------------------------------------------------------
    def add_new_allele(self, gene, fitfo, n_candidate_snps, debug=False):
        # figure out what the new nukes are
        old_seq = self.glfo['seqs'][utils.get_region(gene)][gene]
        new_seq = old_seq
        mutfo = {}
        for pos in sorted(fitfo['candidates'][n_candidate_snps]):
            obs_counts = {nuke : self.counts[gene][pos][n_candidate_snps][nuke] for nuke in utils.nukes}  # NOTE it's super important to only use the counts from sequences with <n_candidate_snps> total mutations
            sorted_obs_counts = sorted(obs_counts.items(), key=operator.itemgetter(1), reverse=True)
            original_nuke = self.mfreqer.counts[gene][pos]['gl_nuke']
            new_nuke = None
            for nuke, _ in sorted_obs_counts:  # take the most common one that isn't the existing gl nuke
                if nuke != original_nuke:
                    new_nuke = nuke
                    break
            print '   %3d  (%s --> %s)' % (pos, original_nuke, new_nuke),
            assert old_seq[pos] == original_nuke
            mutfo[pos] = {'original' : original_nuke, 'new' : new_nuke}
            new_seq = new_seq[:pos] + new_nuke + new_seq[pos+1:]

        new_name, mutfo = glutils.get_new_allele_name_and_change_mutfo(gene, mutfo)
        print ''
        print '          %s   %s' % (old_seq, utils.color_gene(gene))
        print '          %s   %s' % (utils.color_mutants(old_seq, new_seq), utils.color_gene(new_name))

        # and add it to the set of new alleles for this gene
        self.new_allele_info.append({
            'template-gene' : gene,
            'gene' : new_name,
            'seq' : new_seq,
            'aligned-seq' : None
        })

    # ----------------------------------------------------------------------------------------
    def finalize(self, debug=False):
        assert not self.finalized

        self.mfreqer.finalize()

        start = time.time()
        gene_results = {'not_enough_obs_to_fit' : set(), 'didnt_find_anything_with_fit' : set(), 'new_allele' : set()}
        if debug:
            print '\nlooking for new alleles:'
        for gene in sorted(self.mfreqer.counts):
            if utils.get_region(gene) != 'v':
                continue
            if debug:
                print '\n%s (observed %d %s)' % (utils.color_gene(gene), self.gene_obs_counts[gene], utils.plural_str('time', self.gene_obs_counts[gene]))

            positions_to_try_to_fit, xyvals = self.get_positions_to_fit(gene, gene_results, debug=debug)
            if positions_to_try_to_fit is None:
                continue

            fitfo = {n : {} for n in ('min_snp_ratios', 'candidates')}
            for istart in range(1, self.n_max_snps):
                if debug:
                    if istart == 1:
                        print '                                 resid. / ndof'
                        print '             position   ratio   (m=0 / m>%5.2f)       muted / obs ' % self.big_y_icpt_bounds[0]
                    print '  %d %s' % (istart, utils.plural_str('snp', istart))

                subxyvals = {pos : {k : v[istart : istart + self.max_fit_length] for k, v in xyvals[pos].items()} for pos in positions_to_try_to_fit}
                self.fit_istart(gene, istart, positions_to_try_to_fit, subxyvals, fitfo, debug=debug)
                if istart not in fitfo['candidates']:  # if it didn't get filled, we didn't have enough observations to do the fit
                    break

            istart_candidates = []
            if debug:
                print '  evaluating each snp hypothesis'
                print '    snps       min ratio'
            for istart in fitfo['candidates']:
                if debug:
                    print '    %2d     %9s' % (istart, fstr(fitfo['min_snp_ratios'][istart])),
                if self.is_a_candidate(gene, fitfo, istart, debug=debug):
                    istart_candidates.append(istart)

            if len(istart_candidates) > 0:
                n_candidate_snps = min(istart_candidates)  # add the candidate with the smallest number of snps to the germline set, and run again (if the firs
                gene_results['new_allele'].add(gene)
                print '\n    found a new allele candidate separated from %s by %d %s at %s:' % (utils.color_gene(gene), n_candidate_snps,
                                                                                                utils.plural_str('snp', n_candidate_snps), utils.plural_str('position', n_candidate_snps)),
                self.add_new_allele(gene, fitfo, n_candidate_snps, debug=debug)
            else:
                gene_results['didnt_find_anything_with_fit'].add(gene)
                if debug:
                    print '  no new alleles'

        if debug:
            print 'found new alleles for %d %s (there were also %d without new alleles, and %d without enough observations to fit)' % (len(gene_results['new_allele']), utils.plural_str('gene', len(gene_results['new_allele'])),
                                                                                                                                       len(gene_results['didnt_find_anything_with_fit']), len(gene_results['not_enough_obs_to_fit']))
            print '      allele finding time: %.1f' % (time.time()-start)

        self.finalized = True

    # ----------------------------------------------------------------------------------------
    def plot(self, base_plotdir, only_csv=False):
        if not self.finalized:
            self.finalize(debug=debug)

        plotdir = base_plotdir + '/allele-finding'

        for old_gene_dir in glob.glob(plotdir + '/*'):  # has to be a bit more hackey than elsewhere, since we have no way of knowing what genes might have had their own directories written last time we wrote to this dir
            if not os.path.isdir(old_gene_dir):
                raise Exception('not a directory: %s' % old_gene_dir)
            utils.prep_dir(old_gene_dir, wildlings=('*.csv', '*.svg'))
            os.rmdir(old_gene_dir)
        utils.prep_dir(plotdir, wildlings=('*.csv', '*.svg'))

        if only_csv:  # not implemented
            return

        start = time.time()
        for gene in self.plotvals:
            if utils.get_region(gene) != 'v':
                continue

            for position in self.plotvals[gene]:
                if position not in self.fitted_positions[gene]:  # we can make plots for the positions we didn't fit, but there's a *lot* of them and they're slow
                    continue
                # if 'allele-finding' not in self.TMPxyvals[gene][position] or self.TMPxyvals[gene][position]['allele-finding'] is None:
                #     continue
                plotting.make_allele_finding_plot(plotdir + '/' + utils.sanitize_name(gene), gene, position, self.plotvals[gene][position])
        print '      allele finding plot time: %.1f' % (time.time()-start)
Example #11
0
class ParameterCounter(object):
    """ Keeps a running tally of every rearrangement parameter (gene choice, deletion and insertion lengths, insertion and
    sequence base content, cdr3 length, cluster size) seen in the annotations passed to increment(), and can then plot
    those tallies or write them to a parameter directory. """
    def __init__(self, glfo, args):
        """ Make an empty counter for each column, plus the MuteFreqer that we hand the mutation counting to.

        glfo: germline info, passed to MuteFreqer here and to glutils.write_glfo() in write()
        args: command line args (the only one we read here is args.region_end_exclusions)
        """
        self.glfo = glfo
        self.args = args
        self.mfreqer = MuteFreqer(self.glfo, exclusions=args.region_end_exclusions)
        self.reco_total = 0  # number of rearrangement events, i.e. calls to increment_per_family_params()
        self.mute_total = 0  # number of sequences, i.e. calls to increment_per_sequence_params()

        # self.counts[column] maps each index that we've seen to the number of times that we saw it
        self.counts = {}
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}

        # columns whose values are strings (everything else gets histogrammed as an int in plot())
        self.string_columns = set(region + '_gene' for region in utils.regions)
        for bound in utils.boundaries:
            content_column = bound + '_insertion_content'
            self.counts[content_column] = dict((nuke, 0) for nuke in utils.nukes)  # number of times each base shows up in this insertion
            self.string_columns.add(content_column)
        self.counts['aa_cdr3_length'] = {}
        self.counts['non_vj_length'] = {}
        self.counts['seq_content'] = dict((nuke, 0) for nuke in utils.nukes)  # *nucleotide* content (there's an amino acid version below). It'd be clearer with 'nucleotide' in the name, but the name corresponds to lots of existing file paths
        self.counts['cluster_size'] = {}
        self.init_aa_stuff()
        self.counts['seq_aa_content'] = dict((aa, 0) for aa in self.all_aa)
        self.string_columns.update(('seq_content', 'seq_aa_content'))

        # These are only plotted, never written to the parameter dir: the lengths are better viewed as a consequence of
        # more fundamental parameters (gene choice, insertion + deletion lengths), and they were all added long after the
        # parameter directory structure was established, so we don't want to add new files to it.
        self.no_write_columns = ['aa_cdr3_length', 'non_vj_length', 'seq_aa_content']

        # columns that plot() also breaks down by gene
        self.columns_to_subset_by_gene = [erosion + '_del' for erosion in utils.all_erosions]
        self.columns_to_subset_by_gene += [bound + '_insertion' for bound in utils.boundaries]

        # columns for which plot() asks for the mean to be displayed
        self.mean_columns = ['aa_cdr3_length', 'non_vj_length']

    # ----------------------------------------------------------------------------------------
    def init_aa_stuff(self):
        """ Set self.all_aa to the set of everything that utils.ltranslate() returns for the three-letter codons made from utils.nukes plus 'N'. """
        codon_alphabet = utils.nukes + ['N']  # couldn't find a ready-made list of amino acids in Bio, so we just translate every possible codon
        self.all_aa = set(utils.ltranslate(''.join(codon)) for codon in itertools.product(codon_alphabet, repeat=3))

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """ Return the tuple of values in <info> for the columns in <deps>, where '??_insertion' columns contribute their length rather than their value. """
        values = []
        for name in deps:
            is_insertion_length = name[2:] == '_insertion'
            if not is_insertion_length:  # the only insertion-related things we know how to index by are the lengths
                assert 'insertion' not in name
                assert 'content' not in name
            values.append(len(info[name]) if is_insertion_length else info[name])
        return tuple(values)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        """ Add the annotation <info> to the counts: the family-wide parameters once, then the per-sequence ones for each entry in info['seqs']. """
        self.increment_per_family_params(info)
        for iseq, _ in enumerate(info['seqs']):
            self.increment_per_sequence_params(info, iseq)

    # ----------------------------------------------------------------------------------------
    def increment_per_sequence_params(self, info, iseq):
        """ count the things that are different for each sequence in the family: mutations, and nucleotide and amino acid content """
        self.mute_total += 1
        self.mfreqer.increment(info, iseq)

        seq = info['seqs'][iseq]
        for nuke in utils.nukes:
            self.counts['seq_content'][nuke] += seq.count(nuke)

        # to get the amino acid content, first pad for any v 5' deletion and strip any fv insertion, then pad the end to a multiple of three
        padded_seq = seq
        n_v_5p_del = info['v_5p_del']
        if n_v_5p_del > 0:
            padded_seq = n_v_5p_del * utils.ambig_base + padded_seq
        n_fv_insert = len(info['fv_insertion'])
        if n_fv_insert > 0:
            padded_seq = padded_seq[n_fv_insert:]
        n_leftover = len(padded_seq) % 3
        if n_leftover != 0:
            padded_seq += utils.ambig_base * (3 - n_leftover)  # utils.pad_nuc_seq() could probably do this for us
        aa_seq = utils.ltranslate(padded_seq)
        for aa in self.all_aa:
            self.counts['seq_aa_content'][aa] += aa_seq.count(aa)

    # ----------------------------------------------------------------------------------------
    def increment_per_family_params(self, info):
        """ count the things that are shared by every sequence in the family: the rearrangement parameters, derived lengths, cluster size, and insertion base content """
        def sub_increment(column, index):
            column_counts = self.counts[column]
            column_counts[index] = column_counts.get(index, 0) + 1

        self.reco_total += 1

        # NOTE the cdr3_length here is only to make the index for the full rearrangement event unique, it has nothing to do with the aa_cdr3_length key that we use for plotting
        all_deps = tuple(utils.index_columns) + ('cdr3_length', )
        sub_increment('all', self.get_index(info, all_deps))

        for deps in utils.column_dependency_tuples:
            sub_increment(deps[0], self.get_index(info, deps))  # the first entry is the column itself, the rest are what it depends on

        # These aren't index columns (and we don't want them to be, since they're derived quantities), so they have to be
        # done by hand. The indices are one-element tuples just so they look like the index columns' indices.
        sub_increment('aa_cdr3_length', (info['cdr3_length'] / 3, ))
        sub_increment('non_vj_length', (utils.get_non_vj_len(info), ))
        sub_increment('cluster_size', (len(info['unique_ids']), ))

        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                if nuke != utils.ambig_base:  # ambiguous bases don't get counted
                    self.counts[bound + '_insertion_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def clean_plots(self, plotdir):
        """ Run utils.prep_dir() on (and ask the MuteFreqer to clean) each of the directories that plot() writes to under <plotdir>. """
        self.mfreqer.clean_plots(plotdir + '/mute-freqs')
        utils.prep_dir(plotdir + '/overall', wildlings=('*.csv', '*.svg'))
        for column in self.counts:
            if column not in self.columns_to_subset_by_gene:
                continue
            utils.prep_dir(plotdir + '/' + column, wildlings=['*.csv', '*.svg'])

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, only_csv=False, only_overall=False, make_per_base_plots=False):  # NOTE most of the time in here is taken up by mutefrequer.finalize() (if write() wasn't called first, that is)
        """ Histogram every column's counts into <plotdir>/overall, and (unless <only_overall>) the by-gene columns into <plotdir>/<column>.

        plotdir: base plot directory (the mutation frequency plots go in its 'mute-freqs' subdirectory)
        only_csv: passed on to the plotting functions; also skips the html index pages
        only_overall: if set, skip the plots for each individual gene (also passed to the MuteFreqer)
        make_per_base_plots: passed straight to the MuteFreqer
        """
        import plotting
        print('  plotting parameters in %s' % plotdir),  # trailing comma, i.e. stay on this line so the elapsed time ends up on it
        sys.stdout.flush()
        start = time.time()

        self.clean_plots(plotdir)

        self.mfreqer.plot(plotdir + '/mute-freqs', only_csv=only_csv, only_overall=only_overall, make_per_base_plots=make_per_base_plots)

        overall_plotdir = plotdir + '/overall'

        for column in self.counts:
            if column == 'all':
                continue
            split_by_gene = column in self.columns_to_subset_by_gene

            # sum the counts over everything but the column's own value (and also separately for each gene, if we're splitting by gene)
            values, gene_values = {}, {}
            for index, count in self.counts[column].iteritems():
                column_val = index[0]
                values[column_val] = values.get(column_val, 0.0) + count

                if split_by_gene:
                    gene = index[1]  # NOTE hackey: relies on the gene being the first thing that each of these columns depends on, but it'll fail obviously if the correlations ever change
                    utils.split_gene(gene)  # checks validity of gene
                    this_gene_values = gene_values.setdefault(gene, {})
                    this_gene_values[column_val] = this_gene_values.get(column_val, 0.0) + count

            var_type = 'int'
            if column in self.string_columns:
                var_type = 'string'

            stats = None
            if column in self.mean_columns:
                stats = 'mean'
            overall_hist = hutils.make_hist_from_dict_of_counts(values, var_type, column)
            plotting.draw_no_root(overall_hist, plotname=column, plotdir=overall_plotdir,
                                  xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column),
                                  errors=True, write_csv=True, only_csv=only_csv, stats=stats, normalize=True)

            if split_by_gene and not only_overall:
                thisplotdir = plotdir + '/' + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    gene_hist = hutils.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname)
                    plotting.draw_no_root(gene_hist, plotname=plotname, plotdir=thisplotdir,
                                          xtitle=plotconfig.plot_titles.get(column, column), plottitle=gene,
                                          errors=True, write_csv=True, only_csv=only_csv)
                if not only_csv:
                    plotting.make_html(thisplotdir)

        if not only_csv:
            plotting.make_html(overall_plotdir)

        print('(%.1f sec)' % (time.time() - start))

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):  # NOTE most of the time in here is taken up by mutefrequer.finalize() (if plot() wasn't called first, that is)
        """ Write the mutation frequencies, the germline info for the genes that we saw, and a csv of counts for each column to <base_outdir>.

        Columns in self.no_write_columns are skipped. Any old germline files that are there get removed first.
        """
        print('    writing parameters to %s' % base_outdir),  # trailing comma, i.e. stay on this line so the elapsed time ends up on it
        sys.stdout.flush()
        start = time.time()

        glfo_outdir = base_outdir + '/' + glutils.glfo_dir
        if os.path.exists(glfo_outdir):
            existing_loci = [locus for locus in utils.loci if os.path.exists(glfo_outdir + '/' + locus)]
            for tmploc in existing_loci:
                glutils.remove_glfo_files(glfo_outdir, tmploc, print_warning=False)
        # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary
        utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', glutils.glfo_dir), wildlings=('*.csv', '*.yaml', '*.fasta'))

        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION gets replaced by each region

        genes_with_counts = []
        for region in utils.regions:
            genes_with_counts.extend(gene_index[0] for gene_index in self.counts[region + '_gene'].keys())
        glutils.write_glfo(glfo_outdir, self.glfo, only_genes=genes_with_counts, debug=False)

        for column in self.counts:
            if column in self.no_write_columns:
                continue

            # <index> is the list of csv header columns that correspond to the entries in each key in this column's counts
            if column == 'all':
                index = tuple(utils.index_columns) + ('cdr3_length', )
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column or column == 'cluster_size':
                index = [column]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)

            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)

            with open(outfname, 'w') as outfile:
                writer = csv.DictWriter(outfile, list(index) + ['count'])
                writer.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {index[ikey]: key_val for ikey, key_val in enumerate(key)}
                    line['count'] = count
                    writer.writerow(line)

        print('(%.1f sec)' % (time.time() - start))
Example #12
0
class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length,
    insertion (length and base content), and mutation """
    def __init__(self, glfo, args):
        """ Make an empty counter for each column, plus the MuteFreqer that we hand the mutation counting to.

        glfo: germline info, passed to MuteFreqer here and to glutils.write_glfo() in write()
        args: command line args (stored, but not otherwise used in this class)
        """
        self.glfo = glfo
        self.args = args
        self.mfreqer = MuteFreqer(self.glfo)
        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences
        self.counts = {}  # self.counts[column] maps each index that we've seen to the number of times that we saw it
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {n : 0 for n in utils.nukes}  # base content of each insertion
        self.counts['seq_content'] = {n : 0 for n in utils.nukes}

    # ----------------------------------------------------------------------------------------
    @staticmethod
    def _subsettable_by_gene(column):
        """ Return True if <column> is one of those (deletions, and the real vd/dj insertions) that we can plot separately for each gene. """
        return '_del' in column or column in ('vd_insertion', 'dj_insertion')

    # ----------------------------------------------------------------------------------------
    @staticmethod
    def _get_var_type(column_val):
        """ Return 'int' if <column_val> can be converted to an integer, otherwise 'string'. """
        try:
            int(column_val)
        except (ValueError, TypeError):  # only what a failed conversion raises (this used to be a bare except, which also ate KeyboardInterrupt and SystemExit)
            return 'string'
        return 'int'

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """ Return the tuple of values in <info> for the columns in <deps>, where '??_insertion' columns contribute their length rather than their value. """
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment_all_params(self, info):
        """ Add the single-sequence annotation <info> to both the per-sequence and the per-family counts. """
        self.increment_per_sequence_params(info)
        self.increment_per_family_params(info)

    # ----------------------------------------------------------------------------------------
    def increment_per_sequence_params(self, info):
        """ increment parameters that differ for each sequence within the clonal family """
        self.mute_total += 1
        self.mfreqer.increment(info)
        for nuke in info['seq']:
            if nuke in utils.ambiguous_bases:  # ambiguous bases don't get counted
                continue
            self.counts['seq_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def increment_per_family_params(self, info):
        """ increment parameters that are the same for the entire clonal family """
        self.reco_total += 1

        all_index = self.get_index(info, tuple(list(utils.index_columns) + ['cdr3_length', ]))
        self.counts['all'][all_index] = self.counts['all'].get(all_index, 0) + 1

        for deps in utils.column_dependency_tuples:
            column = deps[0]  # the first entry is the column itself, the rest are what it depends on
            index = self.get_index(info, deps)
            self.counts[column][index] = self.counts[column].get(index, 0) + 1

        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                if nuke in utils.ambiguous_bases:  # ambiguous bases don't get counted
                    continue
                self.counts[bound + '_insertion_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def clean_plots(self, plotdir, subset_by_gene):
        """ Run utils.prep_dir() on (and ask the MuteFreqer to clean) each of the directories that plot() writes to under <plotdir>. """
        self.mfreqer.clean_plots(plotdir + '/mute-freqs')
        utils.prep_dir(plotdir + '/overall')
        for column in self.counts:
            if subset_by_gene and self._subsettable_by_gene(column):  # option to subset deletion and (real) insertion plots by gene
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg'])

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None, only_csv=False):
        """ Histogram every column's counts into <plotdir>/overall, and (if <subset_by_gene>) the deletion and vd/dj insertion columns for each gene into <plotdir>/<column>.

        plotdir: base plot directory (the mutation frequency plots go in its 'mute-freqs' subdirectory)
        subset_by_gene: also make separate deletion and (real) insertion plots for each gene
        cyst_positions, tryp_positions: passed straight to the MuteFreqer
        only_csv: passed on to the plotting functions; also skips the html index pages

        Raises an Exception if any column (other than 'all') has no counts.
        """
        print('  plotting parameters'),  # trailing comma, i.e. stay on this line so the elapsed time ends up on it
        sys.stdout.flush()
        start = time.time()

        self.clean_plots(plotdir, subset_by_gene)

        self.mfreqer.plot(plotdir + '/mute-freqs', cyst_positions, tryp_positions, only_csv=only_csv)

        overall_plotdir = plotdir + '/overall'

        for column in self.counts:
            if column == 'all':
                continue
            if len(self.counts[column]) == 0:
                raise Exception('no counts in %s' % column)
            subset_this_column = subset_by_gene and self._subsettable_by_gene(column)  # option to subset deletion and (real) insertion plots by gene

            values, gene_values = {}, {}
            for index, count in self.counts[column].iteritems():
                column_val = index[0]

                if subset_this_column:
                    region = column[0] if '_del' in column else column[1]  # deletions start with their own region, whereas for insertions we use the second letter of the boundary
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works fine now and will fail obviously if the correlations are ever changed to be incompatible
                    gene = index[1]
                    this_gene_values = gene_values.setdefault(gene, {})
                    this_gene_values[column_val] = this_gene_values.get(column_val, 0.0) + count

                values[column_val] = values.get(column_val, 0.0) + count

                var_type = self._get_var_type(column_val)  # only used outside this loop when we make the plots, i.e. whichever value we happen to look at last decides for the whole column

            if subset_this_column:
                thisplotdir = plotdir + '/' + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True, only_csv=only_csv)
                if not only_csv:
                    plotting.make_html(thisplotdir)

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw_no_root(hist, plotname=plotname, plotdir=overall_plotdir, errors=True, write_csv=True, only_csv=only_csv)

        if not only_csv:
            plotting.make_html(overall_plotdir)

        print('(%.1f sec)' % (time.time()-start))

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir, my_datadir=None):
        """ Write the mutation frequencies, the germline info for the genes that we saw, and a csv of counts for each column to <base_outdir>.

        base_outdir: parameter directory to write to
        my_datadir: not used (it's only still here so that existing callers don't break)
        """
        print('    writing parameters'),  # trailing comma, i.e. stay on this line so the elapsed time ends up on it
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION gets replaced by each region
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=True)

        for column in self.counts:
            # <index> is the list of csv header columns that correspond to the entries in each key in this column's counts
            if column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length', ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column,]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column,] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {index[ic]: key_val for ic, key_val in enumerate(key)}
                    line['count'] = count
                    out_data.writerow(line)

        print('(%.1f sec)' % (time.time()-start))
class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length,
    insertion (length and base content), and mutation """
    def __init__(self, germline_seqs):
        """ Make an empty counter for each column, plus the MuteFreqer (which is what <germline_seqs> gets passed to). """
        self.total = 0  # number of calls to increment()
        self.counts = {}  # self.counts[column] maps each index that we've seen to the number of times that we saw it
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}  # base content of each insertion
        self.counts['seq_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
        self.mutefreqer = MuteFreqer(germline_seqs)

    # ----------------------------------------------------------------------------------------
    @staticmethod
    def _subsettable_by_gene(column):
        """ Return True if <column> is one of those (deletions, and the real vd/dj insertions) that we can plot separately for each gene. """
        return '_del' in column or column in ('vd_insertion', 'dj_insertion')

    # ----------------------------------------------------------------------------------------
    @staticmethod
    def _get_index_and_outfname(base_outdir, column):
        """ Return the csv header columns (apart from 'count') and the output file path that write() uses for <column>. """
        if column == 'all':
            return utils.index_columns, base_outdir + '/' + utils.get_parameter_fname(column='all')
        if '_content' in column:  # these don't depend on anything, and get a file name that's just the column
            return [column, ], base_outdir + '/' + column + '.csv'
        index = [column, ] + utils.column_dependencies[column]
        return index, base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)

    # ----------------------------------------------------------------------------------------
    def clean(self, base_outdir=None):
        """ remove all the parameter files

        base_outdir: the directory that they were written to. If it isn't set we fall back on self.base_outdir, but
          NOTE that nothing in this class sets that attribute (which is why this used to crash whenever it was called).

        The file names are worked out the same way as in write(), so the *_content files are now handled as well (they
        used to be looked up in utils.column_dependencies, the same as the columns that actually have dependencies).
        """
        self.mutefreqer.clean()
        if base_outdir is None:
            base_outdir = self.base_outdir
        for column in self.counts:
            _, outfname = self._get_index_and_outfname(base_outdir, column)
            os.remove(outfname)

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """ Return the tuple of values in <info> for the columns in <deps>, where '??_insertion' columns contribute their length rather than their value. """
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        """ Add the single-sequence annotation <info> to all of the counts, including the MuteFreqer's. """
        self.total += 1

        all_index = self.get_index(info, utils.index_columns)
        self.counts['all'][all_index] = self.counts['all'].get(all_index, 0) + 1

        for deps in utils.column_dependency_tuples:
            column = deps[0]  # the first entry is the column itself, the rest are what it depends on
            index = self.get_index(info, deps)
            self.counts[column][index] = self.counts[column].get(index, 0) + 1

        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                self.counts[bound + '_insertion_content'][nuke] += 1
        for nuke in info['seq']:
            self.counts['seq_content'][nuke] += 1

        self.mutefreqer.increment(info)

    # ----------------------------------------------------------------------------------------
    def __str__(self):
        """ Return a table of counts and fractions for each column that has an entry in utils.column_dependencies ('all' and the *_content columns are left out). """
        return_str = []
        print('hm I think I was too lazy to put \'all\' in this string')
        print('  or [vdj]_insertion_content or seq_content')
        for column in self.counts:
            if column not in utils.column_dependencies:  # i.e. 'all' and the content columns: we have no header info for them, so they used to crash the lookup below
                continue
            return_str.append('%s\n' % column)
            return_str.append('%20s' % column)
            for dep in utils.column_dependencies[column]:
                return_str.append('%20s' % dep)
            return_str.append('\n')
            for index, count in self.counts[column].iteritems():
                for val in index:
                    return_str.append('%20s' % str(val))
                return_str.append('   %d / %d = %f\n' % (count, self.total, float(count) / self.total))
        return ''.join(return_str)

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
        """ Histogram every column's counts into <plotdir>, and (if <subset_by_gene>) the deletion and vd/dj insertion columns for each gene into <plotdir>/<column>.

        plotdir: base plot directory
        subset_by_gene: also make separate deletion and (real) insertion plots for each gene
        cyst_positions, tryp_positions: passed straight to the MuteFreqer

        Raises an AssertionError if any column (other than 'all') has no counts.
        """
        print('  plotting parameters')
        start = time.time()
        utils.prep_dir(plotdir + '/plots')
        for column in self.counts:
            if column == 'all':
                continue
            if len(self.counts[column]) == 0:
                raise AssertionError('ERROR no counts in %s' % column)  # explicit raise rather than 'assert False', so it still stops us when asserts are switched off
            subset_this_column = subset_by_gene and self._subsettable_by_gene(column)  # option to subset deletion and (real) insertion plots by gene

            values, gene_values = {}, {}
            for index, count in self.counts[column].iteritems():
                column_val = index[0]

                if subset_this_column:
                    region = column[0] if '_del' in column else column[1]  # deletions start with their own region, whereas for insertions we use the second letter of the boundary
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works fine now and will fail obviously if the correlations are ever changed to be incompatible
                    gene = index[1]
                    this_gene_values = gene_values.setdefault(gene, {})
                    this_gene_values[column_val] = this_gene_values.get(column_val, 0.0) + count

                values[column_val] = values.get(column_val, 0.0) + count

                try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots, i.e. the last value we look at decides for the whole column)
                    int(column_val)
                    var_type = 'int'
                except (ValueError, TypeError):  # only what a failed conversion raises (this used to be a bare except, which also ate KeyboardInterrupt and SystemExit)
                    var_type = 'string'

            if subset_this_column:
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
                # NOTE(review): unlike the calls at the end of this function, these two aren't inside an 'if has_root' -- not clear whether that's on purpose
                check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
                check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)

        self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)

        if has_root:
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up

        print('    parameter plot time: %.3f' % (time.time() - start))

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):
        """ Write the mutation frequencies and a csv of counts for each column to <base_outdir>. """
        print('  writing parameters')
        start = time.time()

        utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))
        mute_start = time.time()
        self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION gets replaced by each region
        print('      mut freq write time: %.3f' % (time.time() - mute_start))
        for column in self.counts:
            # <index> is the list of csv header columns that correspond to the entries in each key in this column's counts
            index, outfname = self._get_index_and_outfname(base_outdir, column)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {index[ic]: key_val for ic, key_val in enumerate(key)}
                    line['count'] = count
                    out_data.writerow(line)

        print('    parameter write time: %.3f' % (time.time() - start))
Example #14
0
class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length,
    insertion (length and base content), and mutation """
    def __init__(self, glfo, args):
        self.glfo = glfo
        self.args = args
        self.mfreqer = MuteFreqer(self.glfo,
                                  exclusions=args.region_end_exclusions)
        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences
        self.counts = {}
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        self.string_columns = set([r + '_gene' for r in utils.regions])
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {
                n: 0
                for n in utils.nukes
            }  # base content of each insertion
            self.string_columns.add(bound + '_insertion_content')
        self.counts['seq_content'] = {n: 0 for n in utils.nukes}
        self.string_columns.add('seq_content')

        self.columns_to_subset_by_gene = [
            e + '_del' for e in utils.all_erosions
        ] + [b + '_insertion' for b in utils.boundaries]

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        self.increment_per_family_params(info)
        for iseq in range(len(info['seqs'])):
            self.increment_per_sequence_params(info, iseq)

    # ----------------------------------------------------------------------------------------
    def increment_per_sequence_params(self, info, iseq):
        """ increment parameters that differ for each sequence within the clonal family """
        self.mute_total += 1
        self.mfreqer.increment(info, iseq)
        for nuke in info['seqs'][iseq]:
            if nuke in utils.ambiguous_bases:
                continue
            self.counts['seq_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def increment_per_family_params(self, info):
        """ increment parameters that are the same for the entire clonal family """
        self.reco_total += 1

        all_index = self.get_index(
            info, tuple(list(utils.index_columns) + [
                'cdr3_length',
            ]))
        if all_index not in self.counts['all']:
            self.counts['all'][all_index] = 0
        self.counts['all'][all_index] += 1

        for deps in utils.column_dependency_tuples:
            column = deps[0]
            index = self.get_index(info, deps)
            if index not in self.counts[column]:
                self.counts[column][index] = 0
            self.counts[column][index] += 1

        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                if nuke in utils.ambiguous_bases:
                    continue
                self.counts[bound + '_insertion_content'][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def clean_plots(self, plotdir):
        self.mfreqer.clean_plots(plotdir + '/mute-freqs')
        utils.prep_dir(plotdir + '/overall', wildlings=('*.csv', '*.svg'))
        for column in self.counts:
            if column in self.columns_to_subset_by_gene:
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg'])

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, only_csv=False, only_overall=False):
        import plotting
        print '  plotting parameters',
        sys.stdout.flush()
        start = time.time()

        self.clean_plots(plotdir)

        self.mfreqer.plot(plotdir + '/mute-freqs',
                          only_csv=only_csv,
                          only_overall=only_overall)

        overall_plotdir = plotdir + '/overall'

        for column in self.counts:
            if column == 'all':
                continue
            values, gene_values = {}, {}
            for index, count in self.counts[column].iteritems():
                column_val = index[0]

                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                if column in self.columns_to_subset_by_gene:
                    gene = index[
                        1]  # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it
                    utils.split_gene(gene)  # checks validity of gene
                    if gene not in gene_values:
                        gene_values[gene] = {}
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count

            var_type = 'string' if column in self.string_columns else 'int'

            hist = plotting.make_hist_from_dict_of_counts(values,
                                                          var_type,
                                                          column,
                                                          sort=True)
            plotting.draw_no_root(
                hist,
                plotname=column,
                plotdir=overall_plotdir,
                xtitle=plotconfig.xtitles.get(column, column),
                plottitle=plotconfig.plot_titles.get(column, column),
                errors=True,
                write_csv=True,
                only_csv=only_csv)

            if column in self.columns_to_subset_by_gene and not only_overall:
                thisplotdir = plotdir + '/' + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(
                        gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(hist,
                                          plotname=plotname,
                                          plotdir=thisplotdir,
                                          xtitle=plotconfig.plot_titles.get(
                                              column, column),
                                          plottitle=gene,
                                          errors=True,
                                          write_csv=True,
                                          only_csv=only_csv)
                if not only_csv:
                    plotting.make_html(thisplotdir)

        if not only_csv:
            plotting.make_html(overall_plotdir)

        print '(%.1f sec)' % (time.time() - start)

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):
        print '    writing parameters',
        sys.stdout.flush()
        start = time.time()

        if os.path.exists(base_outdir + '/' + glutils.glfo_dir):
            glutils.remove_glfo_files(
                base_outdir + '/' + glutils.glfo_dir, self.glfo['locus']
            )  # NOTE I think this will fail if I ever start having multiple loci in one dir
        utils.prep_dir(
            base_outdir,
            subdirs=('hmms', 'mute-freqs', glutils.glfo_dir),
            wildlings=('*.csv', '*.yaml', '*.fasta')
        )  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(
            base_outdir + '/mute-freqs',
            mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv'
        )  # REGION is replace by each region in the three output files)
        genes_with_counts = [
            g[0] for r in utils.regions
            for g in self.counts[r + '_gene'].keys()
        ]
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir,
                           self.glfo,
                           only_genes=genes_with_counts,
                           debug=False)

        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = tuple(list(utils.index_columns) + [
                    'cdr3_length',
                ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(
                    column='all')
            elif '_content' in column:
                index = [
                    column,
                ]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [
                    column,
                ] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(
                    column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with open(outfname, 'w') as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '(%.1f sec)' % (time.time() - start)
Example #15
0
class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length,
    insertion (length and base content), and mutation """

    def __init__(self, glfo, args):
        self.glfo = glfo
        self.args = args
        self.mfreqer = MuteFreqer(self.glfo)
        self.reco_total = 0  # total number of recombination events
        self.mute_total = 0  # total number of sequences
        self.counts = {}
        self.counts["all"] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        self.string_columns = set([r + "_gene" for r in utils.regions])
        for bound in utils.boundaries:
            self.counts[bound + "_insertion_content"] = {n: 0 for n in utils.nukes}  # base content of each insertion
            self.string_columns.add(bound + "_insertion_content")
        self.counts["seq_content"] = {n: 0 for n in utils.nukes}
        self.string_columns.add("seq_content")

        self.columns_to_subset_by_gene = [e + "_del" for e in utils.real_erosions + utils.effective_erosions] + [
            b + "_insertion" for b in utils.boundaries
        ]

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        index = []
        for ic in deps:
            if ic[2:] == "_insertion":  # insertion length
                index.append(len(info[ic]))
            else:
                assert "insertion" not in ic
                assert "content" not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        self.increment_per_family_params(info)
        for iseq in range(len(info["seqs"])):
            self.increment_per_sequence_params(info, iseq)

    # ----------------------------------------------------------------------------------------
    def increment_per_sequence_params(self, info, iseq):
        """ increment parameters that differ for each sequence within the clonal family """
        self.mute_total += 1
        self.mfreqer.increment(info, iseq)
        for nuke in info["seqs"][iseq]:
            if nuke in utils.ambiguous_bases:
                continue
            self.counts["seq_content"][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def increment_per_family_params(self, info):
        """ increment parameters that are the same for the entire clonal family """
        self.reco_total += 1

        all_index = self.get_index(info, tuple(list(utils.index_columns) + ["cdr3_length"]))
        if all_index not in self.counts["all"]:
            self.counts["all"][all_index] = 0
        self.counts["all"][all_index] += 1

        for deps in utils.column_dependency_tuples:
            column = deps[0]
            index = self.get_index(info, deps)
            if index not in self.counts[column]:
                self.counts[column][index] = 0
            self.counts[column][index] += 1

        for bound in utils.boundaries:
            for nuke in info[bound + "_insertion"]:
                if nuke in utils.ambiguous_bases:
                    continue
                self.counts[bound + "_insertion_content"][nuke] += 1

    # ----------------------------------------------------------------------------------------
    def clean_plots(self, plotdir):
        self.mfreqer.clean_plots(plotdir + "/mute-freqs")
        utils.prep_dir(plotdir + "/overall")  # , multilings=('*.csv', '*.svg'))
        for column in self.counts:
            if column in self.columns_to_subset_by_gene:
                thisplotdir = plotdir + "/" + column
                utils.prep_dir(thisplotdir, wildlings=["*.csv", "*.svg"])

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, only_csv=False, only_overall=False):
        print "  plotting parameters",
        sys.stdout.flush()
        start = time.time()

        self.clean_plots(plotdir)

        self.mfreqer.plot(plotdir + "/mute-freqs", only_csv=only_csv, only_overall=only_overall)

        overall_plotdir = plotdir + "/overall"

        for column in self.counts:
            if column == "all":
                continue
            values, gene_values = {}, {}
            for index, count in self.counts[column].iteritems():
                column_val = index[0]

                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                if column in self.columns_to_subset_by_gene:
                    gene = index[
                        1
                    ]  # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it
                    utils.split_gene(gene)  # checks validity of gene
                    if gene not in gene_values:
                        gene_values[gene] = {}
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count

            var_type = "string" if column in self.string_columns else "int"

            hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True)
            plotting.draw_no_root(
                hist,
                plotname=column,
                plotdir=overall_plotdir,
                xtitle=plotconfig.xtitles.get(column, column),
                plottitle=plotconfig.plot_titles.get(column, column),
                errors=True,
                write_csv=True,
                only_csv=only_csv,
            )

            if column in self.columns_to_subset_by_gene and not only_overall:
                thisplotdir = plotdir + "/" + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + "-" + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(
                        hist,
                        plotname=plotname,
                        plotdir=thisplotdir,
                        xtitle=plotconfig.plot_titles.get(column, column),
                        plottitle=gene,
                        errors=True,
                        write_csv=True,
                        only_csv=only_csv,
                    )
                if not only_csv:
                    plotting.make_html(thisplotdir)

        if not only_csv:
            plotting.make_html(overall_plotdir)

        print "(%.1f sec)" % (time.time() - start)

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):
        print "    writing parameters",
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(
            base_outdir, subdirs=("hmms", "mute-freqs", "germline-sets"), wildlings=("*.csv", "*.yaml", "*.fasta")
        )  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(
            base_outdir + "/mute-freqs", mean_freq_outfname=base_outdir + "/REGION-mean-mute-freqs.csv"
        )  # REGION is replace by each region in the three output files)
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + "_gene"].keys()]
        glutils.write_glfo(base_outdir + "/" + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)

        for column in self.counts:
            index = None
            outfname = None
            if column == "all":
                index = tuple(list(utils.index_columns) + ["cdr3_length"])
                outfname = base_outdir + "/" + utils.get_parameter_fname(column="all")
            elif "_content" in column:
                index = [column]
                outfname = base_outdir + "/" + column + ".csv"
            else:
                index = [column] + utils.column_dependencies[column]
                outfname = base_outdir + "/" + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener("w")(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append("count")
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line["count"] = count
                    out_data.writerow(line)

        print "(%.1f sec)" % (time.time() - start)
class ParameterCounter(object):
    """ class to keep track of how many times we've seen each gene version, erosion length,
    insertion (length and base content), and mutation

    Output locations are not stored by __init__: pass them to write(), plot() and clean().
    """
    def __init__(self, germline_seqs):
        """ Set up the empty count tables, and the MuteFreqer (which is handed <germline_seqs>). """
        self.total = 0  # number of calls to increment()
        self.counts = {}
        self.counts['all'] = {}
        for column in utils.column_dependencies:
            self.counts[column] = {}
        for bound in utils.boundaries:
            self.counts[bound + '_insertion_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}  # base content of each insertion
        self.counts['seq_content'] = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
        self.mutefreqer = MuteFreqer(germline_seqs)

    # ----------------------------------------------------------------------------------------
    def _index_and_fname(self, base_outdir, column):
        """ Return (index column names, csv path under <base_outdir>) for the count table <column>.
        Shared by write() and clean() so that they always agree on the file names. """
        if column == 'all':
            index = utils.index_columns
            fname = utils.get_parameter_fname(column='all')
        elif '_content' in column:  # base content tables have no dependencies, and get a plain <column>.csv
            index = [column]
            fname = column + '.csv'
        else:
            index = [column] + utils.column_dependencies[column]
            fname = utils.get_parameter_fname(column_and_deps=index)
        return index, base_outdir + '/' + fname

    # ----------------------------------------------------------------------------------------
    @staticmethod
    def _var_type(column_val):
        """ Return 'int' if int() accepts <column_val>, otherwise 'string'. """
        try:
            int(column_val)
        except (TypeError, ValueError, OverflowError):  # NOT a bare except: that would also swallow KeyboardInterrupt/SystemExit
            return 'string'
        return 'int'

    # ----------------------------------------------------------------------------------------
    def clean(self, base_outdir=None):
        """ remove all the parameter files

        <base_outdir> is the directory that was passed to write(). If it is None we fall back to
        self.base_outdir, which __init__ does not set, so unless the caller has set that attribute
        this raises AttributeError (as the argument-less call always did).
        os.remove() raises OSError for any parameter file that does not exist.
        """
        if base_outdir is None:
            base_outdir = self.base_outdir
        self.mutefreqer.clean()
        for column in self.counts:
            _, outfname = self._index_and_fname(base_outdir, column)
            os.remove(outfname)

    # ----------------------------------------------------------------------------------------
    def get_index(self, info, deps):
        """ Return a tuple with one entry per name in <deps>: len(info[name]) for names of the
        form XX_insertion (i.e. the insertion length), and info[name] itself for anything else.
        Any other name containing 'insertion' or 'content' trips an assertion. """
        index = []
        for ic in deps:
            if ic[2:] == '_insertion':  # insertion length
                index.append(len(info[ic]))
            else:
                assert 'insertion' not in ic
                assert 'content' not in ic
                index.append(info[ic])
        return tuple(index)

    # ----------------------------------------------------------------------------------------
    def increment(self, info):
        """ Add the single event described by <info> to every count table and to the MuteFreqer.
        NOTE a base other than A, C, G or T in an insertion or in info['seq'] raises KeyError. """
        self.total += 1

        all_index = self.get_index(info, utils.index_columns)
        all_counts = self.counts['all']
        all_counts[all_index] = all_counts.get(all_index, 0) + 1

        for deps in utils.column_dependency_tuples:
            column_counts = self.counts[deps[0]]
            index = self.get_index(info, deps)
            column_counts[index] = column_counts.get(index, 0) + 1

        for bound in utils.boundaries:
            for nuke in info[bound + '_insertion']:
                self.counts[bound + '_insertion_content'][nuke] += 1
        for nuke in info['seq']:
            self.counts['seq_content'][nuke] += 1

        self.mutefreqer.increment(info)

    # ----------------------------------------------------------------------------------------
    def __str__(self):
        """ Return one text table per column in utils.column_dependencies, with a row per index
        giving count / total. The 'all' and *_content tables are left out (and a notice saying
        so is printed, as before). """
        return_str = []
        print('hm I think I was too lazy to put \'all\' in this string')
        print('  or [vdj]_insertion_content or seq_content')
        for column in self.counts:
            if column not in utils.column_dependencies:  # i.e. 'all' and the content tables, which have no dependency list to use as a header
                continue
            return_str.append('%s\n' % column)
            return_str.append('%20s' % column)
            for dep in utils.column_dependencies[column]:
                return_str.append('%20s' % dep)
            return_str.append('\n')
            for index, count in self.counts[column].items():
                for val in index:
                    return_str.append('%20s' % str(val))
                return_str.append('   %d / %d = %f\n' % (count, self.total, float(count) / self.total))
        return ''.join(return_str)

    # ----------------------------------------------------------------------------------------
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
        """ Histogram every count table except 'all' into <plotdir>, then plot the mutation
        frequencies (<cyst_positions> and <tryp_positions> are passed straight to the MuteFreqer).

        If <subset_by_gene>, the deletion columns and vd/dj insertion columns also get one
        histogram per gene, in <plotdir>/<column>.
        Raises AssertionError if any table other than 'all' is empty.
        """
        print('  plotting parameters')
        start = time.time()
        utils.prep_dir(plotdir + '/plots')
        for column in self.counts:
            if column == 'all':
                continue
            if not self.counts[column]:
                print('ERROR no counts in %s' % column)
                raise AssertionError('no counts in %s' % column)  # explicit raise, so this still fires under python -O

            # option to subset deletion and (real) insertion plots by gene
            split_by_gene = subset_by_gene and ('_del' in column or column in ('vd_insertion', 'dj_insertion'))

            values, gene_values = {}, {}
            for index, count in self.counts[column].items():
                column_val = index[0]
                if split_by_gene:
                    region = column[0] if '_del' in column else column[1]
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[1]  # NOTE this is hacky: it relies on the gene being the first dependency,
                    gene = index[1]                            #   but it fails obviously if the correlations ever change
                    this_gene_values = gene_values.setdefault(gene, {})
                    this_gene_values[column_val] = this_gene_values.get(column_val, 0.0) + count
                values[column_val] = values.get(column_val, 0.0) + count
                var_type = self._var_type(column_val)  # only used below, so the last value in the table decides

            if split_by_gene:
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
                check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
                check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)

        self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)

        if has_root:
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up

        print('    parameter plot time: %.3f' % (time.time()-start))

    # ----------------------------------------------------------------------------------------
    def write(self, base_outdir):
        """ Write the mutation frequencies, and one csv per count table (index columns plus a
        'count' column), into <base_outdir>. """
        print('  writing parameters')
        start = time.time()

        utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))
        mute_start = time.time()
        # REGION is replaced by each region in the three output files
        self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')
        print('      mut freq write time: %.3f' % (time.time() - mute_start))
        for column in self.counts:
            index, outfname = self._index_and_fname(base_outdir, column)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].items():
                    line = {}
                    for ic, val in enumerate(key):
                        line[index[ic]] = val
                    line['count'] = count
                    out_data.writerow(line)

        print('    parameter write time: %.3f' % (time.time()-start))