Example #1
0
    def read_mute_freqs(self, mute_freq_dir):
        # NOTE these are mute freqs, not branch lengths, but it's ok for now
        for mtype in ['all',] + utils.regions:
            infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
            self.branch_lengths[mtype] = {}
            self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype]['probs'] = [], []
            mutehist = plotting.make_hist_from_bin_entry_file(infname, mtype+'-mute-freqs')
            self.branch_lengths[mtype]['mean'] = mutehist.GetMean()

            if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(mutehist.GetNbinsX()+1) > 0.0:
                print 'WARNING nonzero under/overflow bins read from %s' % infname

            check_sum = 0.0
            for ibin in range(1, mutehist.GetNbinsX()+1):  # ignore under/overflow bins
                freq = mutehist.GetBinCenter(ibin)
                branch_length = float(freq)
                prob = mutehist.GetBinContent(ibin)
                self.branch_lengths[mtype]['lengths'].append(branch_length)
                self.branch_lengths[mtype]['probs'].append(prob)
                check_sum += self.branch_lengths[mtype]['probs'][-1]
            assert utils.is_normed(check_sum)

        if self.args.debug:
            print '  mean branch lengths'
            for mtype in ['all',] + utils.regions:
                print '     %4s %7.3f (ratio %7.3f)' % (mtype, self.branch_lengths[mtype]['mean'], self.branch_lengths[mtype]['mean'] / self.branch_lengths['all']['mean'])
    def read_mute_freqs(self, mute_freq_dir):
        # NOTE these are mute freqs, not branch lengths, but it's ok for now
        for mtype in [
                'all',
        ] + utils.regions:
            infname = mute_freq_dir + '/' + mtype + '-mean-mute-freqs.csv'
            self.branch_lengths[mtype] = {}
            self.branch_lengths[mtype]['lengths'], self.branch_lengths[mtype][
                'probs'] = [], []
            mutehist = plotting.make_hist_from_bin_entry_file(
                infname, mtype + '-mute-freqs')
            self.branch_lengths[mtype]['mean'] = mutehist.GetMean()

            if mutehist.GetBinContent(0) > 0.0 or mutehist.GetBinContent(
                    mutehist.GetNbinsX() + 1) > 0.0:
                print 'WARNING nonzero under/overflow bins read from %s' % infname

            check_sum = 0.0
            for ibin in range(1,
                              mutehist.GetNbinsX() +
                              1):  # ignore under/overflow bins
                freq = mutehist.GetBinCenter(ibin)
                branch_length = float(freq)
                prob = mutehist.GetBinContent(ibin)
                self.branch_lengths[mtype]['lengths'].append(branch_length)
                self.branch_lengths[mtype]['probs'].append(prob)
                check_sum += self.branch_lengths[mtype]['probs'][-1]
            assert utils.is_normed(check_sum)

        if self.args.debug:
            print '  mean branch lengths'
            for mtype in [
                    'all',
            ] + utils.regions:
                print '     %4s %7.3f (ratio %7.3f)' % (
                    mtype, self.branch_lengths[mtype]['mean'],
                    self.branch_lengths[mtype]['mean'] /
                    self.branch_lengths['all']['mean'])
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seqs, args, cyst_positions, tryp_positions):
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = germline_seqs  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.cyst_positions = cyst_positions
        self.tryp_positions = tryp_positions

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.naivety = naivety
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            if not self.args.dont_allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if not self.args.dont_allow_unphysical_insertions:
                self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}
        self.insertion_content_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        mean_freq_hist = plotting.make_hist_from_bin_entry_file(self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = mean_freq_hist.GetMean()