Example #1
0
    def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None):
        """ Write mutation-frequency plots under <base_plotdir>/mute-freqs.

        One ROOT histogram per gene (position vs mute freq, with symmetrized error bars),
        a per-base plot per gene, and overall/per-region mean-freq summary hists.
        <cyst_positions>/<tryp_positions>: optional per-gene conserved-codon positions; when
        given, a vertical line is drawn at that position on v/j gene plots.
        """
        if not self.finalized:
            self.finalize()

        plotdir = base_plotdir + '/mute-freqs'
        utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
        for region in utils.regions:
            utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
            utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

        for gene in self.counts:
            counts, plotting_info = self.counts[gene], self.plotting_info[gene]
            sorted_positions = sorted(counts)
            # one bin per position, spanning the full observed position range
            hist = TH1D('hist_' + utils.sanitize_name(gene), '',
                        sorted_positions[-1] - sorted_positions[0] + 1,
                        sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5)
            for position in sorted_positions:
                hist.SetBinContent(hist.FindBin(position), counts[position]['freq'])
                hi_diff = abs(counts[position]['freq'] - counts[position]['freq_hi_err'])
                lo_diff = abs(counts[position]['freq'] - counts[position]['freq_lo_err'])
                err = 0.5*(hi_diff + lo_diff)  # ROOT wants one symmetric error, so average the up/down distances
                hist.SetBinError(hist.FindBin(position), err)
            # NOTE removed unused local <plotfname> (plotting.draw builds the output path itself)
            xline = None  # conserved-codon position to mark with a vertical line, if known
            if utils.get_region(gene) == 'v' and cyst_positions is not None:
                xline = cyst_positions[gene]['cysteine-position']
            elif utils.get_region(gene) == 'j' and tryp_positions is not None:
                xline = int(tryp_positions[gene])
            plotting.draw(hist, 'int', plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, draw_str='e')  #, cwidth=4000, cheight=1000)
            paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)

        # make mean mute freq hists
        hist = plotting.make_hist_from_my_hist_class(self.mean_rates['all'], 'all-mean-freq')
        plotting.draw(hist, 'float', plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
        for region in utils.regions:
            hist = plotting.make_hist_from_my_hist_class(self.mean_rates[region], region+'-mean-freq')
            plotting.draw(hist, 'float', plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
        check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])

        # then write html file and fix permissiions
        for region in utils.regions:
            check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
            check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
        check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
        print '  plotting parameters'
        start = time.time()
        utils.prep_dir(plotdir + '/plots')  #, multilings=('*.csv', '*.svg'))
        for column in self.counts:
            if column == 'all':
                continue
            values, gene_values = {}, {}
            if len(self.counts[column]) == 0:
                print 'ERROR no counts in %s' % column
                assert False
            for index, count in self.counts[column].iteritems():
                gene = None
                if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                    if '_del' in column:
                        region = column[0]
                    else:
                        region = column[1]
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works find now and will fail obviously
                    gene = index[1]                            #   if I ever change the correlations to be incompatible. so screw it
                    if gene not in gene_values:
                        gene_values[gene] = {}

                column_val = index[0]
                if gene is not None:
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                    int(column_val)
                    var_type = 'int'
                except:
                    var_type = 'string'

            if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
                check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
                check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)

        self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)  #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files

        if has_root:
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up

        print '    parameter plot time: %.3f' % (time.time()-start)
Example #3
0
    def write(self, base_outdir, mean_freq_outfname):
        """ Dump per-gene, per-position mutation info to csvs under <base_outdir>/mute-freqs,
        then write the overall and per-region mean-rate hists to <mean_freq_outfname>
        (with 'REGION' substituted by 'all' and each region name).
        """
        if not self.finalized:
            self.finalize()

        outdir = base_outdir + '/mute-freqs'
        utils.prep_dir(outdir, '*.csv')

        for gene in self.counts:
            counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
            with opener('w')(outdir + '/' + utils.sanitize_name(gene) + '.csv') as outfile:
                # per-nucleotide columns: freq plus its lower/upper uncertainty bounds
                per_nuke_columns = [nuke + suffix for nuke in utils.nukes for suffix in ('', '_lo_err', '_hi_err')]
                writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(per_nuke_columns))
                writer.writeheader()
                for position in sorted(counts):
                    row = {'position' : position,
                           'mute_freq' : counts[position]['freq'],
                           'lo_err' : counts[position]['freq_lo_err'],
                           'hi_err' : counts[position]['freq_hi_err']}
                    for nuke in utils.nukes:
                        for suffix in ('', '_lo_err', '_hi_err'):
                            row[nuke + suffix] = freqs[position][nuke + suffix]
                    writer.writerow(row)

        assert 'REGION' in mean_freq_outfname
        self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
        for region in utils.regions:
            self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
Example #4
0
    def write(self, outdir, mean_freq_outfname):
        """ Write one csv per gene to <outdir> (per-position mute freq, its bounds, and per-nucleotide
        freq/observed-count/bounds), then the overall and per-region mean-rate hists to
        <mean_freq_outfname> with 'REGION' substituted by 'all' and each region name.
        """
        if not self.finalized:
            self.finalize()

        for gene in self.counts:
            gcounts, freqs = self.counts[gene], self.freqs[gene]
            outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
            with opener('w')(outfname) as outfile:
                # per-nuke columns: freq, observed count, lower bound, upper bound (in that order)
                nuke_header = [n + xtra for n in utils.nukes for xtra in ('', '_obs', '_lo_err', '_hi_err')]
                writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_header))
                writer.writeheader()
                for position in sorted(gcounts.keys()):
                    row = {'position':position,
                           'mute_freq':freqs[position]['freq'],
                           'lo_err':freqs[position]['freq_lo_err'],
                           'hi_err':freqs[position]['freq_hi_err']}
                    for nuke in utils.nukes:
                        row[nuke] = freqs[position][nuke]
                        row[nuke + '_obs'] = gcounts[position][nuke]
                        row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                        row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                    writer.writerow(row)

        assert 'REGION' in mean_freq_outfname
        self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
        for region in utils.regions:
            self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
Example #5
0
    def plot(self, base_plotdir, only_csv=False):
        if not self.finalized:
            self.finalize(debug=debug)

        plotdir = base_plotdir + '/allele-finding'

        for old_gene_dir in glob.glob(plotdir + '/*'):  # has to be a bit more hackey than elsewhere, since we have no way of knowing what genes might have had their own directories written last time we wrote to this dir
            if not os.path.isdir(old_gene_dir):
                raise Exception('not a directory: %s' % old_gene_dir)
            utils.prep_dir(old_gene_dir, wildlings=('*.csv', '*.svg'))
            os.rmdir(old_gene_dir)
        utils.prep_dir(plotdir, wildlings=('*.csv', '*.svg'))

        if only_csv:  # not implemented
            return

        start = time.time()
        for gene in self.plotvals:
            if utils.get_region(gene) != 'v':
                continue

            for position in self.plotvals[gene]:
                if position not in self.fitted_positions[gene]:  # we can make plots for the positions we didn't fit, but there's a *lot* of them and they're slow
                    continue
                # if 'allele-finding' not in self.TMPxyvals[gene][position] or self.TMPxyvals[gene][position]['allele-finding'] is None:
                #     continue
                plotting.make_allele_finding_plot(plotdir + '/' + utils.sanitize_name(gene), gene, position, self.plotvals[gene][position])
        print '      allele finding plot time: %.1f' % (time.time()-start)
Example #6
0
    def __init__(self, base_indir, outdir, gene_name, naivety, glfo, args):
        """ Gather everything needed to write the hmm yaml for a single germline gene.

        <base_indir>: parameter directory holding observed gene/erosion/insertion/mutation counts.
        <glfo>: germline info dict (keys used here: 'seqs', 'cyst-positions', 'tryp-positions').
        <naivety>: 'M' means include mutation info; anything else is treated as naive.
        """
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.cyst_positions = glfo['cyst-positions']
        self.tryp_positions = glfo['tryp-positions']

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.naivety = naivety
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        # which insertions this gene's hmm has to model depends on its region
        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}
        self.insertion_content_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = mean_freq_hist.get_mean()
Example #7
0
    def finalize(self, calculate_uncertainty=True):
        """ convert from counts to mut freqs

        For each gene/position: fill <self.freqs> with per-nucleotide frequencies (plus lower/upper
        uncertainty bounds when <calculate_uncertainty>), set the overall per-position mutation
        frequency in <self.counts>, record per-position plotting info, and normalize the mean-rate
        hists. Sets self.finalized when done; calling twice is an error (asserted below).
        """
        assert not self.finalized

        self.n_cached, self.n_not_cached = 0, 0  # bookkeeping for the fraction_uncertainty cache
        for gene in self.counts:
            self.freqs[gene], self.plotting_info[gene] = {}, []
            # NOTE <counts> hold the overall (not per-base) frequencies, while <freqs> holds the per-base frequencies
            counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
            sorted_positions = sorted(counts)
            for position in sorted_positions:
                freqs[position] = {}
                plotting_info.append({})
                plotting_info[-1]['name'] = utils.sanitize_name(gene) + '_' + str(position)
                plotting_info[-1]['nuke_freqs'] = {}
                n_conserved, n_mutated = 0, 0
                for nuke in utils.nukes:
                    nuke_freq = float(counts[position][nuke]) / counts[position]['total']
                    freqs[position][nuke] = nuke_freq
                    plotting_info[-1]['nuke_freqs'][nuke] = nuke_freq
                    if calculate_uncertainty:  # it's kinda slow
                        # errs: (lower bound, upper bound, whether-the-result-was-cached)
                        errs = fraction_uncertainty.err(counts[position][nuke], counts[position]['total'])
                        if errs[2]:
                            self.n_cached += 1
                        else:
                            self.n_not_cached += 1
                        # print nuke_freq, errs[0], errs[1], '(', counts[position][nuke], ',', counts[position]['total'], ')'
                        assert errs[0] <= nuke_freq  # these checks are probably unnecessary. EDIT and totally saved my ass about ten minutes after writing the previous statement
                        assert nuke_freq <= errs[1]
                        freqs[position][nuke + '_lo_err'] = errs[0]
                        freqs[position][nuke + '_hi_err'] = errs[1]

                    if nuke == counts[position]['gl_nuke']:  # matches the germline base at this position
                        n_conserved += counts[position][nuke]
                    else:
                        n_mutated += counts[position][nuke]  # sum over A,C,G,T
                    # uncert = fraction_uncertainty.err(obs, total)  # uncertainty for each nuke
                counts[position]['freq'] = float(n_mutated) / counts[position]['total']
                mutated_fraction_err = (0.0, 0.0)  # default when we skip the (slow) uncertainty calculation
                if calculate_uncertainty:  # it's kinda slow
                    mutated_fraction_err = fraction_uncertainty.err(n_mutated, counts[position]['total'])
                    if mutated_fraction_err[2]:
                        self.n_cached += 1
                    else:
                        self.n_not_cached += 1
                counts[position]['freq_lo_err'] = mutated_fraction_err[0]
                counts[position]['freq_hi_err'] = mutated_fraction_err[1]

        self.mean_rates['all'].normalize(overflow_warn=False)  # we expect overflows in mute freq hists, so no need to warn us
        for region in utils.regions:
            self.mean_rates[region].normalize(overflow_warn=False)

        # for gene in self.tmpcounts:
        #     for position in self.tmpcounts[gene]:
        #         self.tmpcounts[gene][position]['muted'].divide_by(self.tmpcounts[gene][position]['total'], debug=False)

        self.finalized = True
Example #8
0
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq, args):
        """ Gather everything needed to write the hmm yaml for a single germline gene (older variant:
        takes the germline sequence directly rather than a glfo dict, and gates the unphysical fv/jf
        insertions behind args.allow_unphysical_insertions).

        <naivety>: 'M' means include mutation info; anything else is treated as naive.
        """
        self.indir = base_indir
        self.args = args

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.allow_unphysical_insertions = self.args.allow_unphysical_insertions # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries

        self.v_3p_del_pseudocount_limit = 10  # add at least one entry 

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.region = utils.get_region(gene_name)
        self.naivety = naivety
        self.germline_seq = germline_seq
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        # which insertions this gene's hmm has to model depends on its region
        self.insertions = []
        if self.region == 'v':
            if self.allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if self.allow_unphysical_insertions:
                self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', list(utils.nukes))
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, {'nukes':list(utils.nukes)})  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
Example #9
0
 def callback():
     # Record this transition event in the run sequence, then continue via the report query
     # or the success callback.
     # NOTE(review): closure fragment -- <transition>, <phases>, <process_id>, <query_reports>
     # and <ok_callback> come from an enclosing scope not visible here; confirm against caller.
     name = utils.sanitize_name(transition.get_name_or_id())
     if phases == 2:
         self.sequence.add_fire(process_id, name)
     else:
         self.sequence.add_transition_start(process_id, name)
         if not transition.has_code():
             self.sequence.add_transition_finish(process_id)
     if query_reports:
         self.query_reports(ok_callback)
     elif ok_callback:
         ok_callback()
Example #10
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        """ Gather everything needed to write the hmm yaml for a single germline gene (newer variant:
        chain-aware conserved-codon lookup, always reads mutation info, no <naivety> switch).

        <glfo>: germline info dict (keys used here: 'seqs' and the per-chain codon-position entries).
        """
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        self.codon_positions = {r : glfo[c + '-positions'] for r, c in utils.conserved_codons[args.chain].items()}

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # which insertions this gene's hmm has to model depends on its region
        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(self.indir, gene_name, debug=self.debug)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (self.n_occurences, self.args.min_observations_to_write)
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, debug=self.debug)

        self.erosion_probs = self.read_erosion_info(gene_name, replacement_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(gene_name, replacement_genes)
        self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, chain=self.args.chain, approved_genes=replacement_genes)  # actual info in <self.mute_obs> isn't actually used a.t.m.

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
Example #11
0
def read_mute_info(indir, this_gene, chain, approved_genes=None):  # NOTE this would probably be more accurate if we made some effort to align the genes before combining all the approved ones
    """ Read per-position mutation info for <this_gene> from the csvs under <indir>/mute-freqs.

    If <approved_genes> is set, combine the observations over all of those genes (inverse
    error-weighted, position by position).
    Returns (mute_freqs, observed_counts): <mute_freqs> maps position -> weighted mean freq plus an
    'overall_mean' entry; <observed_counts> maps position -> {nuke : observed count} plus a
    'total_counts' entry summing all per-nuke observations.
    """
    if approved_genes is None:
        approved_genes = [this_gene, ]
    if this_gene == glutils.dummy_d_genes[chain]:  # dummy d genes carry no real mutation info
        return {'overall_mean' : 0.5}, {}
    observed_freqs, observed_counts = {}, {}
    total_counts = 0
    # add an observation for each position, for each gene where we observed that position NOTE this would be more sensible if they were aligned first
    for gene in approved_genes:
        mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv'
        if not os.path.exists(mutefname):
            continue
        with opener('r')(mutefname) as mutefile:
            reader = csv.DictReader(mutefile)
            for line in reader:
                pos = int(line['position'])
                freq = float(line['mute_freq'])
                lo_err = float(line['lo_err'])  # NOTE lo_err in the file is really the lower *bound*
                hi_err = float(line['hi_err'])  #   same deal
                assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0  # you just can't be too careful
                if freq < utils.eps or abs(1.0 - freq) < utils.eps:  # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band
                    freq = 0.5 * (lo_err + hi_err)
                if pos not in observed_freqs:
                    observed_freqs[pos] = []
                    observed_counts[pos] = {n : 0 for n in utils.nukes}
                observed_freqs[pos].append({'freq':freq, 'err':max(abs(freq-lo_err), abs(freq-hi_err))})
                for nuke in utils.nukes:
                    observed_counts[pos][nuke] += int(line[nuke + '_obs'])
                    total_counts += int(line[nuke + '_obs'])  # bugfix: this was outside the loop, so (via loop-variable leakage) it only counted the *last* nucleotide's observations

    # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations for each position
    mute_freqs = {}
    overall_total, overall_sum_of_weights = 0.0, 0.0  # also calculate the mean over all positions
    for pos in observed_freqs:
        total, sum_of_weights = 0.0, 0.0
        for obs in observed_freqs[pos]:
            assert obs['err'] > 0.0
            weight = 1.0 / obs['err']
            total += weight * obs['freq']
            sum_of_weights += weight
        assert sum_of_weights > 0.0
        mean_freq = total / sum_of_weights
        mute_freqs[pos] = mean_freq
        overall_total += total
        overall_sum_of_weights += sum_of_weights

    mute_freqs['overall_mean'] = 0.
    if overall_sum_of_weights > 0.:  # guard against the no-observations case
        mute_freqs['overall_mean'] = overall_total / overall_sum_of_weights
    observed_counts['total_counts'] = total_counts
    return mute_freqs, observed_counts
Example #12
0
    def run_sequence(self, sequence):
        """ Replay a recorded command <sequence> against the running net, one command at a time.

        Each executed command calls back into next_command() on success, so the sequence advances
        asynchronously; a failed finish/receive triggers fail_callback instead.
        """
        transitions = {}
        command = [0]  # single-element list so the nested closures can mutate the index (py2 has no nonlocal)
        # index transitions both by '#<id>' and by sanitized name, so commands can reference either
        for t in self.runinstance.net.transitions():
            transitions["#{0}".format(t.id)] = t
        for t in self.runinstance.net.transitions():
            transitions[utils.sanitize_name(t.get_name())] = t

        def next_command():
            # execute the next command in the sequence, or query reports once we've run them all
            if command[0] >= sequence.get_commands_size():
                self.query_reports()
                return
            sequence.execute_command(command[0], fire, start, finish, receive)
            command[0] += 1

        def fail_callback():
            # command[0] was already advanced, so the failed command is the previous one
            self.emit_event("command-failed", sequence, command[0] - 1)

        def fire(process_id, transition):
            # fire a transition completely (phase 2), then continue with the next command
            t = transitions.get(transition)
            if t is None:
                 raise SimulationException("Transition '{0}' not found".format(transition))
            self.fire_transition(t.id,
                                 process_id,
                                 2,
                                 ok_callback=next_command)

        def start(process_id, transition):
            # start a transition (phase 1 only), then continue with the next command
            t = transitions.get(transition)
            if t is None:
                 raise SimulationException("Transition '{0}' not found".format(transition))
            self.fire_transition(t.id,
                                 process_id,
                                 1,
                                 ok_callback=next_command)

        def finish(process_id):
            # finish a previously-started transition
            self.finish_transition(process_id,
                                   ok_callback=next_command,
                                   fail_callback=fail_callback)

        def receive(process_id, from_process):
            # deliver a pending message from <from_process> to <process_id>
            self.receive(process_id,
                         from_process,
                         ok_callback=next_command,
                         fail_callback=fail_callback)

        next_command()  # kick off execution of the first command
Example #13
0
 def read_single_file(gtmp):
     # Read per-position, per-nucleotide observed mutation counts for gene <gtmp> from its
     # mute-freqs csv; returns {position : {nuke : count}}, or None if the file doesn't exist.
     # NOTE(review): closure fragment -- <indir> and <debug> come from an enclosing scope not
     # visible here; confirm against caller.
     mfname = indir + '/mute-freqs/' + utils.sanitize_name(gtmp) + '.csv'
     if not os.path.exists(mfname):
         return None
     observed_counts = {}
     with open(mfname, 'r') as mutefile:
         reader = csv.DictReader(mutefile)
         for line in reader:
             pos = int(line['position'])
             assert pos not in observed_counts  # each position should appear exactly once per file
             observed_counts[pos] = {
                 n: int(line[n + '_obs'])
                 for n in utils.nukes
             }
     if debug:
         print '    read %d per-base mute counts from %s' % (
             len(observed_counts), mfname)
     return observed_counts
    def write(self, base_outdir, mean_freq_outfname):
        """ Dump per-gene, per-position mutation info to csvs under <base_outdir>/mute-freqs,
        then write the overall and per-region mean-rate hists to <mean_freq_outfname>
        (with 'REGION' substituted by 'all' and each region name).
        """
        if not self.finalized:
            self.finalize()

        outdir = base_outdir + '/mute-freqs'
        utils.prep_dir(outdir, '*.csv')

        for gene in self.counts:
            counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
            outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
            with opener('w')(outfname) as outfile:
                nuke_cols = []  # per-nucleotide columns: freq plus lower/upper uncertainty bounds
                for nuke in utils.nukes:
                    nuke_cols.extend([nuke, nuke + '_lo_err', nuke + '_hi_err'])
                writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_cols))
                writer.writeheader()
                for position in sorted(counts):
                    row = {'position' : position,
                           'mute_freq' : counts[position]['freq'],
                           'lo_err' : counts[position]['freq_lo_err'],
                           'hi_err' : counts[position]['freq_hi_err']}
                    for nuke in utils.nukes:
                        row[nuke] = freqs[position][nuke]
                        row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                        row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                    writer.writerow(row)

        assert 'REGION' in mean_freq_outfname
        self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey string substitution, but it works
        for region in utils.regions:
            self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
Example #15
0
    def check_hmm_existence(self, gene_list, skipped_gene_matches, parameter_dir, query_name, second_query_name=None):
        """ Check if hmm model file exists, and if not remove gene from <gene_list> and print a warning """
        # bail out entirely if there are no hmm files at all
        if len(glob.glob(parameter_dir + '/hmms/*.yaml')) == 0:
            print 'ERROR no yamels in %s' % parameter_dir
            sys.exit()

        # collect the genes whose yaml file is missing, then drop each one (recording it as skipped)
        missing_genes = [g for g in gene_list
                         if not os.path.exists(parameter_dir + '/hmms/' + utils.sanitize_name(g) + '.yaml')]
        for gene in missing_genes:
            skipped_gene_matches.add(gene)
            gene_list.remove(gene)
Example #16
0
def read_mute_info(indir, this_gene, approved_genes=None):
    """Read per-position mutation frequencies for <this_gene> from <indir>/mute-freqs.

    Averages (weighted by inverse uncertainty) over every gene in <approved_genes>
    (default: just <this_gene>).  Genes without a csv file are silently skipped.
    Returns {position : weighted mean freq}, plus an 'overall_mean' entry holding
    the weighted mean over all positions.  Raises if no info was found at all.
    """
    if approved_genes is None:  # idiom fix: 'is None' rather than '== None'
        approved_genes = [this_gene, ]
    observed_freqs = {}
    # add an observation for each position, for each gene where we observed that position
    for gene in approved_genes:
        mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv'
        if not os.path.exists(mutefname):
            continue
        with opener('r')(mutefname) as mutefile:
            reader = csv.DictReader(mutefile)
            for line in reader:
                pos = int(line['position'])
                freq = float(line['mute_freq'])
                lo_err = float(line['lo_err'])  # NOTE lo_err in the file is really the lower *bound*
                hi_err = float(line['hi_err'])  #   same deal
                assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0  # you just can't be too careful
                if freq < utils.eps or abs(1.0 - freq) < utils.eps:  # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band
                    freq = 0.5 * (lo_err + hi_err)
                if pos not in observed_freqs:
                    observed_freqs[pos] = []
                observed_freqs[pos].append({'freq': freq, 'err': max(abs(freq - lo_err), abs(freq - hi_err))})

    # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations for each position
    mute_freqs = {}
    overall_total, overall_sum_of_weights = 0.0, 0.0  # also calculate the mean over all positions
    for pos in observed_freqs:
        total, sum_of_weights = 0.0, 0.0
        for obs in observed_freqs[pos]:
            assert obs['err'] > 0.0  # zero uncertainty would mean infinite weight
            weight = 1.0 / obs['err']
            total += weight * obs['freq']
            sum_of_weights += weight
        assert sum_of_weights > 0.0
        mute_freqs[pos] = total / sum_of_weights
        overall_total += total
        overall_sum_of_weights += sum_of_weights

    # BUGFIX (robustness): previously this crashed with a bare ZeroDivisionError when
    # no csv existed for any approved gene -- fail with a useful message instead
    if overall_sum_of_weights == 0.0:
        raise Exception('no mutation frequency info found for %s in %s' % (':'.join(approved_genes), indir))

    mute_freqs['overall_mean'] = overall_total / overall_sum_of_weights
    return mute_freqs
Example #17
0
    def write(self, outdir, mean_freq_outfname):
        """ Write one csv of per-position mutation freqs (and per-nuke counts/errors) per gene to <outdir>,
        then write the mean-rate hists to <mean_freq_outfname> with 'REGION' substituted appropriately. """
        if not self.finalized:
            self.finalize()

        for gene in self.counts:
            gcounts, freqs = self.counts[gene], self.freqs[gene]
            # header: the four summary columns, then four columns per nucleotide
            fieldnames = ['position', 'mute_freq', 'lo_err', 'hi_err']
            for nuke in utils.nukes:
                fieldnames += [nuke + suffix for suffix in ('', '_obs', '_lo_err', '_hi_err')]
            outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
            with open(outfname, 'w') as outfile:
                writer = csv.DictWriter(outfile, tuple(fieldnames))
                writer.writeheader()
                for position in sorted(gcounts.keys()):
                    row = {'position': position,
                           'mute_freq': freqs[position]['freq'],
                           'lo_err': freqs[position]['freq_lo_err'],
                           'hi_err': freqs[position]['freq_hi_err']}
                    for nuke in utils.nukes:
                        row[nuke] = freqs[position][nuke]
                        row[nuke + '_obs'] = gcounts[position][nuke]
                        row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                        row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                    writer.writerow(row)

        # hackey hackey hackey replacement... *sigh*
        assert 'REGION' in mean_freq_outfname
        self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))
        for region in utils.regions:
            self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
Example #18
0
    def __init__(self, base_indir, outdir, gene_name, glfo, args, debug=False):
        """Collect everything needed to write the hmm yaml file for <gene_name>.

        Reads observed counts/frequencies from <base_indir>, smooths/interpolates
        them, and stores the results on self for later writing to <outdir>.
        """
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = glfo['seqs']  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][
            gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.debug = debug
        # conserved-codon positions for each region (e.g. cysteine in v), keyed by region
        self.codon_positions = {
            r: glfo[c + '-positions']
            for r, c in utils.conserved_codons[args.locus].items()
        }

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = args.min_observations_to_write
        self.min_mean_unphysical_insertion_length = {
            'fv': 1.5,
            'jf': 25
        }  # jf has to be quite a bit bigger, since besides accounting for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths
        self.mute_freq_bounds = {
            'lo': 0.01,
            'hi': 0.5
        }  # don't let any position mutate less frequently than 1% of the time, or more frequently than half the time
        self.enforced_flat_mfreq_length = {  # i.e. distance over which the mute freqs are typically screwed up. I'm not really sure why these vary so much, but it's probably to do with how the s-w step works
            'v_3p' : 9,
            'd_5p' : 9,
            'd_3p' : 9,
            'j_5p' : 20,
        }

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        self.outdir = outdir
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # which insertion types border this region (fv/jf are the flanking "unphysical" ones)
        self.insertions = []
        if self.region == 'v':
            self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            self.insertions.append('jf')

        assert len(utils.ambiguous_bases) == 1 and utils.ambiguous_bases[
            0] == 'N'  # maybe need to update some stuff below if this changes

        if self.debug:
            print '%s' % utils.color_gene(gene_name)

        self.n_occurences = utils.read_single_gene_count(
            self.indir, gene_name, debug=self.debug
        )  # how many times did we observe this gene in data?
        approved_genes = [gene_name]
        # NOTE this never happens any more, since partitiondriver.cache_parameters() resets <args.min_observations_to_write> if it's larger than 10*(number of sequences)
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average also over all the genes that find_replacement_genes() gives us
            if self.debug:
                print '      only saw it %d times (wanted %d), so use info from all other genes' % (
                    self.n_occurences, self.args.min_observations_to_write)
            approved_genes += utils.find_replacement_genes(
                self.indir,
                self.args.min_observations_to_write,
                gene_name,
                debug=self.debug)

        self.erosion_probs = self.read_erosion_info(approved_genes)
        self.insertion_probs, self.insertion_content_probs = self.read_insertion_info(
            approved_genes)
        self.mute_freqs = paramutils.read_mute_freqs_with_weights(
            self.indir, approved_genes)  # weighted averages over genes
        self.mute_counts = paramutils.read_mute_counts(
            self.indir, gene_name, self.args.locus)  # raw per-{ACGT} counts
        self.process_mutation_info(
        )  # smoothing/interpolation/whatnot for <self.mute_freqs> and <self.mute_counts>
        # NOTE i'm using a hybrid approach with mute_freqs and mute_counts -- the only thing I get from mute_counts is the ratios of the different bases, whereas the actual freq comes from mute_freqs (which has all the corrections/smoothing/bullshit)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(
            self.saniname, self.track.getdict()
        )  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(
            self.eps,
            utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
        )  # if we really didn't see this gene at all, take pity on it and kick it an eps
        tmp_mean_freq_hist = Hist(fname=self.indir +
                                  '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = tmp_mean_freq_hist.get_mean()
        self.hmm.extras['per_gene_mute_freq'] = self.mute_freqs[
            'unweighted_overall_mean']  # the other (weighted) one might be technically more accurate, depending on what you want, but it's probably not what anyone is expecting, so we write the unweighted one
Example #19
0
def join_gene_names(gene_name_str):
    """ Sanitize each colon-separated gene name in <gene_name_str> and re-join them with colons. """
    names = gene_name_str.split(':')
    return ':'.join(utils.sanitize_name(n) for n in names)
    def write_freqs(self, baseplotdir, baseoutdir, total_frequency=True, only_gene_name='', calculate_uncertainty=True):
        """Compute per-position mutation frequencies (with optional binomial
        uncertainties), write them to csv under <baseoutdir>, and draw one ROOT
        plot per gene under <baseplotdir>.

        NOTE mutates self.freqs in place (adds 'mute_freq' and its lo/hi error keys).
        """
        cvn = TCanvas("cvn", "", 1700, 600)
        for gene_name in self.freqs:
            if only_gene_name != '' and gene_name != only_gene_name:
                continue
            print '  %-20s' % (gene_name)
            mute_freqs = self.freqs[gene_name]
            sorted_positions = sorted(mute_freqs)

            # calculate mute freq and its uncertainty
            for position in sorted_positions:
                n_conserved, n_mutated = 0, 0
                total = mute_freqs[position]['n_reads']
                for nuke in utils.nukes:
                    # reconstruct the integer observation count from the stored fraction
                    obs = int(round(mute_freqs[position][nuke] * total))
                    if nuke == mute_freqs[position]['ref']:
                        n_conserved += obs
                    else:
                        n_mutated += obs
                    # uncert = fraction_uncertainty(obs, total)  # uncertainty for each nuke
                assert n_mutated + n_conserved == total
                mute_freqs[position]['mute_freq'] = float(n_mutated) / total
                mutated_fraction_err = (0.0, 0.0)
                if calculate_uncertainty:  # it's kinda slow
                    mutated_fraction_err = fraction_uncertainty(n_mutated, total)
                mute_freqs[position]['mute_freq_lo_err'] = mutated_fraction_err[0]
                mute_freqs[position]['mute_freq_hi_err'] = mutated_fraction_err[1]

            # write to csv
            outdir = baseoutdir + '/' + self.human + '/' + self.naivety + '/mute-freqs'
            if not os.path.exists(outdir):
                os.makedirs(outdir)
            outfname = outdir +  '/' + utils.sanitize_name(gene_name) + '.csv'
            # TODO there's kind of starting to be a lot of different scripts producing inputs for recombinator. I should unify them
            with opener('w')(outfname) as outfile:  # write out mutation freqs for use by recombinator
                outfile.write('position,mute_freq,lo_err,hi_err\n')
                for position in sorted_positions:
                    outfile.write('%d,%f,%f,%f\n' % (position, mute_freqs[position]['mute_freq'], mute_freqs[position]['mute_freq_lo_err'],mute_freqs[position]['mute_freq_hi_err']))

            # and make a plot
            hist = TH1F('hist_' + utils.sanitize_name(gene_name), '',
                        sorted_positions[-1] - sorted_positions[0] + 1,
                        sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5)
            lo_err_hist = TH1F(hist)  # copies of <hist> just to get the same binning
            hi_err_hist = TH1F(hist)
            for position in sorted_positions:
                hist.SetBinContent(hist.FindBin(position), mute_freqs[position]['mute_freq'])
                lo_err_hist.SetBinContent(hist.FindBin(position), mute_freqs[position]['mute_freq_lo_err'])
                hi_err_hist.SetBinContent(hist.FindBin(position), mute_freqs[position]['mute_freq_hi_err'])
            hframe = TH1F(hist)  # empty frame with the same binning, used only to set titles and axis ranges
            hframe.SetTitle(gene_name + ';;')
            hframe.Reset()
            hframe.SetMinimum(lo_err_hist.GetMinimum() - 0.03)
            hframe.SetMaximum(1.1*hi_err_hist.GetMaximum())
            hframe.Draw('')
            line = TLine(hist.GetXaxis().GetXmin(), 0., hist.GetXaxis().GetXmax(), 0.)
            line.SetLineColor(0)
            line.Draw()  # can't figure out how to convince hframe not to draw a horizontal line at y=0, so... cover it up
            hist.SetLineColor(419)
            hist.Draw('same')
            lo_err_hist.SetLineColor(kRed+2)
            hi_err_hist.SetLineColor(kRed+2)
            lo_err_hist.SetMarkerColor(kRed+2)
            hi_err_hist.SetMarkerColor(kRed+2)
            lo_err_hist.SetMarkerStyle(22)
            hi_err_hist.SetMarkerStyle(23)
            lo_err_hist.Draw('p same')
            hi_err_hist.Draw('p same')
            plotdir = baseplotdir + '/' + self.human + '/' + self.naivety + '/plots'
            if not os.path.exists(plotdir):
                os.makedirs(plotdir)
            outfname = plotdir + '/' + utils.sanitize_name(gene_name) + '.png'
            cvn.SaveAs(outfname)
Example #21
0
    # plt.xlabel(legends.get(meth2, meth2) + ' cluster size')  # I don't know why it's reversed, it just is
    # plt.ylabel(legends.get(meth1, meth1) + ' cluster size')
    # ax.set_xlim(0, n_biggest_clusters)
    # ax.set_ylim(0, n_biggest_clusters)

    plt.title(title)

    if not os.path.exists(plotdir + '/plots'):
        os.makedirs(plotdir + '/plots')
    plt.savefig(plotdir + '/plots/' + plotname + '.svg')
    plt.close()

# ----------------------------------------------------------------------------------------
# driver: write indel and substitution heatmaps for each primary version, plus a
# cross-version comparison plot, then build html indices and fix www permissions
baseplotdir = os.getenv('www') + '/tmp'  # NOTE assumes $www is set -- raises TypeError on None otherwise
for difftype in ['indels', 'subs']:
    print difftype
    # individual primary version plots
    for pv in pversions:
        print '   ', pv
        plotheatmap(baseplotdir + '/' + difftype, utils.sanitize_name(pv), difftype, genelist=pversions[pv], title='primary version \"' + pv + '\"', xtitle=xtitles[difftype])

    # plots comparing two different primary versions
    plotheatmap(baseplotdir + '/' + difftype,
                'compare-pvs',
                difftype,
                genesets=pversions, title='compare means over pairs of primary versions', xtitle=xtitles[difftype])

    # regenerate the html index for this difftype's plot dir
    check_call(['./bin/makeHtml', baseplotdir + '/' + difftype, '2', 'foop', 'svg'])

check_call(['./bin/permissify-www', baseplotdir])
Example #22
0
    def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
        """ Write per-gene and mean mutation-frequency plots (and csvs) under <base_plotdir>/mute-freqs. """
        if not self.finalized:
            self.finalize()

        plotdir = base_plotdir + '/mute-freqs'
        overall_plotdir = plotdir + '/overall'
        utils.prep_dir(overall_plotdir, multilings=('*.csv', '*.svg'))
        for region in utils.regions:
            utils.prep_dir(plotdir + '/' + region, multilings=('*.csv', '*.svg'))
            # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

        for gene in self.counts:
            gcounts, plotting_info = self.counts[gene], self.plotting_info[gene]
            positions = sorted(gcounts)
            genehist = Hist(positions[-1] - positions[0] + 1, positions[0] - 0.5, positions[-1] + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
            for pos in positions:
                # symmetrize the (asymmetric) lo/hi uncertainties into a single error bar
                hi_diff = abs(gcounts[pos]['freq'] - gcounts[pos]['freq_hi_err'])
                lo_diff = abs(gcounts[pos]['freq'] - gcounts[pos]['freq_lo_err'])
                genehist.set_ibin(genehist.find_bin(pos), gcounts[pos]['freq'], error=0.5 * (hi_diff + lo_diff))
            region = utils.get_region(gene)
            xline, figsize = None, [3, 3]
            if region == 'v' and cyst_positions is not None:
                xline = cyst_positions[gene]
                figsize[0] *= 3.5
            elif region == 'j' and tryp_positions is not None:
                xline = tryp_positions[gene]
                figsize[0] *= 2
            plotting.draw_no_root(genehist, plotdir=plotdir + '/' + region, plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
            # paramutils.make_mutefreq_plot(plotdir + '/' + region + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

        # mean mute freq hists: one over everything, then one per region
        plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
        for region in utils.regions:
            plotting.draw_no_root(self.mean_rates[region], plotname=region + '-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

        if not only_csv:  # write html file and fix permissions
            plotting.make_html(overall_plotdir)
            for region in utils.regions:
                plotting.make_html(plotdir + '/' + region, n_columns=1)
Example #23
0
    def plot(
        self,
        plotdir,
        only_csv=False,
        only_overall=False,
        make_per_base_plots=False
    ):  # NOTE most of the time in here is taken up by mutefrequer.finalize() (if write() wasn't called first, that is)
        """Plot all parameter counts under <plotdir>: mutation freqs (delegated to
        self.mfreqer), plus one histogram per counted column, and per-gene
        versions for the gene-subsettable columns."""
        import plotting
        print '  plotting parameters in %s' % plotdir,
        sys.stdout.flush()
        start = time.time()

        self.clean_plots(plotdir)

        self.mfreqer.plot(plotdir + '/mute-freqs',
                          only_csv=only_csv,
                          only_overall=only_overall,
                          make_per_base_plots=make_per_base_plots)

        overall_plotdir = plotdir + '/overall'

        for column in self.counts:
            if column == 'all':
                continue
            # per-value totals, and the same broken down by gene (for subsettable columns)
            values, gene_values = {}, {}
            for index, count in self.counts[column].iteritems():
                column_val = index[0]

                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                if column in self.columns_to_subset_by_gene:
                    gene = index[
                        1]  # NOTE this is hackey, but it works fine now and will fail obviously if I ever change the correlations to be incompatible. so screw it
                    utils.split_gene(gene)  # checks validity of gene
                    if gene not in gene_values:
                        gene_values[gene] = {}
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count

            var_type = 'string' if column in self.string_columns else 'int'

            hist = hutils.make_hist_from_dict_of_counts(
                values, var_type, column)
            plotting.draw_no_root(
                hist,
                plotname=column,
                plotdir=overall_plotdir,
                xtitle=plotconfig.xtitles.get(column, column),
                plottitle=plotconfig.plot_titles.get(column, column),
                errors=True,
                write_csv=True,
                only_csv=only_csv,
                stats='mean' if column in self.mean_columns else None,
                normalize=True)

            if column in self.columns_to_subset_by_gene and not only_overall:
                thisplotdir = plotdir + '/' + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = hutils.make_hist_from_dict_of_counts(
                        gene_values[gene], var_type, plotname)
                    plotting.draw_no_root(hist,
                                          plotname=plotname,
                                          plotdir=thisplotdir,
                                          xtitle=plotconfig.plot_titles.get(
                                              column, column),
                                          plottitle=gene,
                                          errors=True,
                                          write_csv=True,
                                          only_csv=only_csv)
                if not only_csv:
                    plotting.make_html(thisplotdir)

        if not only_csv:
            plotting.make_html(overall_plotdir)

        print '(%.1f sec)' % (time.time() - start)
 def clean(self):
     """ remove all the parameter files """
     # delete each per-gene csv, then the (now empty) output directory itself
     for gene in self.counts:
         os.remove(self.outdir + '/' + utils.sanitize_name(gene) + '.csv')
     os.rmdir(self.outdir)
Example #25
0
def convert_model(s1_model, s2fm_addon_folder):
    """Convert a Source1 .mdl model into a Source2 Filmmaker .vmdl addon asset.

    Decompiles the model, converts eyes, meshes, jiggle bones, bodygroups,
    skins and materials, and writes the resulting KV3 .vmdl file under
    <s2fm_addon_folder>, mirroring the model's relative path.

    :param s1_model: pathlib.Path to the Source1 .mdl file
    :param s2fm_addon_folder: pathlib.Path of the target SFM addon root
    :return: path to the written .vmdl file
    """
    print(f'\033[94mWorking on {s1_model.stem} model\033[0m')
    s1_mdl = Mdl(s1_model)
    s1_mdl.read()
    eye_conv = EyeConverter()

    # register the model's mod folder so dependent content can be located
    content_manager = ContentManager()
    content_manager.scan_for_content(s1_model)

    mod_path = get_mod_path(s1_model)
    rel_model_path = normalize_path(s1_model.relative_to(mod_path))
    print('\033[94mCollecting materials\033[0m')
    s1_materials = collect_materials(s1_mdl)

    os.makedirs(s2fm_addon_folder / rel_model_path.with_suffix(''),
                exist_ok=True)

    # eyes are converted separately and removed from the decompiled meshes below
    eyes = eye_conv.process_mdl(
        s1_mdl, s2fm_addon_folder / rel_model_path.with_suffix(''))

    print('\033[94mDecompiling model\033[0m')
    model_decompiler = ModelDecompiler(s1_model)
    model_decompiler.decompile(remove_eyes=True)
    model_decompiler.save(s2fm_addon_folder / rel_model_path.with_suffix(''))
    s2_vmodel = (s2fm_addon_folder / rel_model_path.with_suffix('.vmdl'))
    os.makedirs(s2_vmodel.parent, exist_ok=True)

    print('\033[94mWriting VMDL\033[0m')
    vmdl = KV3mdl()
    for dmx_model in model_decompiler.dmx_models:
        vmdl.add_render_mesh(
            sanitize_name(dmx_model.mdl_model.name),
            normalize_path(rel_model_path.with_suffix('') /
                           f'{Path(dmx_model.mdl_model.name).stem}.dmx'))

    for eyeball_name, eyeball_path in eyes:
        vmdl.add_render_mesh(
            sanitize_name(eyeball_name),
            normalize_path(eyeball_path.relative_to(s2fm_addon_folder)))

    # translate Source1 jiggle-bone procedural rules into vmdl jiggle bones
    for bone in s1_mdl.bones:
        if bone.procedural_rule_type != ProceduralBoneType.JIGGLE:
            continue
        procedural_rule = bone.procedural_rule  # type:JiggleRule
        jiggle_type = 0  # rigid by default
        if procedural_rule.flags & JiggleRuleFlags.IS_RIGID:
            jiggle_type = 0
        elif procedural_rule.flags & JiggleRuleFlags.IS_FLEXIBLE:
            jiggle_type = 1
        elif procedural_rule.flags & JiggleRuleFlags.HAS_BASE_SPRING:
            jiggle_type = 2

        jiggle_data = {
            "name": f"{bone.name}_jiggle",
            "jiggle_root_bone": bone.name,
            "jiggle_type": jiggle_type,
            'length': procedural_rule.length,
            'tip_mass': procedural_rule.tip_mass,
            'has_yaw_constraint': bool(procedural_rule.flags
                                       & JiggleRuleFlags.HAS_YAW_CONSTRAINT),
            'has_pitch_constraint': bool(procedural_rule.flags
                                         & JiggleRuleFlags.HAS_PITCH_CONSTRAINT),
            'has_angle_constraint': bool(procedural_rule.flags
                                         & JiggleRuleFlags.HAS_ANGLE_CONSTRAINT),
            # NOTE(review): key kept byte-identical, trailing spaces included --
            # looks like a typo, but the KV3 consumer may depend on it; confirm before changing
            'allow_flex_length  ': bool(procedural_rule.flags
                                        & JiggleRuleFlags.HAS_LENGTH_CONSTRAINT),
            'invert_axes': bone.position[0] < 0,
            'angle_limit': math.degrees(procedural_rule.angle_limit),
            'max_yaw': procedural_rule.max_yaw,
            'min_yaw': procedural_rule.min_yaw,
            'yaw_bounce': procedural_rule.yaw_bounce,
            # 'or 10'/'or 15' fall back to a default when the source value is 0
            'yaw_damping': procedural_rule.yaw_damping or 10,
            'yaw_stiffness': procedural_rule.yaw_stiffness or 10,
            'yaw_friction': procedural_rule.yaw_friction or 10,
            'max_pitch': procedural_rule.max_pitch,
            'min_pitch': procedural_rule.min_pitch,
            'pitch_bounce': procedural_rule.pitch_bounce or 10,
            'pitch_damping': procedural_rule.pitch_damping or 10,
            'pitch_stiffness': procedural_rule.pitch_stiffness or 10,
            'pitch_friction': procedural_rule.pitch_friction or 10,
            'base_left_max': procedural_rule.base_max_left,
            'base_left_min': procedural_rule.base_min_left,
            'base_left_friction': procedural_rule.base_left_friction,
            'base_up_max': procedural_rule.base_max_up,
            'base_up_min': procedural_rule.base_min_up,
            'base_up_friction': procedural_rule.base_up_friction,
            # BUGFIX: 'base_forward_max' previously read base_min_forward (copy-paste
            # error -- both forward entries used the min attribute). Fixed to
            # base_max_forward to match the left/up pairs above.
            # TODO(review): confirm JiggleRule exposes base_max_forward
            'base_forward_max': procedural_rule.base_max_forward,
            'base_forward_min': procedural_rule.base_min_forward,
            'base_forward_friction': procedural_rule.base_forward_friction,
            'along_stiffness': procedural_rule.along_stiffness / 10,
            'along_damping': procedural_rule.along_damping or 15,
        }
        vmdl.add_jiggle_bone(jiggle_data)

    for s1_bodygroup in s1_mdl.body_parts:
        if 'clamped' in s1_bodygroup.name:
            continue
        bodygroup = vmdl.add_bodygroup(sanitize_name(s1_bodygroup.name))
        for mesh in s1_bodygroup.models:
            # 'blank' or empty models become an empty bodygroup choice
            if len(mesh.meshes) == 0 or mesh.name == 'blank':
                vmdl.add_bodygroup_choice(bodygroup, [])
                continue
            vmdl.add_bodygroup_choice(bodygroup, sanitize_name(mesh.name))

    # skin 0 is the reference; later skins are expressed as material remaps from it
    reference_skin = s1_mdl.skin_groups[0]
    for n, skin in enumerate(s1_mdl.skin_groups[1:]):
        vmdl_skin = vmdl.add_skin(f'skin_{n}')
        for ref_mat, skin_mat in zip(reference_skin, skin):
            if ref_mat != skin_mat:
                ref_mat = get_s2_material_path(normalize_path(ref_mat),
                                               s1_materials)
                skin_mat = get_s2_material_path(normalize_path(skin_mat),
                                                s1_materials)
                if ref_mat and skin_mat:
                    vmdl.add_skin_remap(vmdl_skin, ref_mat, skin_mat)
                else:
                    print(
                        '\033[91mFailed to create skin!\nMissing source or destination material!\033[0m'
                    )

    with s2_vmodel.open('w') as f:
        f.write(vmdl.dump())

    print('\033[94mConverting materials\033[0m')
    for mat in s1_materials:
        mat_name = normalize_path(mat[0])
        print('\033[92mConverting {}\033[0m'.format(mat_name))
        result, shader = convert_material(mat, s2fm_addon_folder)
        if result:
            pass  # converted successfully, nothing more to do
        else:
            print(f'\033[91mUnsupported Source1 shader "{shader}"!\033[0m')
    return s2_vmodel
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seq,
                 args):
        """Collect the info needed to write the hmm model file for <gene_name>:
        erosion and insertion probabilities, and (for non-naive repertoires)
        mutation frequencies, all read from <base_indir>."""
        self.indir = base_indir
        self.args = args

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        self.allow_unphysical_insertions = self.args.allow_unphysical_insertions  # allow fv and jf insertions. NOTE this slows things down by a factor of 6 or so
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries

        self.v_3p_del_pseudocount_limit = 10  # add at least one entry

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.region = utils.get_region(gene_name)
        self.naivety = naivety
        self.germline_seq = germline_seq
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        # which insertion types border this region (fv/jf only if unphysical insertions are allowed)
        self.insertions = []
        if self.region == 'v':
            if self.allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if self.allow_unphysical_insertions:
                self.insertions.append('jf')

        self.erosion_probs = {}
        self.insertion_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(
            self.indir, only_gene=gene_name, normalize=False
        )  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(
                self.indir,
                self.args.min_observations_to_write,
                gene_name,
                single_gene=False,
                debug=self.args.debug)

        self.read_erosion_info(
            gene_name, replacement_genes)  # try this exact gene, but fall back to <replacement_genes> when needed

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs = paramutils.read_mute_info(
                self.indir,
                this_gene=gene_name,
                approved_genes=replacement_genes)

        self.track = Track('nukes', list(utils.nukes))
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(
            self.saniname, {'nukes': list(utils.nukes)}
        )  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(
            self.eps,
            utils.read_overall_gene_probs(self.indir, only_gene=gene_name)
        )  # if we really didn't see this gene at all, take pity on it and kick it an eps
Example #27
0
 def clean(self):
     """ remove all the parameter files """
     csv_fnames = [self.outdir + '/' + utils.sanitize_name(g) + '.csv' for g in self.counts]
     for fname in csv_fnames:
         os.remove(fname)
     os.rmdir(self.outdir)  # will raise if anything else is still in there
Example #28
0
    def plot(self, plotdir, only_csv=False, only_overall=False):
        print "  plotting parameters",
        sys.stdout.flush()
        start = time.time()

        self.clean_plots(plotdir)

        self.mfreqer.plot(plotdir + "/mute-freqs", only_csv=only_csv, only_overall=only_overall)

        overall_plotdir = plotdir + "/overall"

        for column in self.counts:
            if column == "all":
                continue
            values, gene_values = {}, {}
            for index, count in self.counts[column].iteritems():
                column_val = index[0]

                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                if column in self.columns_to_subset_by_gene:
                    gene = index[
                        1
                    ]  # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it
                    utils.split_gene(gene)  # checks validity of gene
                    if gene not in gene_values:
                        gene_values[gene] = {}
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count

            var_type = "string" if column in self.string_columns else "int"

            hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True)
            plotting.draw_no_root(
                hist,
                plotname=column,
                plotdir=overall_plotdir,
                xtitle=plotconfig.xtitles.get(column, column),
                plottitle=plotconfig.plot_titles.get(column, column),
                errors=True,
                write_csv=True,
                only_csv=only_csv,
            )

            if column in self.columns_to_subset_by_gene and not only_overall:
                thisplotdir = plotdir + "/" + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + "-" + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(
                        hist,
                        plotname=plotname,
                        plotdir=thisplotdir,
                        xtitle=plotconfig.plot_titles.get(column, column),
                        plottitle=gene,
                        errors=True,
                        write_csv=True,
                        only_csv=only_csv,
                    )
                if not only_csv:
                    plotting.make_html(thisplotdir)

        if not only_csv:
            plotting.make_html(overall_plotdir)

        print "(%.1f sec)" % (time.time() - start)
    def __init__(self, base_indir, outdir, gene_name, naivety, germline_seqs, args, cyst_positions, tryp_positions):
        """ Read the observed parameter counts for <gene_name> from <base_indir> and set up everything needed to write its hmm model file.

        If <gene_name> wasn't observed often enough, fall back to averaging over the similar genes returned by utils.find_replacement_genes().
        """
        self.region = utils.get_region(gene_name)
        self.raw_name = gene_name  # i.e. unsanitized
        self.germline_seqs = germline_seqs  # all germline alleles
        self.germline_seq = self.germline_seqs[self.region][gene_name]  # germline sequence for this hmm
        self.indir = base_indir
        self.args = args
        self.cyst_positions = cyst_positions
        self.tryp_positions = tryp_positions

        # parameters with values that I more or less made up
        self.precision = '16'  # number of digits after the decimal for probabilities
        self.eps = 1e-6  # NOTE I also have an eps defined in utils, and they should in principle be combined
        self.n_max_to_interpolate = 20
        # self.allow_external_deletions = args.allow_external_deletions       # allow v left and j right deletions. I.e. if your reads extend beyond v or j boundaries
        self.min_mean_unphysical_insertion_length = {'fv' : 1.5, 'jf' : 25}  # jf has to be quite a bit bigger, since besides account for the variation in J length from the tryp position to the end, it has to account for the difference in cdr3 lengths

        self.erosion_pseudocount_length = 10  # if we're closer to the end of the gene than this, make sure erosion probability isn't zero

        # self.insert_mute_prob = 0.0
        # self.mean_mute_freq = 0.0

        self.outdir = outdir
        self.naivety = naivety
        self.smallest_entry_index = -1  # keeps track of the first state that has a chance of being entered from init -- we want to start writing (with add_internal_state) from there

        # which insertions apply depends on the region; fv/jf are the unphysical ones at the very edges of the read
        # self.insertions = [ insert for insert in utils.index_keys if re.match(self.region + '._insertion', insert) or re.match('.' + self.region + '_insertion', insert)]  OOPS that's not what I want to do
        self.insertions = []
        if self.region == 'v':
            if not self.args.dont_allow_unphysical_insertions:
                self.insertions.append('fv')
        elif self.region == 'd':
            self.insertions.append('vd')
        elif self.region == 'j':
            self.insertions.append('dj')
            if not self.args.dont_allow_unphysical_insertions:
                self.insertions.append('jf')

        self.erosion_probs = {}  # filled by read_erosion_info()
        self.insertion_probs = {}  # filled by read_insertion_info()
        self.insertion_content_probs = {}

        self.n_occurences = utils.read_overall_gene_probs(self.indir, only_gene=gene_name, normalize=False)  # how many times did we observe this gene in data?
        replacement_genes = None
        if self.n_occurences < self.args.min_observations_to_write:  # if we didn't see it enough, average over all the genes that find_replacement_genes() gives us
            if self.args.debug:
                print '    only saw it %d times, use info from other genes' % self.n_occurences
            replacement_genes = utils.find_replacement_genes(self.indir, self.args.min_observations_to_write, gene_name, single_gene=False, debug=self.args.debug)

        self.read_erosion_info(gene_name, replacement_genes)  # try this exact gene, but...

        self.read_insertion_info(gene_name, replacement_genes)

        if self.naivety == 'M':  # mutate if not naive
            self.mute_freqs, self.mute_obs = paramutils.read_mute_info(self.indir, this_gene=gene_name, approved_genes=replacement_genes)

        self.track = Track('nukes', utils.nukes)
        self.saniname = utils.sanitize_name(gene_name)
        self.hmm = HMM(self.saniname, self.track.getdict())  # pass the track as a dict rather than a Track object to keep the yaml file a bit more readable
        self.hmm.extras['gene_prob'] = max(self.eps, utils.read_overall_gene_probs(self.indir, only_gene=gene_name))  # if we really didn't see this gene at all, take pity on it and kick it an eps
        mean_freq_hist = plotting.make_hist_from_bin_entry_file(self.indir + '/all-mean-mute-freqs.csv')
        self.hmm.extras['overall_mute_freq'] = mean_freq_hist.GetMean()  # mean mutation frequency over all genes/positions
def scrape_book_data(driver,
                     book_url,
                     match_language="",
                     category=None,
                     force=False):
    """Scrape a book's metadata and chapter content from its reader page.

    Returns a tuple (book_json, dumped): the book's json metadata, and a
    boolean indicating whether a json dump for it already existed on disk.
    Returns (None, False) if the book is not available in <match_language>.

    <driver> is an already-logged-in selenium webdriver; <category> is a
    dict with at least a "label" key; <force> re-scrapes even when a dump
    already exists.
    """
    if category is None:  # avoid a mutable default argument
        category = {"label": "Uncategorized"}

    # check if this book has already been dumped, unless we are forcing
    # scraping; if so return the content of the dump, alongside with a flag
    # saying it already existed
    if os.path.exists(get_book_dump_filename(book_url)) and not force:
        log.debug(f"Json dump for book {book_url} already exists, skipping "
                  "scraping...")
        with open(get_book_dump_filename(book_url)) as f:
            return json.load(f), True

    # if not, proceed scraping the reader page
    log.info(f"Scraping book at {book_url}")
    if "/nc/reader/" not in book_url:
        book_url = book_url.replace("/books/", "/nc/reader/")

    if not driver.current_url == book_url:
        driver.get(book_url)

    # check for re-direct to the upgrade page
    detect_needs_upgrade(driver)

    reader = driver.find_element_by_class_name("reader__container")

    # get the book's metadata from the blinkist API using its ID
    book_id = reader.get_attribute("data-book-id")
    book_json = requests.get(
        url=f"https://api.blinkist.com/v4/books/{book_id}").json()
    book = book_json["book"]

    if match_language and book["language"] != match_language:
        log.warning(
            f"Book not available in the selected language ({match_language}), "
            "skipping scraping...")
        return None, False

    # sanitize the book's title and author since they will be used for paths
    # and such
    book["title"] = sanitize_name(book["title"])
    book["author"] = sanitize_name(book["author"])

    # check if the book's metadata already has chapter content
    # (this is the case for the free book of the day)
    json_needs_content = False
    for chapter_json in book["chapters"]:
        if "text" not in chapter_json:
            json_needs_content = True
            break
        else:
            # change the text content key name for compatibility with the
            # script methods
            chapter_json["content"] = chapter_json.pop("text")

    if json_needs_content:
        # scrape the chapter's content on the reader page
        # and extend the book json data by inserting the scraped content
        # in the appropriate chapter section to get a complete data file
        book_chapters = driver.find_elements(By.CSS_SELECTOR,
                                             ".chapter.chapter")
        for chapter in book_chapters:
            chapter_no = chapter.get_attribute("data-chapterno")
            chapter_content = chapter.find_element_by_class_name(
                "chapter__content")
            for chapter_json in book["chapters"]:
                if chapter_json["order_no"] == int(chapter_no):
                    chapter_json["content"] = chapter_content.get_attribute(
                        "innerHTML")
                    break

        # look for any supplement sections
        book_supplements = driver.find_elements(By.CSS_SELECTOR,
                                                ".chapter.supplement")
        for supplement in book_supplements:
            chapter_no = supplement.get_attribute("data-chapterno")
            # BUGFIX: read the content from <supplement>, not from the stale
            # <chapter> variable left over from the loop above
            supplement_content = supplement.find_element_by_class_name(
                "chapter__content")
            for chapter_json in book["chapters"]:
                if chapter_json["order_no"] == int(chapter_no):
                    if not chapter_json.get("supplement", None):
                        supplement_text = supplement_content.get_attribute(
                            "innerHTML")
                        chapter_json["supplement"] = supplement_text
                    break

    # if we are scraping by category, add it to the book metadata
    book["category"] = category["label"]

    # store the book json metadata for future use
    dump_book(book)

    # return a tuple with the book json metadata, and a boolean indicating
    # whether the json dump already existed or not
    return book, False
Example #31
0
    germlines = utils.read_germlines('../../../recombinator')
    reader = csv.DictReader(infile)
    for inline in reader:
        print 'searching'
#        inline['seq'] = inline['seq'][-130:]
        searcher = Searcher(inline['seq'], debug=True, n_matches_max=2)
        searcher.search()
        inferred_group_str = ''
        true_group_str = ''
        outline = {}
        outline['seq'] = inline['seq']
        print 'RESULT ',
        for region in utils.regions:
            inferred_name = searcher.get_best_match_name(region)
            outline[region + '_gene'] = utils.unsanitize_name(inferred_name)
            true_name = utils.sanitize_name(inline[region + '_gene'])

            inferred_group_str += inferred_name
            true_group_str += true_name
            if inferred_name == 'none':
                print ' none',
            elif  inferred_name == true_name:
                print '  -  ',
            else:
                print '  x  ',
        for region in utils.regions:
            print '%3d' % searcher.n_tries[region],
        print ''
        print '  true'
        utils.print_reco_event(germlines, inline, -1, -1)
        if searcher.all_matched():
Example #32
0
    def plot(self, plotdir, only_csv=False, only_overall=False):
        if not self.finalized:
            self.finalize()

        overall_plotdir = plotdir + '/overall'

        for gene in self.freqs:
            if only_overall:
                continue
            freqs = self.freqs[gene]
            if len(freqs) == 0:
                if gene not in glutils.dummy_d_genes.values():
                    print '    %s no mutefreqer obs for %s' % (utils.color('red', 'warning'), utils.color_gene(gene))
                continue
            sorted_positions = sorted(freqs.keys())
            genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='position', ytitle='mut freq', title=gene)
            for position in sorted_positions:
                hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err'])
                lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err'])
                err = 0.5*(hi_diff + lo_diff)
                genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err)
            xline = None
            figsize = [7, 4]
            if utils.get_region(gene) in utils.conserved_codons[self.glfo['chain']]:
                codon = utils.conserved_codons[self.glfo['chain']][utils.get_region(gene)]
                xline = self.glfo[codon + '-positions'][gene]
            if utils.get_region(gene) == 'v':
                figsize[0] *= 3.5
            elif utils.get_region(gene) == 'j':
                figsize[0] *= 2
            plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True)
            # per-position plots:
            plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True)
            # # per-position, per-base plots:
            # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment

        # make mean mute freq hists
        for rstr in ['all', 'cdr3'] + utils.regions:
            if rstr == 'all':
                bounds = (0.0, 0.4)
            else:
                bounds = (0.0, 0.6 if rstr == 'd' else 0.4)
            plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr+'_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True)
            plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr+'_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True)

        if not only_csv:  # write html file and fix permissiions
            for substr in self.subplotdirs:
                plotting.make_html(plotdir + '/' + substr)
Example #33
0
    def plot(self, plotdir, only_csv=False, only_overall=False):
        import plotting
        if not self.finalized:
            self.finalize()

        overall_plotdir = plotdir + '/overall'

        for gene in self.freqs:
            if only_overall:
                continue
            freqs = self.freqs[gene]
            if len(freqs) == 0:
                if gene not in glutils.dummy_d_genes.values():
                    print '    %s no mutefreqer obs for %s' % (utils.color(
                        'red', 'warning'), utils.color_gene(gene))
                continue
            sorted_positions = sorted(freqs.keys())
            genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1,
                            sorted_positions[0] - 0.5,
                            sorted_positions[-1] + 0.5,
                            xtitle='position',
                            ytitle='mut freq',
                            title=gene)
            for position in sorted_positions:
                hi_diff = abs(freqs[position]['freq'] -
                              freqs[position]['freq_hi_err'])
                lo_diff = abs(freqs[position]['freq'] -
                              freqs[position]['freq_lo_err'])
                err = 0.5 * (hi_diff + lo_diff)
                genehist.set_ibin(genehist.find_bin(position),
                                  freqs[position]['freq'],
                                  error=err)
            xline = None
            figsize = [7, 4]
            if utils.get_region(gene) in utils.conserved_codons[
                    self.glfo['locus']]:
                xline = utils.cdn_pos(self.glfo, utils.get_region(gene), gene)
            if utils.get_region(gene) == 'v':
                figsize[0] *= 3.5
            elif utils.get_region(gene) == 'j':
                figsize[0] *= 2
            plotting.draw_no_root(self.per_gene_mean_rates[gene],
                                  plotdir=plotdir + '/per-gene/' +
                                  utils.get_region(gene),
                                  plotname=utils.sanitize_name(gene),
                                  errors=True,
                                  write_csv=True,
                                  only_csv=only_csv,
                                  shift_overflows=True)
            # per-position plots:
            plotting.draw_no_root(genehist,
                                  plotdir=plotdir + '/per-gene-per-position/' +
                                  utils.get_region(gene),
                                  plotname=utils.sanitize_name(gene),
                                  errors=True,
                                  write_csv=True,
                                  xline=xline,
                                  figsize=figsize,
                                  only_csv=only_csv,
                                  shift_overflows=True)
            # # per-position, per-base plots:
            # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment

        # make mean mute freq hists
        for rstr in ['all', 'cdr3'] + utils.regions:
            if rstr == 'all':
                bounds = (0.0, 0.4)
            else:
                bounds = (0.0, 0.6 if rstr == 'd' else 0.4)
            plotting.draw_no_root(self.mean_rates[rstr],
                                  plotname=rstr + '_mean-freq',
                                  plotdir=overall_plotdir,
                                  stats='mean',
                                  bounds=bounds,
                                  write_csv=True,
                                  only_csv=only_csv,
                                  shift_overflows=True)
            plotting.draw_no_root(self.mean_n_muted[rstr],
                                  plotname=rstr + '_mean-n-muted',
                                  plotdir=overall_plotdir,
                                  stats='mean',
                                  write_csv=True,
                                  only_csv=only_csv,
                                  shift_overflows=True)

        if not only_csv:  # write html file and fix permissiions
            for substr in self.subplotdirs:
                plotting.make_html(plotdir + '/' + substr)
Example #34
0
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None, only_csv=False):
        """ Write a count histogram for each parameter column under <plotdir>/overall, optionally also split by gene for deletion and (real) insertion columns. """
        print '  plotting parameters',
        sys.stdout.flush()
        start = time.time()

        self.clean_plots(plotdir, subset_by_gene)

        self.mfreqer.plot(plotdir + '/mute-freqs', cyst_positions, tryp_positions, only_csv=only_csv)  #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files

        overall_plotdir = plotdir + '/overall'

        for column in self.counts:
            if column == 'all':
                continue
            values, gene_values = {}, {}  # accumulated counts: overall, and per-gene
            if len(self.counts[column]) == 0:
                raise Exception('no counts in %s' % column)
            for index, count in self.counts[column].iteritems():
                gene = None
                if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                    if '_del' in column:  # e.g. 'v_3p_del' --> region 'v'
                        region = column[0]
                    else:  # e.g. 'vd_insertion' --> the gene on its right-hand boundary, 'd'
                        region = column[1]
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works find now and will fail obviously
                    gene = index[1]                            #   if I ever change the correlations to be incompatible. so screw it
                    if gene not in gene_values:
                        gene_values[gene] = {}

                column_val = index[0]
                if gene is not None:
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                    int(column_val)
                    var_type = 'int'
                except:  # NOTE(review): assumes all values in a column have the same type, since only the last iteration's var_type survives
                    var_type = 'string'

            if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                thisplotdir = plotdir + '/' + column
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True, only_csv=only_csv)
                if not only_csv:
                    plotting.make_html(thisplotdir)

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw_no_root(hist, plotname=plotname, plotdir=overall_plotdir, errors=True, write_csv=True, only_csv=only_csv)

        if not only_csv:
            plotting.make_html(overall_plotdir)

        print '(%.1f sec)' % (time.time()-start)
Example #35
0
    def _export_control_sequence(self, sequence):
        """ Ask the user for a filename and export <sequence> as a .kcs.xml control sequence file.

        Transitions are written by id or by sanitized name depending on
        self.export_transition_id.  Raises ControlSequenceException on a
        malformed command or an unknown/unstarted transition.
        """
        VERSION = "1.0"
        TYPE = "transition_id" if self.export_transition_id else "transition_name"

        if not sequence:
            return

        dialog = gtk.FileChooserDialog("Export Control Sequence",
                                       self.app.window,
                                       gtk.FILE_CHOOSER_ACTION_SAVE,
                                       (gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL,
                                       gtk.STOCK_SAVE, gtk.RESPONSE_OK))
        dialog.set_default_response(gtk.RESPONSE_OK)
        dialog.set_current_name("{0}.kcs.xml".format(sequence.name))

        skcs_filter = gtk.FileFilter() # Kaira Control Sequence
        skcs_filter.set_name("Control Sequence (.kcs.xml)")
        dialog.add_filter(skcs_filter)

        try:
            response = dialog.run()
            filename = dialog.get_filename()
        finally:
            dialog.destroy()

        if response != gtk.RESPONSE_OK:
            return

        net = self.project.build_net

        # transitions are addressable both by '#<id>' and by sanitized name
        transitions = {}
        for t in net.transitions():
            transitions["#{0}".format(t.id)] = t
        for t in net.transitions():
            transitions[utils.sanitize_name(t.get_name())] = t

        running_transitions = {}  # per-process stack of started ("S") transition ids
        cmdlines = "\n"
        for command in sequence.commands:
            match = command_parser.match(command)
            if match is None:
                raise ControlSequenceException("Invalid format: ", command)

            process = int(match.group("process"))
            action = match.group("action")

            if action == "T" or action == "S":
                arg = match.group("arg_int")
                if arg is None:
                    arg = match.group("arg_str")
                if arg not in transitions:  # 'in' instead of py2-only has_key()
                    raise ControlSequenceException(
                            "Transition '{0}' not found.".format(arg))

                t = transitions[arg]
                if self.export_transition_id:
                    tid = t.id
                else:
                    tid = utils.sanitize_name(t.get_name_or_id())
                cmdlines += "{0} {1} {2}\n".format(process, action, tid)
                if action == "S":
                    # BUGFIX: python lists have no push() method (the original
                    # raised AttributeError on a second "S" for a process) -- use append()
                    running_transitions.setdefault(process, []).append(t.id)
            elif action == "R":
                arg_int = match.group("arg_int")
                if arg_int is None:
                    raise ControlSequenceException("Invalid format of receive.")
                cmdlines += "{0}\n".format(command)
            else:
                assert action == "F"
                if process not in running_transitions or \
                        not running_transitions[process]:
                    raise ControlSequenceException(
                        "Invalid sequence. Transition fire action is missing.")

                tid = running_transitions[process].pop()
                cmdlines += "{0} {1} {2}\n".format(process, action, tid)

        element = xml.Element("sequence")
        element.set("name", sequence.name)
        element.set("type", TYPE)
        element.set("version", VERSION)
        element.text = cmdlines

        tree = xml.ElementTree(element)
        tree.write(filename)
Example #36
0
    plt.xticks(ticks, xticklabels, rotation=90)
    plt.yticks(ticks, yticklabels)
    # plt.xlabel(legends.get(meth2, meth2) + ' cluster size')  # I don't know why it's reversed, it just is
    # plt.ylabel(legends.get(meth1, meth1) + ' cluster size')
    # ax.set_xlim(0, n_biggest_clusters)
    # ax.set_ylim(0, n_biggest_clusters)

    plt.title(title)

    if not os.path.exists(plotdir + '/plots'):
        os.makedirs(plotdir + '/plots')
    plt.savefig(plotdir + '/plots/' + plotname + '.svg')
    plt.close()

# ----------------------------------------------------------------------------------------
baseplotdir = os.getenv('www') + '/tmp'
for difftype in ['indels', 'subs']:
    print difftype
    # individual primary version plots
    for pv in pversions:
        print '   ', pv
        plotheatmap(baseplotdir + '/' + difftype, utils.sanitize_name(pv), difftype, genelist=pversions[pv], title='primary version \"' + pv + '\"', xtitle=xtitles[difftype])

    # plots comparing two different primary versions
    plotheatmap(baseplotdir + '/' + difftype,
                'compare-pvs',
                difftype,
                genesets=pversions, title='compare means over pairs of primary versions', xtitle=xtitles[difftype])

    check_call(['./bin/makeHtml', baseplotdir + '/' + difftype, '2', 'foop', 'svg'])
Example #37
0
def join_gene_names(gene_name_str):
    """ sanitize each ':'-separated gene name in <gene_name_str>, then rejoin them with ':' """
    sanitized = (utils.sanitize_name(g) for g in gene_name_str.split(':'))
    return ':'.join(sanitized)
    def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None):
        """ write per-gene mutation-frequency plots (and mean-frequency summaries) under <base_plotdir>/mute-freqs """
        if not self.finalized:
            self.finalize()

        plotdir = base_plotdir + '/mute-freqs'
        utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
        for region in utils.regions:
            utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
            utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

        for gene in self.counts:
            counts, plotting_info = self.counts[gene], self.plotting_info[gene]
            positions = sorted(counts)
            hist = TH1D('hist_' + utils.sanitize_name(gene), '', positions[-1] - positions[0] + 1, positions[0] - 0.5, positions[-1] + 0.5)
            for position in positions:
                hist.SetBinContent(hist.FindBin(position), counts[position]['freq'])
                # symmetrize the (asymmetric) error band into a single bin error
                hi_diff = abs(counts[position]['freq'] - counts[position]['freq_hi_err'])
                lo_diff = abs(counts[position]['freq'] - counts[position]['freq_lo_err'])
                hist.SetBinError(hist.FindBin(position), 0.5 * (hi_diff + lo_diff))
            plotfname = plotdir + '/' + utils.get_region(gene) + '/plots/' + utils.sanitize_name(gene) + '.svg'
            # mark the conserved cysteine/tryptophan position, if we know it
            xline = None
            if utils.get_region(gene) == 'v' and cyst_positions is not None:
                xline = cyst_positions[gene]['cysteine-position']
            elif utils.get_region(gene) == 'j' and tryp_positions is not None:
                xline = int(tryp_positions[gene])
            plotting.draw(hist, 'int', plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, draw_str='e')  #, cwidth=4000, cheight=1000)
            paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)

        # make mean mute freq hists
        hist = plotting.make_hist_from_my_hist_class(self.mean_rates['all'], 'all-mean-freq')
        plotting.draw(hist, 'float', plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
        for region in utils.regions:
            hist = plotting.make_hist_from_my_hist_class(self.mean_rates[region], region + '-mean-freq')
            plotting.draw(hist, 'float', plotname=region + '-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
        check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])

        # then write html file and fix permissiions
        for region in utils.regions:
            check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
            check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
        check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
Example #39
0
def read_mute_freqs_with_weights(
    indir,
    approved_genes,
    debug=False
):  # it would be nice to eventually align the genes before combining
    """Read per-position mutation frequencies for <approved_genes> from csv files in <indir>.

    indir: parameter directory containing a 'mute-freqs/' subdir with one csv per gene
    approved_genes: genes whose observations get combined (must be non-empty; raises otherwise)
    debug: if True, print the positions and resulting freqs

    Returns a dict mapping each observed position to the inverse error-weighted
    average mute freq over all approved genes, plus two scalar entries:
    'overall_mean' (weighted over all positions) and 'unweighted_overall_mean'.
    """
    # returns:
    #  - mute_freqs: inverse error-weighted average mute freq over all genes for each position
    #     - also includes weighted and unweigthed means over positions

    if len(approved_genes) == 0:
        raise Exception('no approved genes')

    # dummy d genes have no real per-position info, so just return midpoint values
    # (presumably these stand in for loci without a real D segment -- TODO confirm)
    if approved_genes[0] == glutils.dummy_d_genes[utils.get_locus(
            approved_genes[0])]:
        return {'overall_mean': 0.5, 'unweighted_overall_mean': 0.5}

    if debug:
        print '    reading mute freqs from %s for %d gene%s: %s' % (
            indir, len(approved_genes), utils.plural(
                len(approved_genes)), utils.color_genes(approved_genes))

    # add an observation for each position, for each gene where we observed that position NOTE this would be more sensible if they were aligned first
    observed_freqs = {}  # maps position --> list of {'freq', 'err'} dicts, one per gene observed there
    for gene in approved_genes:
        mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv'
        # no csv for this gene -- skip it (presumably it was never observed; TODO confirm)
        if not os.path.exists(mutefname):
            continue
        with open(mutefname, 'r') as mutefile:
            reader = csv.DictReader(mutefile)
            for line in reader:
                pos = int(line['position'])
                freq = float(line['mute_freq'])
                lo_err = float(
                    line['lo_err']
                )  # NOTE lo_err in the file is really the lower *bound*
                hi_err = float(line['hi_err'])  #   same deal
                assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0  # you just can't be too careful

                if freq < utils.eps or abs(
                        1.0 - freq
                ) < utils.eps:  # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band
                    freq = 0.5 * (lo_err + hi_err)

                if pos not in observed_freqs:
                    observed_freqs[pos] = []

                # 'err' is the larger distance from <freq> to either uncertainty bound
                observed_freqs[pos].append({
                    'freq':
                    freq,
                    'err':
                    max(abs(freq - lo_err), abs(freq - hi_err))
                })  # append one for each gene

    # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations [i.e. genes] for each position
    mute_freqs = {}
    for pos in observed_freqs:
        total, sum_of_weights = 0.0, 0.0
        for obs in observed_freqs[pos]:  # loop over genes
            assert obs['err'] > 0.0
            weight = 1.0 / obs['err']  # smaller uncertainty --> bigger weight
            total += weight * obs['freq']
            sum_of_weights += weight
        assert sum_of_weights > 0.0
        mean_freq = total / sum_of_weights
        mute_freqs[pos] = mean_freq

    # NOTE I'm sure that this weighting scheme makes sense for comparing differeing genes at the same position, but I'm less sure it makes sense for the overall mean. But, I don't want to track down all the places that changing it might affect right now
    mute_freqs['overall_mean'] = 0.
    weighted_denom = sum([
        1. / obs['err'] for pos in observed_freqs
        for obs in observed_freqs[pos]
    ])
    if weighted_denom > 0.:  # leave the mean at 0. if there were no observations at all
        mute_freqs['overall_mean'] = sum([
            obs['freq'] / obs['err'] for pos in observed_freqs
            for obs in observed_freqs[pos]
        ]) / weighted_denom

    # I need the inverse-error-weighted numbers to sensibly combine genes, but then I also need unweigthed values that I can easily write to the yaml files for other people to use
    mute_freqs['unweighted_overall_mean'] = 0.
    unweighted_denom = sum(
        [len(observed_freqs[pos]) for pos in observed_freqs])
    if unweighted_denom > 0.:
        mute_freqs['unweighted_overall_mean'] = sum([
            obs['freq'] for pos in observed_freqs
            for obs in observed_freqs[pos]
        ]) / unweighted_denom

    if debug:
        # print two aligned rows (positions, then freqs), eliding the middle if there are lots of positions
        iskipstart = 35  # i.e. for v genes skip the middle positions
        positions = sorted(observed_freqs)
        if len(positions) > 2 * iskipstart:
            print '      %s%s%s' % (' '.join([
                ('%4d' % p) for p in positions[:iskipstart]
            ]), utils.color('blue', ' [...] '), ' '.join([
                ('%4d' % p) for p in positions[len(positions) - iskipstart:]
            ]))
            print '      %s%s%s' % (' '.join([
                ('%4.2f' % mute_freqs[p]) for p in positions[:iskipstart]
            ]), utils.color('blue', ' [...] '), ' '.join(
                [('%4.2f' % mute_freqs[p])
                 for p in positions[len(positions) - iskipstart:]]))
        else:
            print '      %s' % ' '.join([('%4d' % p) for p in positions])
            print '      %s' % ' '.join([('%4.2f' % mute_freqs[p])
                                         for p in positions])
        print '        overall mean: %5.3f (unweighted %5.3f)' % (
            mute_freqs['overall_mean'], mute_freqs['unweighted_overall_mean'])

    return mute_freqs
    def plot(self,
             plotdir,
             subset_by_gene=False,
             cyst_positions=None,
             tryp_positions=None):
        print '  plotting parameters'
        start = time.time()
        utils.prep_dir(plotdir + '/plots')  #, multilings=('*.csv', '*.svg'))
        for column in self.counts:
            if column == 'all':
                continue
            values, gene_values = {}, {}
            if len(self.counts[column]) == 0:
                print 'ERROR no counts in %s' % column
                assert False
            for index, count in self.counts[column].iteritems():
                gene = None
                if subset_by_gene and (
                        '_del' in column or column == 'vd_insertion'
                        or column == 'dj_insertion'
                ):  # option to subset deletion and (real) insertion plots by gene
                    if '_del' in column:
                        region = column[0]
                    else:
                        region = column[1]
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[
                        1]  # NOTE this is hackey, but it works find now and will fail obviously
                    gene = index[
                        1]  #   if I ever change the correlations to be incompatible. so screw it
                    if gene not in gene_values:
                        gene_values[gene] = {}

                column_val = index[0]
                if gene is not None:
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                    int(column_val)
                    var_type = 'int'
                except:
                    var_type = 'string'

            if subset_by_gene and (
                    '_del' in column or column == 'vd_insertion'
                    or column == 'dj_insertion'
            ):  # option to subset deletion and (real) insertion plots by gene
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir + '/plots',
                               multilings=['*.csv', '*.svg'])
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(
                        gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw(hist,
                                  var_type,
                                  plotname=plotname,
                                  plotdir=thisplotdir,
                                  errors=True,
                                  write_csv=True)
                check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
                check_call(
                    ['./bin/permissify-www', thisplotdir]
                )  # NOTE this should really permissify starting a few directories higher up

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values,
                                                          var_type,
                                                          plotname,
                                                          sort=True)
            plotting.draw(hist,
                          var_type,
                          plotname=plotname,
                          plotdir=plotdir,
                          errors=True,
                          write_csv=True)

        self.mutefreqer.plot(
            plotdir, cyst_positions, tryp_positions
        )  #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files

        if has_root:
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            check_call(
                ['./bin/permissify-www', plotdir]
            )  # NOTE this should really permissify starting a few directories higher up

        print '    parameter plot time: %.3f' % (time.time() - start)
def read_mute_info(indir, this_gene, approved_genes=None):  # NOTE this would probably be more accurate if we made some effort to align the genes before combining all the approved ones
    """Read per-position mute freqs and per-base observation counts from csv files in <indir>.

    indir: parameter directory containing a 'mute-freqs/' subdir with one csv per gene
    this_gene: gene of interest (used as the only approved gene by default)
    approved_genes: list of genes whose observations get combined (defaults to [this_gene])

    Returns (mute_freqs, observed_counts):
      - mute_freqs: {position : inverse error-weighted mean freq over the approved genes},
        plus an 'overall_mean' entry (weighted mean over all positions)
      - observed_counts: {position : {nuke : count}}, plus a 'total_counts' entry
        summing every per-base observation
    """
    if approved_genes is None:  # 'is None' rather than '== None' -- identity is the correct test here
        approved_genes = [this_gene, ]
    observed_freqs, observed_counts = {}, {}
    total_counts = 0
    # add an observation for each position, for each gene where we observed that position
    for gene in approved_genes:
        mutefname = indir + '/mute-freqs/' + utils.sanitize_name(gene) + '.csv'
        if not os.path.exists(mutefname):  # no csv for this gene -- skip it
            continue
        with opener('r')(mutefname) as mutefile:
            reader = csv.DictReader(mutefile)
            for line in reader:
                pos = int(line['position'])
                freq = float(line['mute_freq'])
                lo_err = float(line['lo_err'])  # NOTE lo_err in the file is really the lower *bound*
                hi_err = float(line['hi_err'])  #   same deal
                assert freq >= 0.0 and lo_err >= 0.0 and hi_err >= 0.0  # you just can't be too careful
                if freq < utils.eps or abs(1.0 - freq) < utils.eps:  # if <freq> too close to 0 or 1, replace it with the midpoint of its uncertainty band
                    freq = 0.5 * (lo_err + hi_err)
                if pos not in observed_freqs:
                    observed_freqs[pos] = []
                    observed_counts[pos] = {n: 0 for n in utils.nukes}
                observed_freqs[pos].append({
                    'freq': freq,
                    'err': max(abs(freq - lo_err), abs(freq - hi_err))  # larger distance to either bound
                })
                for nuke in utils.nukes:
                    n_obs = int(line[nuke + '_obs'])
                    observed_counts[pos][nuke] += n_obs
                    total_counts += n_obs  # NOTE bug fix: this increment used to sit *outside* the nuke loop, so (via the leaked loop variable) it only counted the last nuke's observations

    # set final mute_freqs[pos] to the (inverse error-weighted) average over all the observations for each position
    mute_freqs = {}
    overall_total, overall_sum_of_weights = 0.0, 0.0  # also calculate the mean over all positions
    for pos in observed_freqs:
        total, sum_of_weights = 0.0, 0.0
        for obs in observed_freqs[pos]:
            assert obs['err'] > 0.0
            weight = 1.0 / obs['err']  # smaller uncertainty --> bigger weight
            total += weight * obs['freq']
            sum_of_weights += weight
        assert sum_of_weights > 0.0
        mean_freq = total / sum_of_weights
        mute_freqs[pos] = mean_freq
        overall_total += total
        overall_sum_of_weights += sum_of_weights

    mute_freqs['overall_mean'] = 0.  # stays 0. if there were no observations at all
    if overall_sum_of_weights > 0.:
        mute_freqs['overall_mean'] = overall_total / overall_sum_of_weights
    observed_counts['total_counts'] = total_counts
    return mute_freqs, observed_counts
Example #42
0
    def _export_control_sequence(self, sequence):
        """Export <sequence> to a .kcs.xml file chosen by the user in a save dialog.

        Transitions are written either by id or by sanitized name, depending on
        self.export_transition_id. Raises ControlSequenceException for malformed
        commands, unknown transitions, or unbalanced start/finish actions.
        """
        VERSION = "1.0"
        TYPE = "transition_id" if self.export_transition_id else "transition_name"

        if sequence:
            dialog = gtk.FileChooserDialog(
                "Export Control Sequence", self.app.window,
                gtk.FILE_CHOOSER_ACTION_SAVE,
                (gtk.STOCK_CANCEL, gtk.RESPONSE_CANCEL, gtk.STOCK_SAVE,
                 gtk.RESPONSE_OK))
            dialog.set_default_response(gtk.RESPONSE_OK)
            dialog.set_current_name("{0}.kcs.xml".format(sequence.name))

            skcs_filter = gtk.FileFilter()  # Kaira Control Sequence
            skcs_filter.set_name("Control Sequence (.kcs.xml)")
            dialog.add_filter(skcs_filter)

            try:
                response = dialog.run()
                filename = dialog.get_filename()
            finally:
                dialog.destroy()  # always tear the dialog down, even if run() raises

            net = self.project.build_net

            # transitions are looked up either by "#<id>" or by sanitized name
            transitions = {}
            for t in net.transitions():
                transitions["#{0}".format(t.id)] = t
            for t in net.transitions():
                transitions[utils.sanitize_name(t.get_name())] = t

            running_transitions = {}  # process --> stack of started-but-unfinished transition ids
            if response == gtk.RESPONSE_OK:
                cmdlines = "\n"
                for command in sequence.commands:
                    match = command_parser.match(command)
                    if match is None:
                        raise ControlSequenceException("Invalid format: ",
                                                       command)

                    process = int(match.group("process"))
                    action = match.group("action")

                    if action == "T" or action == "S":  # fire (T) or start (S) a transition
                        arg = match.group("arg_int")
                        if arg is None:
                            arg = match.group("arg_str")
                        if arg not in transitions:  # 'in' instead of has_key() (same behavior, py3-compatible)
                            raise ControlSequenceException(
                                "Transition '{0}' not found.".format(arg))

                        t = transitions[arg]
                        if self.export_transition_id:
                            tid = t.id
                        else:
                            tid = utils.sanitize_name(t.get_name_or_id())
                        cmdlines += "{0} {1} {2}\n".format(
                            process, action, tid)
                        if action == "S":
                            # NOTE bug fix: this called the nonexistent list.push(), which
                            # raised AttributeError -- lists use append()
                            if process in running_transitions:
                                running_transitions[process].append(t.id)
                            else:
                                running_transitions[process] = [t.id]
                    elif action == "R":  # receive
                        arg_int = match.group("arg_int")
                        if arg_int is None:
                            raise ControlSequenceException(
                                "Invalid format of receive.")
                        cmdlines += "{0}\n".format(command)
                    else:  # finish a previously started transition
                        assert action == "F"
                        if process not in running_transitions or \
                                not running_transitions[process]:
                            raise ControlSequenceException(
                                "Invalid sequence. Transition fire action is missing."
                            )

                        tid = running_transitions[process].pop()
                        cmdlines += "{0} {1} {2}\n".format(
                            process, action, tid)

                element = xml.Element("sequence")
                element.set("name", sequence.name)
                element.set("type", TYPE)
                element.set("version", VERSION)
                element.text = cmdlines

                tree = xml.ElementTree(element)
                tree.write(filename)
Example #43
0
    def finalize(self, calculate_uncertainty=True):
        """ convert from counts to mut freqs

        For each gene and position: fills self.freqs with per-base frequencies
        (plus lo/hi uncertainty bounds if <calculate_uncertainty>), fills
        self.plotting_info, and adds overall 'freq'/'freq_lo_err'/'freq_hi_err'
        entries to self.counts. Then normalizes the mean-rate histograms and
        sets self.finalized. Must not already be finalized.
        """
        assert not self.finalized

        # n_cached / n_not_cached record how often fraction_uncertainty.err() reported a cache hit (third element of its return value)
        self.n_cached, self.n_not_cached = 0, 0
        for gene in self.counts:
            self.freqs[gene], self.plotting_info[gene] = {}, []
            # NOTE <counts> hold the overall (not per-base) frequencies, while <freqs> holds the per-base frequencies
            counts, freqs, plotting_info = self.counts[gene], self.freqs[
                gene], self.plotting_info[gene]
            sorted_positions = sorted(counts)
            for position in sorted_positions:
                freqs[position] = {}
                # one plotting_info entry per position, in sorted order
                plotting_info.append({})
                plotting_info[-1]['name'] = utils.sanitize_name(
                    gene) + '_' + str(position)
                plotting_info[-1]['nuke_freqs'] = {}
                n_conserved, n_mutated = 0, 0
                for nuke in utils.nukes:
                    # fraction of observations at this position that were this base
                    nuke_freq = float(
                        counts[position][nuke]) / counts[position]['total']
                    freqs[position][nuke] = nuke_freq
                    plotting_info[-1]['nuke_freqs'][nuke] = nuke_freq
                    if calculate_uncertainty:  # it's kinda slow
                        errs = fraction_uncertainty.err(
                            counts[position][nuke], counts[position]['total'])
                        if errs[2]:
                            self.n_cached += 1
                        else:
                            self.n_not_cached += 1
                        # print nuke_freq, errs[0], errs[1], '(', counts[position][nuke], ',', counts[position]['total'], ')'
                        # sanity check: the frequency must lie within its own uncertainty band
                        assert errs[
                            0] <= nuke_freq  # these checks are probably unnecessary. EDIT and totally saved my ass about ten minutes after writing the previous statement
                        assert nuke_freq <= errs[1]
                        freqs[position][nuke + '_lo_err'] = errs[0]
                        freqs[position][nuke + '_hi_err'] = errs[1]

                    # tally against the germline base to get the overall mutated fraction below
                    if nuke == counts[position]['gl_nuke']:
                        n_conserved += counts[position][nuke]
                    else:
                        n_mutated += counts[position][nuke]  # sum over A,C,G,T
                    # uncert = fraction_uncertainty.err(obs, total)  # uncertainty for each nuke
                # overall (any-base) mutation frequency at this position
                counts[position]['freq'] = float(
                    n_mutated) / counts[position]['total']
                mutated_fraction_err = (0.0, 0.0)
                if calculate_uncertainty:  # it's kinda slow
                    mutated_fraction_err = fraction_uncertainty.err(
                        n_mutated, counts[position]['total'])
                    if mutated_fraction_err[2]:
                        self.n_cached += 1
                    else:
                        self.n_not_cached += 1
                counts[position]['freq_lo_err'] = mutated_fraction_err[0]
                counts[position]['freq_hi_err'] = mutated_fraction_err[1]

        self.mean_rates['all'].normalize(
            overflow_warn=False
        )  # we expect overflows in mute freq hists, so no need to warn us
        for region in utils.regions:
            self.mean_rates[region].normalize(overflow_warn=False)

        # for gene in self.tmpcounts:
        #     for position in self.tmpcounts[gene]:
        #         self.tmpcounts[gene][position]['muted'].divide_by(self.tmpcounts[gene][position]['total'], debug=False)

        self.finalized = True