Esempio n. 1
0
    def __init__(self, args, base_plotdir, skip_boring_states=''):
        raise Exception('needs to be converted off root')
        self.base_plotdir = base_plotdir
        self.skip_boring_states = skip_boring_states
        plot_types = ('transitions', 'emissions')
        for ptype in plot_types:
            plotdir = self.base_plotdir + '/' + ptype + '/plots'
            utils.prep_dir(plotdir, wildlings='*.png')

        if args.hmmdir != None:
            filelist = glob.glob(args.hmmdir + '/*.yaml')
        else:
            filelist = utils.get_arg_list(args.infiles)
        if len(filelist) == 0:
            print 'ERROR zero files passed to modelplotter'
            sys.exit()
        for infname in filelist:
            gene_name = os.path.basename(infname).replace('.yaml', '')  # the sanitized name, actually
# # ----------------------------------------------------------------------------------------
#             if utils.get_region(gene_name) == 'v' and 'IGHV4-39_star_' not in gene_name:
#                 continue
# # ----------------------------------------------------------------------------------------
            with open(infname) as infile:
                model = yaml.load(infile)
                self.make_transition_plot(gene_name, model)
                self.make_emission_plot(gene_name, model)

        for ptype in plot_types:
            check_call(['./bin/makeHtml', self.base_plotdir + '/' + ptype, '1', 'null', 'png'])
        check_call(['./bin/permissify-www', self.base_plotdir])
Esempio n. 2
0
    def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None):
        print '  plotting partitions'
        sys.stdout.flush()
        start = time.time()
        for subdir in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg'])

        if partition is not None:  # one partition
            assert infiles is None
            assert annotations is not None
            csize_hists = {'best' : plotting.get_cluster_size_hist(partition)}
            self.plot_within_vs_between_hists(partition, annotations, plotdir)
        elif infiles is not None:  # plot the mean of a partition from each file
            subset_hists = []
            for fname in infiles:
                cp = ClusterPath()
                cp.readfile(fname)
                subset_hists.append(plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
            csize_hists = {'best' : plotting.make_mean_hist(subset_hists)}
            for ih in range(len(subset_hists)):
                subset_hists[ih].write(plotdir + ('/subset-%d-cluster-sizes.csv' % ih))
        else:
            assert False

        plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x')

        if not only_csv:
            for subdir in self.subplotdirs:
                plotting.make_html(plotdir + '/' + subdir)

        print '(%.1f sec)' % (time.time()-start)
Esempio n. 3
0
 def clean_plots(self, plotdir):
     self.mfreqer.clean_plots(plotdir + '/mute-freqs')
     utils.prep_dir(plotdir + '/overall', wildlings=('*.csv', '*.svg'))
     for column in self.counts:
         if column in self.columns_to_subset_by_gene:
             thisplotdir = plotdir + '/' + column
             utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg'])
Esempio n. 4
0
    def plot(self, plotdir, only_csv=False):
        utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root'])
        for column in self.values:
            if self.only_correct_gene_fractions and column not in bool_columns:
                continue
            if column in bool_columns:
                right = self.values[column]['right']
                wrong = self.values[column]['wrong']
                errs = fraction_uncertainty.err(right, right+wrong)
                print '  %s\n    correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1])
                hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv)
            else:
                # TODO this is dumb... I should make the integer-valued ones histograms as well
                hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True)
                log = ''
                if column.find('hamming_to_true_naive') >= 0:  # TODO why doesn't this just use the config dicts in plotheaders or wherever?
                    hist.title = 'hamming distance'
                else:
                    hist.title = 'inferred - true'
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)
        for column in self.hists:
            plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)

        if not only_csv:
            plotting.make_html(plotdir)
Esempio n. 5
0
 def clean_plots(self, plotdir, subset_by_gene):
     self.mfreqer.clean_plots(plotdir + '/mute-freqs')
     utils.prep_dir(plotdir + '/overall')  #, multilings=('*.csv', '*.svg'))
     for column in self.counts:
         if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
             thisplotdir = plotdir + '/' + column
             utils.prep_dir(thisplotdir, wildlings=['*.csv', '*.svg'])
Esempio n. 6
0
 def prep_cmdfo(iclust, seqfos, queries_to_include, color_scale_vals,
                title):
     subworkdir = '%s/mds-%d' % (self.args.workdir, iclust)
     utils.prep_dir(subworkdir)
     tmpfname = '%s/seqs.fa' % subworkdir
     with open(tmpfname, 'w') as tmpfile:
         for sfo in seqfos:
             csval = None
             if sfo['name'] in color_scale_vals:
                 csval = color_scale_vals[sfo['name']]
             tmpfile.write(
                 '>%s%s\n%s\n' %
                 (sfo['name'],
                  (' %d' % csval) if csval is not None else '',
                  sfo['seq']))
     cmdstr = './bin/mds-run.py %s --aligned --plotdir %s --plotname %s --workdir %s --seed %d' % (
         tmpfname, plotdir, get_fname(iclust), subworkdir,
         self.args.seed)
     if queries_to_include is not None:
         cmdstr += ' --queries-to-include %s' % ':'.join(
             queries_to_include)
     if title is not None:
         cmdstr += ' --title=%s' % title.replace(' ', '@')
     return {
         'cmd_str': cmdstr,
         'workdir': subworkdir,
         'outfname': '%s/%s.svg' % (plotdir, get_fname(iclust)),
         'workfnames': [tmpfname]
     }
Esempio n. 7
0
    def write(self, base_outdir, mean_freq_outfname):
        if not self.finalized:
            self.finalize()

        outdir = base_outdir + '/mute-freqs'
        utils.prep_dir(outdir, '*.csv')

        for gene in self.counts:
            counts, freqs, plotting_info = self.counts[gene], self.freqs[gene], self.plotting_info[gene]
            sorted_positions = sorted(counts)
            outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
            with opener('w')(outfname) as outfile:
                nuke_header = []
                for nuke in utils.nukes:
                    nuke_header.append(nuke)
                    nuke_header.append(nuke + '_lo_err')
                    nuke_header.append(nuke + '_hi_err')
                writer = csv.DictWriter(outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') + tuple(nuke_header))
                writer.writeheader()
                for position in sorted_positions:
                    row = {'position':position,
                           'mute_freq':counts[position]['freq'],
                           'lo_err':counts[position]['freq_lo_err'],
                           'hi_err':counts[position]['freq_hi_err']}
                    for nuke in utils.nukes:
                        row[nuke] = freqs[position][nuke]
                        row[nuke + '_lo_err'] = freqs[position][nuke + '_lo_err']
                        row[nuke + '_hi_err'] = freqs[position][nuke + '_hi_err']
                    writer.writerow(row)

        assert 'REGION' in mean_freq_outfname
        self.mean_rates['all'].write(mean_freq_outfname.replace('REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
        for region in utils.regions:
            self.mean_rates[region].write(mean_freq_outfname.replace('REGION', region))
    def __init__(self, args, base_plotdir, skip_boring_states=''):
        self.base_plotdir = base_plotdir
        self.skip_boring_states = skip_boring_states
        plot_types = ('transitions', 'emissions', 'pair-emissions')
        for ptype in plot_types:
            plotdir = self.base_plotdir + '/' + ptype + '/plots'
            utils.prep_dir(plotdir, '*.png')

        if args.hmmdir != None:
            filelist = glob.glob(args.hmmdir + '/*.yaml')
        else:
            filelist = utils.get_arg_list(args.infiles)
        if len(filelist) == 0:
            print 'ERROR zero files passed to modelplotter'
            sys.exit()
        for infname in filelist:
            gene_name = os.path.basename(infname).replace(
                '.yaml', '')  # the sanitized name, actually
            # # ----------------------------------------------------------------------------------------
            #             if utils.get_region(gene_name) == 'v' and 'IGHV4-39_star_' not in gene_name:
            #                 continue
            # # ----------------------------------------------------------------------------------------
            with open(infname) as infile:
                model = yaml.load(infile)
                self.make_transition_plot(gene_name, model)
                self.make_emission_plot(gene_name, model)
                # self.make_pair_emission_plot(gene_name, model)

        for ptype in plot_types:
            check_call([
                './bin/makeHtml', self.base_plotdir + '/' + ptype, '1', 'null',
                'png'
            ])
        check_call(['./bin/permissify-www', self.base_plotdir])
Esempio n. 9
0
    def write_vdjalign_input(self, base_infname, n_procs):
        n_remaining = len(self.remaining_queries)
        queries_per_proc = float(n_remaining) / n_procs
        n_queries_per_proc = int(math.ceil(queries_per_proc))
        written_queries = set()  # make sure we actually write each query TODO remove this when you work out where they're disappearing to
        if n_procs == 1:  # double check for rounding problems or whatnot
            assert n_queries_per_proc == n_remaining
        for iproc in range(n_procs):
            workdir = self.subworkdir(iproc, n_procs)
            if n_procs > 1:
                utils.prep_dir(workdir)
            with opener('w')(workdir + '/' + base_infname) as sub_infile:
                iquery = 0
                for query_name in self.remaining_queries:  # NOTE this is wasteful to loop of all the remaining queries for each process... but maybe not that wasteful
                    if iquery >= n_remaining:
                        break
                    if iquery < iproc*n_queries_per_proc or iquery >= (iproc + 1)*n_queries_per_proc:  # not for this process
                        iquery += 1
                        continue
                    sub_infile.write('>' + query_name + ' NUKES\n')

                    seq = self.input_info[query_name]['seq']
                    if query_name in self.info['indels']:
                        seq = self.info['indels'][query_name]['reversed_seq']  # use the query sequence with shm insertions and deletions reversed
                    sub_infile.write(seq + '\n')
                    written_queries.add(query_name)
                    iquery += 1
        not_written = self.remaining_queries - written_queries
        if len(not_written) > 0:
            raise Exception('didn\'t write %s to %s' % (':'.join(not_written), self.args.workdir))
Esempio n. 10
0
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
        print '  plotting parameters'
        start = time.time()
        utils.prep_dir(plotdir + '/plots')  #, multilings=('*.csv', '*.svg'))
        for column in self.counts:
            if column == 'all':
                continue
            values, gene_values = {}, {}
            if len(self.counts[column]) == 0:
                print 'ERROR no counts in %s' % column
                assert False
            for index, count in self.counts[column].iteritems():
                gene = None
                if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                    if '_del' in column:
                        region = column[0]
                    else:
                        region = column[1]
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works find now and will fail obviously
                    gene = index[1]                            #   if I ever change the correlations to be incompatible. so screw it
                    if gene not in gene_values:
                        gene_values[gene] = {}

                column_val = index[0]
                if gene is not None:
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                    int(column_val)
                    var_type = 'int'
                except:
                    var_type = 'string'

            if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
                check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
                check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)

        self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)  #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files

        if has_root:
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up

        print '    parameter plot time: %.3f' % (time.time()-start)
Esempio n. 11
0
 def clean_plots(self, plotdir):
     self.mfreqer.clean_plots(plotdir + "/mute-freqs")
     utils.prep_dir(plotdir + "/overall")  # , multilings=('*.csv', '*.svg'))
     for column in self.counts:
         if column in self.columns_to_subset_by_gene:
             thisplotdir = plotdir + "/" + column
             utils.prep_dir(thisplotdir, wildlings=["*.csv", "*.svg"])
 def plot(self, plotdir):
     utils.prep_dir(plotdir + '/plots', wildling=None, multilings=['*.csv', '*.svg', '*.root'])
     for column in self.values:
         if self.only_correct_gene_fractions and column not in bool_columns:
             continue
         if column in bool_columns:
             right = self.values[column]['right']
             wrong = self.values[column]['wrong']
             errs = fraction_uncertainty.err(right, right+wrong)
             print '  %s\n    correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1])
             hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
             plotting.draw(hist, 'bool', plotname=column, plotdir=plotdir, write_csv=True)
         else:
             # TODO this is dumb... I should make the integer-valued ones histograms as well
             hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True)
             log = ''
             if column.find('hamming_to_true_naive') >= 0:
                 hist.GetXaxis().SetTitle('hamming distance')
             else:
                 hist.GetXaxis().SetTitle('inferred - true')
             plotting.draw(hist, 'int', plotname=column, plotdir=plotdir, write_csv=True, log=log)
     for column in self.hists:
         hist = plotting.make_hist_from_my_hist_class(self.hists[column], column)
         plotting.draw(hist, 'float', plotname=column, plotdir=plotdir, write_csv=True, log=log)
     
     check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
     check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
Esempio n. 13
0
    def plot(self, base_plotdir, only_csv=False):
        if not self.finalized:
            self.finalize(debug=debug)

        plotdir = base_plotdir + '/allele-finding'

        for old_gene_dir in glob.glob(plotdir + '/*'):  # has to be a bit more hackey than elsewhere, since we have no way of knowing what genes might have had their own directories written last time we wrote to this dir
            if not os.path.isdir(old_gene_dir):
                raise Exception('not a directory: %s' % old_gene_dir)
            utils.prep_dir(old_gene_dir, wildlings=('*.csv', '*.svg'))
            os.rmdir(old_gene_dir)
        utils.prep_dir(plotdir, wildlings=('*.csv', '*.svg'))

        if only_csv:  # not implemented
            return

        start = time.time()
        for gene in self.plotvals:
            if utils.get_region(gene) != 'v':
                continue

            for position in self.plotvals[gene]:
                if position not in self.fitted_positions[gene]:  # we can make plots for the positions we didn't fit, but there's a *lot* of them and they're slow
                    continue
                # if 'allele-finding' not in self.TMPxyvals[gene][position] or self.TMPxyvals[gene][position]['allele-finding'] is None:
                #     continue
                plotting.make_allele_finding_plot(plotdir + '/' + utils.sanitize_name(gene), gene, position, self.plotvals[gene][position])
        print '      allele finding plot time: %.1f' % (time.time()-start)
Esempio n. 14
0
    def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None):
        print '  plotting parameters'
        # start = time.time()
        utils.prep_dir(plotdir + '/plots')  #, multilings=('*.csv', '*.svg'))
        for column in self.counts:
            if column == 'all':
                continue
            values, gene_values = {}, {}
            if len(self.counts[column]) == 0:
                print 'ERROR no counts in %s' % column
                assert False
            for index, count in self.counts[column].iteritems():
                gene = None
                if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                    if '_del' in column:
                        region = column[0]
                    else:
                        region = column[1]
                    assert region in utils.regions
                    assert 'IGH' + region.upper() in index[1]  # NOTE this is hackey, but it works find now and will fail obviously
                    gene = index[1]                            #   if I ever change the correlations to be incompatible. so screw it
                    if gene not in gene_values:
                        gene_values[gene] = {}

                column_val = index[0]
                if gene is not None:
                    if column_val not in gene_values[gene]:
                        gene_values[gene][column_val] = 0.0
                    gene_values[gene][column_val] += count
                if column_val not in values:
                    values[column_val] = 0.0
                values[column_val] += count

                try:  # figure out whether this is an integer or string (only used outside this loop when we make the plots)
                    int(column_val)
                    var_type = 'int'
                except:
                    var_type = 'string'

            if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'):  # option to subset deletion and (real) insertion plots by gene
                thisplotdir = plotdir + '/' + column
                utils.prep_dir(thisplotdir + '/plots', multilings=['*.csv', '*.svg'])
                for gene in gene_values:
                    plotname = utils.sanitize_name(gene) + '-' + column
                    hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True)
                    plotting.draw(hist, var_type, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True)
                check_call(['./bin/makeHtml', thisplotdir, '3', 'null', 'svg'])
                check_call(['./bin/permissify-www', thisplotdir])  # NOTE this should really permissify starting a few directories higher up

            plotname = column
            hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True)
            plotting.draw(hist, var_type, plotname=plotname, plotdir=plotdir, errors=True, write_csv=True)

        self.mutefreqer.plot(plotdir, cyst_positions, tryp_positions)  #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files

        if has_root:
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
Esempio n. 15
0
    def __init__(self, args, seed, sublabel=None):
        self.args = args

        if sublabel == None:
            self.workdir = self.args.workdir + '/recombinator'
            self.outfname = self.args.outfname
        else:  # need a separate workdir for each subprocess
            self.workdir = self.args.workdir + '/recombinator-' + sublabel
            self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)

        utils.prep_dir(self.workdir)
        if not os.path.exists(self.args.parameter_dir):
            raise Exception('parameter dir ' + self.args.parameter_dir + ' d.n.e')

        # parameters that control recombination, erosion, and whatnot

        self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.version_freq_table = {}  # list of the probabilities with which each VDJ combo appears in data
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        # first read info that doesn't depend on which person we're looking at
        self.glfo = utils.read_germline_set(self.args.datadir)

        # then read stuff that's specific to each person
        self.read_vdj_version_freqs(self.args.parameter_dir + '/' + utils.get_parameter_fname('all'))
        self.allowed_genes = self.get_allowed_genes(self.args.parameter_dir)  # only really used if <self.args.uniform_vj_choice_probs> is set, but it also checks the sensibility of <self.args.only_genes>
        self.insertion_content_probs = None
        self.read_insertion_content()

        # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
            reader = csv.DictReader(gtrfile)
            for line in reader:
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()
                assert region == 'v' or region == 'd' or region == 'j'
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']
        treegen = treegenerator.TreeGenerator(args, self.args.parameter_dir, seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        treegen.generate_trees(seed, self.treefname)
        with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        if not self.args.no_clean:
            os.remove(self.treefname)

        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
Esempio n. 16
0
    def __init__(self, args, seed, sublabel=None):
        self.args = args

        if sublabel == None:
            self.workdir = self.args.workdir + '/recombinator'
            self.outfname = self.args.outfname
        else:  # need a separate workdir for each subprocess
            self.workdir = self.args.workdir + '/recombinator-' + sublabel
            self.outfname = self.workdir + '/' + os.path.basename(self.args.outfname)
        utils.prep_dir(self.workdir)

        if not self.args.simulate_partially_from_scratch:
            parameter_dir = self.args.parameter_dir
        else:  # we start from scratch, except for the mute freq stuff
            parameter_dir = self.args.scratch_mute_freq_dir

        if parameter_dir is None or not os.path.exists(parameter_dir):
            raise Exception('parameter dir ' + parameter_dir + ' d.n.e')

        self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        self.glfo = glutils.read_glfo(self.args.initial_datadir, self.args.chain, only_genes=self.args.only_genes)

        self.allowed_genes = self.get_allowed_genes(parameter_dir)  # set of genes a) for which we read per-position mutation information and b) from which we choose when running partially from scratch
        self.version_freq_table = self.read_vdj_version_freqs(parameter_dir)  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data
        self.insertion_content_probs = self.read_insertion_content(parameter_dir)
        self.all_mute_freqs = {}
        self.parameter_dir = parameter_dir  # damnit, I guess I do need to save this in self

        # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
            reader = csv.DictReader(gtrfile)
            for line in reader:
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()
                assert region == 'v' or region == 'd' or region == 'j'
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']
        treegen = treegenerator.TreeGenerator(args, parameter_dir, seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        treegen.generate_trees(seed, self.treefname)
        with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        os.remove(self.treefname)

        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
Esempio n. 17
0
def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None, debug=False):
    # NOTE set <n_components> to None to run plain kmeans, without mds TODO clean this up

    start = time.time()
    assert n_clusters is not None
    if 'sklearn' not in sys.modules:
        from sklearn import manifold  # these are both slow af to import, even on local ssd
        from sklearn.cluster import KMeans

    if len(set(sfo['name'] for sfo in seqfos)) != len(seqfos):
        raise Exception('duplicate sequence ids in <seqfos>')

    if not aligned:  # NOTE unlike the bios2mds version above, this modifies <seqfos>
        if debug:
            print 'align'
        seqfos = utils.align_many_seqs(seqfos)

    if debug:
        print '  distances'
    # translations = string.maketrans('ACGT-', '01234')
    # def convert(seq):
    #     return [int(c) for c in seq.translate(translations)]
    # converted_seqs = [convert(x['seq']) for x in seqfos]
    # similarities = scipy.spatial.distance.pdist(converted_seqs, 'hamming')
    # similarities = scipy.spatial.distance.squareform(similarities)
    similarities = scipy.spatial.distance.squareform([utils.hamming_fraction(seqfos[i]['seq'], seqfos[j]['seq']) for i in range(len(seqfos)) for j in range(i + 1, len(seqfos))])
    random_state = numpy.random.RandomState(seed=seed)

    pos = None
    if n_components is not None:
        if debug:
            print '  mds'
        mds = sys.modules['sklearn'].manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
        pos = mds.fit_transform(similarities)
        # pos = mds.fit(similarities).embedding_

    if debug:
        print '    kmeans clustering with %d clusters' % n_clusters
    kmeans = sys.modules['sklearn'].cluster.KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos if pos is not None else similarities)
    pcvals = {seqfos[iseq]['name'] : pos[iseq] if pos is not None else None for iseq in range(len(seqfos))}
    labels = {seqfos[iseq]['name'] : kmeans.labels_[iseq] for iseq in range(len(seqfos))}
    partition = utils.group_seqs_by_value(pcvals.keys(), lambda q: labels[q])

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        if debug:
            print '    plot'
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)

        if reco_info is not None:
            labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels)

    if debug:
        print '    kmeans time %.1f' % (time.time() - start)

    return partition
Esempio n. 18
0
    def plot(self, plotdir, only_csv=False):
        print '  plotting performance',
        import fraction_uncertainty
        import plotting
        start = time.time()
        for substr in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg'))

        for column in self.values:
            if column in plotconfig.gene_usage_columns:
                right = self.values[column]['right']
                wrong = self.values[column]['wrong']
                lo, hi = fraction_uncertainty.err(right, right + wrong)
                hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv)
            else:
                hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False)
                if 'hamming_to_true_naive' in column:
                    xtitle = 'hamming distance'
                    tmpplotdir = plotdir + '/mutation'
                else:
                    xtitle = 'inferred - true'
                    if 'muted' in column:
                        tmpplotdir = plotdir + '/mutation'
                    else:
                        tmpplotdir = plotdir + '/boundaries'
                plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True)

        for column in self.hists:
            if '_vs_mute_freq' in column or '_vs_per_gene_support' in column:  # only really care about the fraction, which we plot below
                continue
            plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True)

        # fraction correct vs mute freq
        for region in utils.regions:
            hright = self.hists[region + '_gene_right_vs_mute_freq']
            hwrong = self.hists[region + '_gene_wrong_vs_mute_freq']
            if hright.integral(include_overflows=True) == 0:
                continue
            plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True)

        # per-gene support stuff
        for region in utils.regions:
            if self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0:
                continue
            hright = self.hists[region + '_allele_right_vs_per_gene_support']
            hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support']
            plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True)

        if not only_csv:  # write html file and fix permissiions
            for substr in self.subplotdirs:
                plotting.make_html(plotdir + '/' + substr, n_columns=4)

        print '(%.1f sec)' % (time.time()-start)
Esempio n. 19
0
    def __init__(self, args, glfo, seed, workdir, outfname):  # NOTE <gldir> is not in general the same as <args.initial_germline_dir>
        self.args = args
        self.glfo = glfo

        # NOTE in general *not* the same as <self.args.workdir> and <self.args.outfname>
        self.workdir = workdir
        self.outfname = outfname
        utils.prep_dir(self.workdir)

        # set <self.parameter_dir> (note that this is in general *not* the same as self.args.parameter_dir)
        if self.args.rearrange_from_scratch:  # currently not allowed to mutate from scratch without also rearranging from scratch (enforced in bin/partis)
            if self.args.mutate_from_scratch:
                self.parameter_dir = None
            else:
                self.parameter_dir = self.args.scratch_mute_freq_dir  # if you make up mute freqs from scratch, unless you're really careful you tend to get nonsense results for a lot of things (e.g. allele finding). So it's easier to copy over a reasonable set of mut freq parameters from somewhere.
        else:
            self.parameter_dir = self.args.parameter_dir + '/' + self.args.parameter_type

        self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        self.allele_prevalence_freqs = glutils.read_allele_prevalence_freqs(args.allele_prevalence_fname) if args.allele_prevalence_fname is not None else {}
        self.version_freq_table = self.read_vdj_version_freqs()  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data (none if rearranging from scratch)
        self.insertion_content_probs = self.read_insertion_content()  # dummy/uniform if rearranging from scratch
        self.all_mute_freqs = {}

        # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with opener('r')(self.args.gtrfname) as gtrfile:  # read gtr parameters
            reader = csv.DictReader(gtrfile)
            for line in reader:
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()
                assert region == 'v' or region == 'd' or region == 'j'
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']
        treegen = treegenerator.TreeGenerator(args, self.parameter_dir, seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        treegen.generate_trees(seed, self.treefname)
        with opener('r')(self.treefname) as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        os.remove(self.treefname)

        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
Esempio n. 20
0
    def plot(self, plotdir, only_csv=False):
        print '  plotting performance',
        start = time.time()
        for substr in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg'))

        for column in self.values:
            if column in bool_columns:
                right = self.values[column]['right']
                wrong = self.values[column]['wrong']
                lo, hi, _ = fraction_uncertainty.err(right, right + wrong)
                hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv)
            else:
                hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False)
                if 'hamming_to_true_naive' in column:
                    xtitle = 'hamming distance'
                    tmpplotdir = plotdir + '/mutation'
                else:
                    xtitle = 'inferred - true'
                    if 'muted' in column:
                        tmpplotdir = plotdir + '/mutation'
                    else:
                        tmpplotdir = plotdir + '/boundaries'
                plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True)

        for column in self.hists:
            if '_vs_mute_freq' in column or '_vs_per_gene_support' in column:  # only really care about the fraction, which we plot below
                continue
            plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True)

        # fraction correct vs mute freq
        for region in utils.regions:
            hright = self.hists[region + '_gene_right_vs_mute_freq']
            hwrong = self.hists[region + '_gene_wrong_vs_mute_freq']
            if hright.integral(include_overflows=True) == 0:
                continue
            plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True)

        # per-gene support stuff
        for region in utils.regions:
            if self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0:
                continue
            hright = self.hists[region + '_allele_right_vs_per_gene_support']
            hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support']
            plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True)

        if not only_csv:  # write html file and fix permissiions
            for substr in self.subplotdirs:
                plotting.make_html(plotdir + '/' + substr, n_columns=4)

        print '(%.1f sec)' % (time.time()-start)
Esempio n. 21
0
def make_mean_plots(plotdir, subdirs, outdir):
    meanlist, variancelist = [], []
    normalized_means = []
    for sd in subdirs:
        with opener('r')(plotdir + '/' + sd + '/plots/means.csv') as meanfile:
            reader = csv.DictReader(meanfile)
            for line in reader:
                means = [float(m) for m in line['means'].split(':')]
                meanlist.append(numpy.mean(means))
                variancelist.append(numpy.var(means))
                nmvals = [
                    float(nm) for nm in line['normalized-means'].split(':')
                ]
                normalized_means += nmvals

    import matplotlib
    matplotlib.use('Agg')
    from matplotlib import pyplot

    # ----------------------------------------------------------------------------------------
    # first make hexbin plot
    pyplot.subplot(111)
    pyplot.hexbin(meanlist,
                  variancelist,
                  gridsize=20,
                  cmap=matplotlib.cm.gist_yarg,
                  bins=None)
    # pyplot.axis([0, 5, 0, 2])
    pyplot.xlabel('mean')
    pyplot.ylabel('variance')

    cb = pyplot.colorbar()
    cb.set_label('mean value')
    utils.prep_dir(outdir + '/plots', multilings=['*.png', '*.svg', '*.csv'])
    pyplot.savefig(outdir + '/plots/hexmeans.png')
    pyplot.clf()

    # ----------------------------------------------------------------------------------------
    # then make normalized mean plot
    n, bins, patches = pyplot.hist(normalized_means, 50)
    pyplot.xlabel(r'$(x_i - \mu) / \sigma_i$')
    pyplot.title(r'$\sigma=' + str(math.sqrt(numpy.var(normalized_means))) +
                 '$')
    # pyplot.axis([-10, 10, 0, 220])

    pyplot.savefig(outdir + '/plots/means.png')

    check_call(
        ['./permissify-www', outdir]
    )  # NOTE this should really permissify starting a few directories higher up
Esempio n. 22
0
    def plot(self,
             plotdir,
             partition=None,
             infiles=None,
             annotations=None,
             only_csv=None):
        import plotting
        print '  plotting partitions'
        sys.stdout.flush()
        start = time.time()
        for subdir in self.subplotdirs:
            utils.prep_dir(plotdir + '/' + subdir,
                           wildlings=['*.csv', '*.svg'])

        fnames = []

        if partition is not None:  # one partition
            assert infiles is None
            assert annotations is not None
            csize_hists = {'best': plotting.get_cluster_size_hist(partition)}
            # self.plot_within_vs_between_hists(partition, annotations, plotdir)
            fnames += self.plot_size_vs_shm(partition, annotations, plotdir)
        elif infiles is not None:  # plot the mean of a partition from each file
            subset_hists = []
            for fname in infiles:
                cp = ClusterPath()
                cp.readfile(fname)
                subset_hists.append(
                    plotting.get_cluster_size_hist(cp.partitions[cp.i_best]))
            csize_hists = {'best': plotting.make_mean_hist(subset_hists)}
            for ih in range(len(subset_hists)):
                subset_hists[ih].write(plotdir +
                                       ('/subset-%d-cluster-sizes.csv' % ih))
        else:
            assert False

        plotting.plot_cluster_size_hists(plotdir +
                                         '/overall/cluster-sizes.svg',
                                         csize_hists,
                                         title='',
                                         log='x')
        fnames.append(['cluster-sizes.svg'])

        if not only_csv:
            for subdir in self.subplotdirs:
                plotting.make_html(plotdir + '/' + subdir,
                                   fnames=fnames,
                                   new_table_each_row=True)

        print '(%.1f sec)' % (time.time() - start)
    def write(self, base_outdir):
        print '  writing parameters'
        start = time.time()

        utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))
        mute_start = time.time()
        self.mutefreqer.write(
            base_outdir,
            mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv'
        )  # REGION is replace by each region in the three output files)
        print '      mut freq write time: %.3f' % (time.time() - mute_start)
        # print ' %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached)
        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = utils.index_columns
                outfname = base_outdir + '/' + utils.get_parameter_fname(
                    column='all')
            elif '_content' in column:
                index = [
                    column,
                ]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [
                    column,
                ] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(
                    column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '    parameter write time: %.3f' % (time.time() - start)
Esempio n. 24
0
def compare_directories(args, plotdirlist, outdir):
    utils.prep_dir(outdir, wildlings=['*.png', '*.svg', '*.csv'])

    # read hists from <plotdirlist>
    allhists = OrderedDict()
    allvars = set()  # all variables that appeared in any dir
    for idir in range(len(plotdirlist)):
        dirhists = get_hists_from_dir(plotdirlist[idir], args.names[idir])
        allvars |= set(dirhists.keys())
        allhists[args.names[idir]] = dirhists
    # then loop over all the <varname>s we found
    for varname in allvars:
        hlist = [allhists[dname].get(varname, Hist(1, 0, 1, title='null')) for dname in allhists]
        plot_single_variable(args, varname, hlist, outdir, pathnameclues=plotdirlist[0])

    plotting.make_html(outdir, n_columns=4)
Esempio n. 25
0
def compare_directories(args, plotdirlist, outdir):
    utils.prep_dir(outdir, wildlings=['*.png', '*.svg', '*.csv'])

    # read hists from <plotdirlist>
    allhists = OrderedDict()
    allvars = set()  # all variables that appeared in any dir
    for idir in range(len(plotdirlist)):
        dirhists = get_hists_from_dir(plotdirlist[idir], args.names[idir])
        allvars |= set(dirhists.keys())
        allhists[args.names[idir]] = dirhists
    # then loop over all the <varname>s we found
    for varname in allvars:
        hlist = [allhists[dname].get(varname, Hist(1, 0, 1, title='null')) for dname in allhists]
        plot_single_variable(args, varname, hlist, outdir, pathnameclues=plotdirlist[0])

    plotting.make_html(outdir, n_columns=4)
Esempio n. 26
0
    def kmeans_cluster_v_seqs(self, qr_seqs, swfo, plotdir=None, debug=False):
        if plotdir is not None:
            utils.prep_dir(plotdir, wildlings=['*.svg'], subdirs=[d for d in os.listdir(plotdir) if os.path.isdir(plotdir + '/' + d)], rm_subdirs=True)

        clusterfos = []
        if debug:
            print 'kmeans clustering'
            print '  seqs    family'
        for family, seqfos in self.get_family_groups(qr_seqs, swfo).items():
            if debug:
                print '  %5d     %s' % (len(seqfos), family)
            partition = mds.bios2mds_kmeans_cluster(self.n_mds_components, self.XXX_n_kmeans_clusters, seqfos, self.args.workdir + '/mds', self.args.seed, reco_info=self.reco_info, region=self.region, plotdir=plotdir + '/' + family if plotdir is not None else None)
            # partition = mds.run_sklearn_mds(self.n_mds_components, self.XXX_n_kmeans_clusters, seqfos, self.args.seed, reco_info=self.reco_info, region=self.region, plotdir=plotdir + '/' + family if plotdir is not None else None)
            clusterfos += self.get_clusterfos_from_partition(partition, qr_seqs)

        clusterfos = sorted(clusterfos, key=lambda c: len(c['seqfos']), reverse=True)
        return clusterfos
Esempio n. 27
0
    def __init__(self, args, base_plotdir):
        self.base_plotdir = base_plotdir

        self.eps_to_skip = 1e-3
        print 'skipping eps %f' % self.eps_to_skip

        plot_types = ('transitions', 'emissions')
        for ptype in plot_types:
            plotdir = self.base_plotdir + '/' + ptype
            utils.prep_dir(plotdir, wildlings=['*.png', '*.svg'])

        if args.hmmdir != None:
            self.filelist = glob.glob(args.hmmdir + '/*.yaml')
        else:
            self.filelist = utils.get_arg_list(args.infiles)
        if len(self.filelist) == 0:
            raise Exception('zero files passed to modelplotter')
Esempio n. 28
0
    def write(self, base_outdir, mean_freq_outfname):
        if not self.finalized:
            self.finalize()

        outdir = base_outdir + '/mute-freqs'
        utils.prep_dir(outdir, '*.csv')

        for gene in self.counts:
            counts, freqs, plotting_info = self.counts[gene], self.freqs[
                gene], self.plotting_info[gene]
            sorted_positions = sorted(counts)
            outfname = outdir + '/' + utils.sanitize_name(gene) + '.csv'
            with opener('w')(outfname) as outfile:
                nuke_header = []
                for nuke in utils.nukes:
                    nuke_header.append(nuke)
                    nuke_header.append(nuke + '_obs')
                    nuke_header.append(nuke + '_lo_err')
                    nuke_header.append(nuke + '_hi_err')
                writer = csv.DictWriter(
                    outfile, ('position', 'mute_freq', 'lo_err', 'hi_err') +
                    tuple(nuke_header))
                writer.writeheader()
                for position in sorted_positions:
                    row = {
                        'position': position,
                        'mute_freq': counts[position]['freq'],
                        'lo_err': counts[position]['freq_lo_err'],
                        'hi_err': counts[position]['freq_hi_err']
                    }
                    for nuke in utils.nukes:
                        row[nuke] = freqs[position][nuke]
                        row[nuke + '_obs'] = counts[position][nuke]
                        row[nuke + '_lo_err'] = freqs[position][nuke +
                                                                '_lo_err']
                        row[nuke + '_hi_err'] = freqs[position][nuke +
                                                                '_hi_err']
                    writer.writerow(row)

        assert 'REGION' in mean_freq_outfname
        self.mean_rates['all'].write(
            mean_freq_outfname.replace(
                'REGION', 'all'))  # hackey hackey hackey replacement... *sigh*
        for region in utils.regions:
            self.mean_rates[region].write(
                mean_freq_outfname.replace('REGION', region))
Esempio n. 29
0
    def __init__(self, args, glfo, seed, workdir):  # NOTE <gldir> is not in general the same as <args.initial_germline_dir> # rm workdir
        self.args = args
        self.glfo = glfo
        if len(glfo['seqs']['v']) > 100:  # this is kind of a shitty criterion, but I don't know what would be better (we basically just want to warn people if they're simulating from data/germlines/human)
            print '  note: simulating with a very large number (%d) of V genes (the use of realistic diploid sets can be controlled either by using inferred germline sets that you\'ve got lying around (--reco-parameter-dir), or with --generate-germline-set)' % len(glfo['seqs']['v'])

        self.workdir = tempfile.mkdtemp()
        utils.prep_dir(self.workdir)

        assert self.args.parameter_dir is None
        self.reco_parameter_dir = self.args.reco_parameter_dir + '/' + self.args.parameter_type if self.args.reco_parameter_dir is not None else None
        self.shm_parameter_dir = self.args.shm_parameter_dir + '/' + self.args.parameter_type if self.args.shm_parameter_dir is not None else None

        self.index_keys = {}  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        self.allele_prevalence_freqs = glutils.read_allele_prevalence_freqs(args.allele_prevalence_fname) if args.allele_prevalence_fname is not None else {}
        self.version_freq_table = self.read_vdj_version_freqs()  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data (none if rearranging from scratch)
        self.insertion_content_probs = self.read_insertion_content()  # dummy/uniform if rearranging from scratch
        self.all_mute_freqs = {}

        # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with open(self.args.gtrfname, 'r') as gtrfile:  # read gtr parameters
            reader = csv.DictReader(gtrfile)
            for line in reader:
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()
                assert region == 'v' or region == 'd' or region == 'j'
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']
        treegen = treegenerator.TreeGenerator(args, self.shm_parameter_dir, seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        treegen.generate_trees(seed, self.treefname, self.workdir)  # NOTE not really a newick file, since I hack on the per-region branch length info at the end of each line
        with open(self.treefname, 'r') as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        os.remove(self.treefname)

        self.validation_values = {'heights' : {t : {'in' : [], 'out' : []} for t in ['all'] + utils.regions}}
Esempio n. 30
0
    def write(self, base_outdir):
        print "    writing parameters",
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(
            base_outdir, subdirs=("hmms", "mute-freqs", "germline-sets"), wildlings=("*.csv", "*.yaml", "*.fasta")
        )  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(
            base_outdir + "/mute-freqs", mean_freq_outfname=base_outdir + "/REGION-mean-mute-freqs.csv"
        )  # REGION is replace by each region in the three output files)
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + "_gene"].keys()]
        glutils.write_glfo(base_outdir + "/" + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)

        for column in self.counts:
            index = None
            outfname = None
            if column == "all":
                index = tuple(list(utils.index_columns) + ["cdr3_length"])
                outfname = base_outdir + "/" + utils.get_parameter_fname(column="all")
            elif "_content" in column:
                index = [column]
                outfname = base_outdir + "/" + column + ".csv"
            else:
                index = [column] + utils.column_dependencies[column]
                outfname = base_outdir + "/" + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener("w")(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append("count")
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line["count"] = count
                    out_data.writerow(line)

        print "(%.1f sec)" % (time.time() - start)
Esempio n. 31
0
def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None):
    print '%s not testing this after moving these imports down here' % utils.color('red', 'hey')
    from sklearn import manifold  # these are both slow af to import, even on local ssd
    from sklearn.cluster import KMeans

    if len(set(sfo['name'] for sfo in seqfos)) != len(seqfos):
        raise Exception('duplicate sequence ids in <seqfos>')

    print 'align'
    if not aligned:  # NOTE unlike the bios2mds version above, this modifies <seqfos>
        seqfos = utils.align_many_seqs(seqfos)

    print '  distances'
    # translations = string.maketrans('ACGT-', '01234')
    # def convert(seq):
    #     return [int(c) for c in seq.translate(translations)]
    # converted_seqs = [convert(x['seq']) for x in seqfos]
    # similarities = scipy.spatial.distance.pdist(converted_seqs, 'hamming')
    # similarities = scipy.spatial.distance.squareform(similarities)
    similarities = scipy.spatial.distance.squareform([utils.hamming_fraction(seqfos[i]['seq'], seqfos[j]['seq']) for i in range(len(seqfos)) for j in range(i + 1, len(seqfos))])

    print '  mds'
    random_state = numpy.random.RandomState(seed=seed)
    mds = manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
    pos = mds.fit_transform(similarities)
    # pos = mds.fit(similarities).embedding_

    print '  kmeans'
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos)
    pcvals = {seqfos[iseq]['name'] : pos[iseq] for iseq in range(len(seqfos))}
    labels = {seqfos[iseq]['name'] : kmeans.labels_[iseq] for iseq in range(len(seqfos))}
    def keyfunc(q):  # should really integrate this with utils.collapse_naive_seqs()/utils.split_partition_with_criterion()
        return labels[q]
    partition = [list(group) for _, group in itertools.groupby(sorted(pcvals, key=keyfunc), key=keyfunc)]

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        print '  plot'
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)

        if reco_info is not None:
            labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels)

    return partition
Esempio n. 32
0
def make_mean_plots(plotdir, subdirs, outdir):
    meanlist, variancelist = [], []
    normalized_means = []
    for sd in subdirs:
        with opener('r')(plotdir + '/' + sd + '/plots/means.csv') as meanfile:
            reader = csv.DictReader(meanfile)
            for line in reader:
                means = [ float(m) for m in line['means'].split(':') ]
                meanlist.append(numpy.mean(means))
                variancelist.append(numpy.var(means))
                nmvals = [ float(nm) for nm in line['normalized-means'].split(':') ]
                normalized_means += nmvals

    import matplotlib
    matplotlib.use('Agg')
    from matplotlib import pyplot

    # ----------------------------------------------------------------------------------------
    # first make hexbin plot
    pyplot.subplot(111)
    pyplot.hexbin(meanlist, variancelist, gridsize=20, cmap=matplotlib.cm.gist_yarg, bins=None)
    # pyplot.axis([0, 5, 0, 2])
    pyplot.xlabel('mean')
    pyplot.ylabel('variance')

    cb = pyplot.colorbar()
    cb.set_label('mean value')
    utils.prep_dir(outdir + '/plots', multilings=['*.png', '*.svg', '*.csv'])
    pyplot.savefig(outdir + '/plots/hexmeans.png')
    pyplot.clf()

    # ----------------------------------------------------------------------------------------
    # then make normalized mean plot
    n, bins, patches = pyplot.hist(normalized_means, 50)
    pyplot.xlabel(r'$(x_i - \mu) / \sigma_i$')
    pyplot.title(r'$\sigma=' + str(math.sqrt(numpy.var(normalized_means))) + '$')
    # pyplot.axis([-10, 10, 0, 220])

    pyplot.savefig(outdir + '/plots/means.png')


    check_call(['./permissify-www', outdir])  # NOTE this should really permissify starting a few directories higher up
Esempio n. 33
0
    def write(self, base_outdir):
        print '    writing parameters',
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files)
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False)

        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length', ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column,]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column,] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with open(outfname, 'w') as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '(%.1f sec)' % (time.time()-start)
Esempio n. 34
0
def make_tree(all_genes, workdir, use_cache=False):
    aligned_fname = workdir + '/all-aligned.fa'
    raxml_label = 'xxx'
    raxml_output_fnames = [
        '%s/RAxML_%s.%s' % (workdir, fn, raxml_label)
        for fn in ['parsimonyTree', 'log', 'result', 'info', 'bestTree']
    ]
    treefname = [fn for fn in raxml_output_fnames if 'result' in fn][0]
    if use_cache:  # don't re-run muxcle & raxml, just use the previous run's output tree file
        return treefname
    utils.prep_dir(workdir,
                   wildlings=[
                       '*.' + raxml_label,
                       os.path.basename(aligned_fname), 'out', 'err',
                       os.path.basename(aligned_fname) + '.reduced'
                   ])

    # write and align an .fa with all alleles from any gl set
    with tempfile.NamedTemporaryFile() as tmpfile:
        for name, seq in all_genes.items():
            tmpfile.write('>%s\n%s\n' % (name, seq))
        tmpfile.flush()  # BEWARE if you forget this you are f****d
        cmdstr = '%s -in %s -out %s' % (args.muscle_path, tmpfile.name,
                                        aligned_fname)
        print '    %s %s' % (utils.color('red', 'run'), cmdstr)
        utils.run_cmds(get_cmdfos(cmdstr, workdir, aligned_fname),
                       ignore_stderr=True)

    # get a tree for the aligned .fa
    cmdstr = '%s -mGTRCAT -n%s -s%s -p1 -w%s' % (args.raxml_path, raxml_label,
                                                 aligned_fname, workdir)
    print '    %s %s' % (utils.color('red', 'run'), cmdstr)
    utils.run_cmds(get_cmdfos(cmdstr, workdir, treefname), ignore_stderr=True)

    os.remove(aligned_fname)  # rm muscle output
    for fn in [
            f for f in raxml_output_fnames if f != treefname
    ]:  # rm all the raxml outputs except what the one file we really want
        os.remove(fn)

    return treefname
Esempio n. 35
0
    def write(self, base_outdir, my_datadir=None):
        print '    writing parameters',
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(base_outdir, subdirs=('hmms', 'mute-freqs', 'germline-sets'), wildlings=('*.csv', '*.yaml', '*.fasta'))  # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary

        self.mfreqer.write(base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files)
        genes_with_counts = [g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys()]
        glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=True)

        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = tuple(list(utils.index_columns) + ['cdr3_length', ])
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column,]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column,] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '(%.1f sec)' % (time.time()-start)
Esempio n. 36
0
    def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
        if not self.finalized:
            self.finalize()

        plotdir = base_plotdir + '/mute-freqs'
        utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
        for region in utils.regions:
            utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
            # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

        for gene in self.counts:
            counts, plotting_info = self.counts[gene], self.plotting_info[gene]
            sorted_positions = sorted(counts)
            genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
            for position in sorted_positions:
                hi_diff = abs(counts[position]['freq'] - counts[position]['freq_hi_err'])
                lo_diff = abs(counts[position]['freq'] - counts[position]['freq_lo_err'])
                err = 0.5*(hi_diff + lo_diff)
                genehist.set_ibin(genehist.find_bin(position), counts[position]['freq'], error=err)
            xline = None
            figsize = [3, 3]
            if utils.get_region(gene) == 'v' and cyst_positions is not None:
                xline = cyst_positions[gene]['cysteine-position']
                figsize[0] *= 3.5
            elif utils.get_region(gene) == 'j' and tryp_positions is not None:
                xline = int(tryp_positions[gene])
                figsize[0] *= 2
            plotting.draw_no_root(genehist, plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
            # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

        # make mean mute freq hists
        plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
        for region in utils.regions:
            plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

        if not only_csv:  # write html file and fix permissiions
            check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])
            for region in utils.regions:
                check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
                # check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
            check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
Esempio n. 37
0
    def write_vdjalign_input(self, base_infname, n_procs):
        queries_per_proc = float(len(self.remaining_queries)) / n_procs
        n_queries_per_proc = int(math.ceil(queries_per_proc))
        if n_procs == 1:  # double check for rounding problems or whatnot
            assert n_queries_per_proc == len(self.remaining_queries)
        for iproc in range(n_procs):
            workdir = self.args.workdir
            if n_procs > 1:
                workdir += '/sw-' + str(iproc)
                utils.prep_dir(workdir)
            with opener('w')(workdir + '/' + base_infname) as sub_infile:
                for iquery in range(iproc*n_queries_per_proc, (iproc + 1)*n_queries_per_proc):
                    if iquery >= len(self.remaining_queries):
                        break
                    query_name = self.remaining_queries[iquery]
                    sub_infile.write('>' + query_name + ' NUKES\n')

                    seq = self.input_info[query_name]['seq']
                    if query_name in self.info['indels']:
                        seq = self.info['indels'][query_name]['reversed_seq']  # use the query sequence with shm insertions and deletions reversed
                    sub_infile.write(seq + '\n')
Esempio n. 38
0
    def write(self, base_outdir):
        print '    writing parameters',
        sys.stdout.flush()
        start = time.time()

        utils.prep_dir(base_outdir, multilings=('*.csv', '*.svg'))
        # mute_start = time.time()
        self.mutefreqer.write(base_outdir, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv')  # REGION is replace by each region in the three output files) 
        # print '      mut freq write time: %.3f' % (time.time() - mute_start)
        # print ' %d / %d cached' % (self.mutefreqer.n_cached, self.mutefreqer.n_cached + self.mutefreqer.n_not_cached)
        for column in self.counts:
            index = None
            outfname = None
            if column == 'all':
                index = utils.index_columns
                outfname = base_outdir + '/' + utils.get_parameter_fname(column='all')
            elif '_content' in column:
                index = [column,]
                outfname = base_outdir + '/' + column + '.csv'
            else:
                index = [column,] + utils.column_dependencies[column]
                outfname = base_outdir + '/' + utils.get_parameter_fname(column_and_deps=index)
            if os.path.isfile(outfname):
                os.remove(outfname)
            elif not os.path.exists(base_outdir):
                os.makedirs(base_outdir)
            with opener('w')(outfname) as outfile:
                out_fieldnames = list(index)
                out_fieldnames.append('count')
                out_data = csv.DictWriter(outfile, out_fieldnames)
                out_data.writeheader()
                # NOTE this will in general not be sorted
                for key, count in self.counts[column].iteritems():
                    line = {}
                    for ic in range(len(key)):
                        line[index[ic]] = key[ic]
                    line['count'] = count
                    out_data.writerow(line)

        print '(%.1f sec)' % (time.time()-start)
    def split_input(self, n_procs, infname=None, info=None, prefix='sub'):
        """ 
        If <infname> is specified split the csv info from it into <n_procs> input files in subdirectories labelled with '<prefix>-' within <self.args.workdir>
        If <info> is specified, instead split the list <info> into pieces and return a list of the resulting lists
        """
        if info is None:
            assert infname is not None
            info = []
            with opener('r')(infname) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    info.append(line)
        else:
            assert infname is None  # make sure only *one* of 'em is specified
            outlists = []
        queries_per_proc = float(len(info)) / n_procs
        n_queries_per_proc = int(math.ceil(queries_per_proc))
        for iproc in range(n_procs):
            if infname is None:
                outlists.append([])
            else:
                subworkdir = self.args.workdir + '/' + prefix + '-' + str(
                    iproc)
                utils.prep_dir(subworkdir)
                sub_outfile = opener('w')(subworkdir + '/' +
                                          os.path.basename(infname))
                writer = csv.DictWriter(sub_outfile, reader.fieldnames)
                writer.writeheader()
            for iquery in range(iproc * n_queries_per_proc,
                                (iproc + 1) * n_queries_per_proc):
                if iquery >= len(info):
                    break
                if infname is None:
                    outlists[-1].append(info[iquery])
                else:
                    writer.writerow(info[iquery])

        if infname is None:
            return outlists
Esempio n. 40
0
 def finalize_tigger(self):
     utils.prep_dir(os.getenv('www') + '/partis/tmp', wildling='*.svg')
     for gene in self.counts:
         if utils.get_region(gene) != 'v':
             continue
         print '\n%s' % gene
         print ' position         x-icpt                  y-icpt                   slope              mut / total'
         mean_x_icpt = {'sum' : 0., 'total' : 0.}
         for position in sorted(self.counts[gene].keys()):
             self.freqs[gene][position]['tigger'] = self.tigger_calcs(position, self.counts[gene][position], mean_x_icpt)
         print mean_x_icpt
         if mean_x_icpt['total'] > 0.:
             print mean_x_icpt['sum'] / mean_x_icpt['total']
     assert False
     for gene in self.freqs:
         if utils.get_region(gene) != 'v':
             continue
         info = {p : self.freqs[gene][p]['tigger-fits'] for p in self.freqs[gene]}
         x_intercepts = [-v['intercept'] / v['slope'] for k, v in info.items() if v['intercept'] is not None and v['intercept'] < 0.3]
         print sorted(x_intercepts)
         print sum(x_intercepts) / float(len(x_intercepts))
         print numpy.median(x_intercepts)
Esempio n. 41
0
    def write_vdjalign_input(self, base_infname, n_procs):
        queries_per_proc = float(len(self.remaining_queries)) / n_procs
        n_queries_per_proc = int(math.ceil(queries_per_proc))
        if n_procs == 1:  # double check for rounding problems or whatnot
            assert n_queries_per_proc == len(self.remaining_queries)
        for iproc in range(n_procs):
            workdir = self.args.workdir
            if n_procs > 1:
                workdir += '/sw-' + str(iproc)
                utils.prep_dir(workdir)
            with opener('w')(workdir + '/' + base_infname) as sub_infile:
                for iquery in range(iproc * n_queries_per_proc,
                                    (iproc + 1) * n_queries_per_proc):
                    if iquery >= len(self.remaining_queries):
                        break
                    query_name = self.remaining_queries[iquery]
                    sub_infile.write('>' + query_name + ' NUKES\n')

                    seq = self.input_info[query_name]['seq']
                    if query_name in self.info['indels']:
                        seq = self.info['indels'][query_name][
                            'reversed_seq']  # use the query sequence with shm insertions and deletions reversed
                    sub_infile.write(seq + '\n')
 def __init__(self, germlines, plotdir, name):
     self.germlines = germlines
     self.plotdir = plotdir
     self.name = name
     utils.prep_dir(self.plotdir + '/plots', wildling=None, multilings=['*.csv', '*.svg', '*.root'])
     self.values = {}
     for column in utils.index_columns:
         if column == 'cdr3_length':  # kind of finicky to figure out what this is, so I don't always set it
             continue
         self.values[column] = {}
         if column in bool_columns:
             self.values[column]['right'] = 0
             self.values[column]['wrong'] = 0
     self.values['hamming_to_true_naive'] = {}
     for region in utils.regions:
         self.values[region + '_hamming_to_true_naive'] = {}
         self.values[region + '_hamming_to_true_naive_normed'] = {}
     # for bound in utils.boundaries:
     #     self.counts[bound + '_insertion_content'] = {'A':0, 'C':0, 'G':0, 'T':0}  # base content of each insertion
     # self.counts['seq_content'] = {'A':0, 'C':0, 'G':0, 'T':0}
     # n_bins, xmin, xmax = 100, 0.0, 1.0
     self.hists = {}
     self.hists['mute_freqs'] = Hist(30, -0.05, 0.05)
Esempio n. 43
0
    def __init__(self, args, base_plotdir):
        self.base_plotdir = base_plotdir

        self.eps_to_skip = 1e-3
        print 'skipping eps %f' % self.eps_to_skip

        plot_types = ('transitions', 'emissions')
        for ptype in plot_types:
            plotdir = self.base_plotdir + '/' + ptype
            utils.prep_dir(plotdir, wildlings=['*.png', '*.svg'])

        if args.hmmdir != None:
            filelist = glob.glob(args.hmmdir + '/*.yaml')
        else:
            filelist = utils.get_arg_list(args.infiles)
        if len(filelist) == 0:
            raise Exception('zero files passed to modelplotter')
        for infname in filelist:
            gene_name = os.path.basename(infname).replace('.yaml', '')  # the sanitized name, actually
            with open(infname) as infile:
                model = yaml.load(infile)
                self.make_transition_plot(gene_name, model)
                self.make_emission_plot(gene_name, model)
Esempio n. 44
0
    def write_vdjalign_input(self, base_infname):
        # first make a list of query names so we can iterate over an ordered collection
        ordered_info = []
        for query_name in self.input_info:
            ordered_info.append(query_name)

        queries_per_proc = float(len(self.input_info)) / self.args.n_procs
        n_queries_per_proc = int(math.ceil(queries_per_proc))
        if self.args.n_procs == 1:  # double check for rounding problems or whatnot
            assert n_queries_per_proc == len(self.input_info)
        for iproc in range(self.args.n_procs):
            workdir = self.args.workdir
            if self.args.n_procs > 1:
                workdir += '/sw-' + str(iproc)
                utils.prep_dir(workdir)
            infname = workdir + '/' + base_infname
            with opener('w')(workdir + '/' + base_infname) as sub_infile:
                for iquery in range(iproc*n_queries_per_proc, (iproc + 1)*n_queries_per_proc):
                    if iquery >= len(ordered_info):
                        break
                    query_name = ordered_info[iquery]
                    sub_infile.write('>' + str(query_name) + ' NUKES\n')
                    sub_infile.write(self.input_info[query_name]['seq'] + '\n')
Esempio n. 45
0
    def write_hmms(self, parameter_dir, sw_matches):
        print 'writing hmms with info from %s' % parameter_dir
        start = time.time()
        from hmmwriter import HmmWriter
        hmm_dir = parameter_dir + '/hmms'
        utils.prep_dir(hmm_dir, '*.yaml')

        gene_list = self.args.only_genes
        if gene_list == None:  # if specific genes weren't specified, do the ones for which we have matches
            gene_list = []
            for region in utils.regions:
                for gene in self.germline_seqs[region]:
                    if sw_matches == None or gene in sw_matches:  # shouldn't be None really, but I'm testing something
                        gene_list.append(gene)

        for gene in gene_list:
            if self.args.debug:
                print '  %s' % utils.color_gene(gene)
            writer = HmmWriter(parameter_dir, hmm_dir, gene, self.args.naivety,
                               self.germline_seqs[utils.get_region(gene)][gene],
                               self.args)
            writer.write()

        print '    time to write hmms: %.3f' % (time.time()-start)
Esempio n. 46
0
    def __init__(self, args, base_plotdir):
        self.base_plotdir = base_plotdir

        self.eps_to_skip = 1e-3
        print 'skipping eps %f' % self.eps_to_skip

        plot_types = ('transitions', 'emissions')
        for ptype in plot_types:
            plotdir = self.base_plotdir + '/' + ptype
            utils.prep_dir(plotdir, wildlings=['*.png', '*.svg'])

        if args.hmmdir != None:
            filelist = glob.glob(args.hmmdir + '/*.yaml')
        else:
            filelist = utils.get_arg_list(args.infiles)
        if len(filelist) == 0:
            raise Exception('zero files passed to modelplotter')
        for infname in filelist:
            gene_name = os.path.basename(infname).replace(
                '.yaml', '')  # the sanitized name, actually
            with open(infname) as infile:
                model = yaml.load(infile)
                self.make_transition_plot(gene_name, model)
                self.make_emission_plot(gene_name, model)
    def write_hmms(self, parameter_dir, sw_matches):
        print 'writing hmms with info from %s' % parameter_dir
        start = time.time()
        from hmmwriter import HmmWriter
        hmm_dir = parameter_dir + '/hmms'
        utils.prep_dir(hmm_dir, '*.yaml')

        gene_list = self.args.only_genes
        if gene_list == None:  # if specific genes weren't specified, do the ones for which we have matches
            gene_list = []
            for region in utils.regions:
                for gene in self.germline_seqs[region]:
                    if sw_matches == None or gene in sw_matches:  # shouldn't be None really, but I'm testing something
                        gene_list.append(gene)

        for gene in gene_list:
            if self.args.debug:
                print '  %s' % utils.color_gene(gene)
            writer = HmmWriter(
                parameter_dir, hmm_dir, gene, self.args.naivety,
                self.germline_seqs[utils.get_region(gene)][gene], self.args)
            writer.write()

        print '    time to write hmms: %.3f' % (time.time() - start)
Esempio n. 48
0
    def write_vdjalign_input(self, base_infname):
        # first make a list of query names so we can iterate over an ordered collection
        ordered_info = []
        for query_name in self.input_info:
            ordered_info.append(query_name)

        queries_per_proc = float(len(self.input_info)) / self.args.n_procs
        n_queries_per_proc = int(math.ceil(queries_per_proc))
        if self.args.n_procs == 1:  # double check for rounding problems or whatnot
            assert n_queries_per_proc == len(self.input_info)
        for iproc in range(self.args.n_procs):
            workdir = self.args.workdir
            if self.args.n_procs > 1:
                workdir += '/sw-' + str(iproc)
                utils.prep_dir(workdir)
            infname = workdir + '/' + base_infname
            with opener('w')(workdir + '/' + base_infname) as sub_infile:
                for iquery in range(iproc * n_queries_per_proc,
                                    (iproc + 1) * n_queries_per_proc):
                    if iquery >= len(ordered_info):
                        break
                    query_name = ordered_info[iquery]
                    sub_infile.write('>' + str(query_name) + ' NUKES\n')
                    sub_infile.write(self.input_info[query_name]['seq'] + '\n')
Esempio n. 49
0
    def split_input(self, n_procs, infname=None, info=None, prefix='sub'):
        """ 
        If <infname> is specified split the csv info from it into <n_procs> input files in subdirectories labelled with '<prefix>-' within <self.args.workdir>
        If <info> is specified, instead split the list <info> into pieces and return a list of the resulting lists
        """
        if info is None:
            assert infname is not None
            info = []
            with opener('r')(infname) as infile:
                reader = csv.DictReader(infile)
                for line in reader:
                    info.append(line)
        else:
            assert infname is None  # make sure only *one* of 'em is specified
            outlists = []
        queries_per_proc = float(len(info)) / n_procs
        n_queries_per_proc = int(math.ceil(queries_per_proc))
        for iproc in range(n_procs):
            if infname is None:
                outlists.append([])
            else:
                subworkdir = self.args.workdir + '/' + prefix + '-' + str(iproc)
                utils.prep_dir(subworkdir)
                sub_outfile = opener('w')(subworkdir + '/' + os.path.basename(infname))
                writer = csv.DictWriter(sub_outfile, reader.fieldnames)
                writer.writeheader()
            for iquery in range(iproc*n_queries_per_proc, (iproc + 1)*n_queries_per_proc):
                if iquery >= len(info):
                    break
                if infname is None:
                    outlists[-1].append(info[iquery])
                else:
                    writer.writerow(info[iquery])

        if infname is None:
            return outlists
Esempio n. 50
0
    def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None):
        if not self.finalized:
            self.finalize()

        plotdir = base_plotdir + '/mute-freqs'
        utils.prep_dir(plotdir + '/plots', multilings=('*.csv', '*.svg'))
        for region in utils.regions:
            utils.prep_dir(plotdir + '/' + region + '/plots', multilings=('*.csv', '*.svg'))
            utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))

        for gene in self.counts:
            counts, plotting_info = self.counts[gene], self.plotting_info[gene]
            sorted_positions = sorted(counts)
            hist = TH1D('hist_' + utils.sanitize_name(gene), '',
                        sorted_positions[-1] - sorted_positions[0] + 1,
                        sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5)
            for position in sorted_positions:
                hist.SetBinContent(hist.FindBin(position), counts[position]['freq'])
                hi_diff = abs(counts[position]['freq'] - counts[position]['freq_hi_err'])
                lo_diff = abs(counts[position]['freq'] - counts[position]['freq_lo_err'])
                err = 0.5*(hi_diff + lo_diff)
                hist.SetBinError(hist.FindBin(position), err)
            plotfname = plotdir + '/' + utils.get_region(gene) + '/plots/' + utils.sanitize_name(gene) + '.svg'
            xline = None
            if utils.get_region(gene) == 'v' and cyst_positions is not None:
                xline = cyst_positions[gene]['cysteine-position']
            elif utils.get_region(gene) == 'j' and tryp_positions is not None:
                xline = int(tryp_positions[gene])
            plotting.draw(hist, 'int', plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, draw_str='e')  #, cwidth=4000, cheight=1000)
            paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)

        # for region in utils.regions:
        #     utils.prep_dir(plotdir + '/' + region + '/tmp/plots', multilings=('*.csv', '*.svg'))
        # for gene in self.tmpcounts:
        #     for position in self.tmpcounts[gene]:
        #         roothist = plotting.make_hist_from_my_hist_class(self.tmpcounts[gene][position]['muted'], gene + '_' + str(position))
        #         plotting.draw(roothist, 'int', plotdir=plotdir + '/' + utils.get_region(gene) + '/tmp', plotname=utils.sanitize_name(gene) + '_' + str(position), errors=True, write_csv=True)  #, cwidth=4000, cheight=1000)

        # make mean mute freq hists
        hist = plotting.make_hist_from_my_hist_class(self.mean_rates['all'], 'all-mean-freq')
        plotting.draw(hist, 'float', plotname='all-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
        for region in utils.regions:
            hist = plotting.make_hist_from_my_hist_class(self.mean_rates[region], region+'-mean-freq')
            plotting.draw(hist, 'float', plotname=region+'-mean-freq', plotdir=plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True)
        check_call(['./bin/makeHtml', plotdir, '3', 'null', 'svg'])

        # then write html file and fix permissiions
        for region in utils.regions:
            check_call(['./bin/makeHtml', plotdir + '/' + region, '1', 'null', 'svg'])
            check_call(['./bin/makeHtml', plotdir + '/' + region + '-per-base', '1', 'null', 'png'])
        check_call(['./bin/permissify-www', plotdir])  # NOTE this should really permissify starting a few directories higher up
Esempio n. 51
0
    def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False):
        if not self.finalized:
            self.finalize()

        plotdir = base_plotdir + '/mute-freqs'
        overall_plotdir = plotdir + '/overall'
        utils.prep_dir(overall_plotdir, multilings=('*.csv', '*.svg'))
        for region in utils.regions:
            utils.prep_dir(plotdir + '/' + region, multilings=('*.csv', '*.svg'))
            # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png'))
        if self.tigger:
            utils.prep_dir(plotdir + '/tigger', multilings=('*.csv', '*.svg'))

        for gene in self.freqs:
            freqs = self.freqs[gene]
            sorted_positions = sorted(freqs.keys())
            genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='fixme', ytitle='fixme')  #, title=utils.sanitize_name(gene))
            for position in sorted_positions:
                hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err'])
                lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err'])
                err = 0.5*(hi_diff + lo_diff)
                genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err)
            xline = None
            figsize = [3, 3]
            if utils.get_region(gene) == 'v' and cyst_positions is not None:
                xline = cyst_positions[gene]
                figsize[0] *= 3.5
            elif utils.get_region(gene) == 'j' and tryp_positions is not None:
                xline = tryp_positions[gene]
                figsize[0] *= 2
            plotting.draw_no_root(genehist, plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv)
            # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info)  # needs translation to mpl

        # make mean mute freq hists
        plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)
        for region in utils.regions:
            plotting.draw_no_root(self.mean_rates[region], plotname=region+'-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv)

        if self.tigger:
            self.tigger_plot(only_csv)

        if not only_csv:  # write html file and fix permissiions
            plotting.make_html(overall_plotdir)
            for region in utils.regions:
                plotting.make_html(plotdir + '/' + region, n_columns=1)
    def __init__(self, args, seed, sublabel=None, total_length_from_right=-1):
        self.args = args

        if sublabel == None:
            self.workdir = self.args.workdir + '/recombinator'
            self.outfname = self.args.outfname
        else:  # need a separate workdir for each subprocess
            self.workdir = self.args.workdir + '/recombinator-' + sublabel
            self.outfname = self.workdir + '/' + os.path.basename(
                self.args.outfname)

        utils.prep_dir(self.workdir)
        if not os.path.exists(self.args.parameter_dir):
            raise Exception('ERROR ' + self.args.parameter_dir + ' d.n.e')

        # parameters that control recombination, erosion, and whatnot
        self.total_length_from_right = total_length_from_right  # measured from right edge of j, only write to file this much of the sequence (our read lengths are 130 by this def'n a.t.m.)

        self.all_seqs = {}  # all the Vs, all the Ds...
        self.index_keys = {
        }  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.version_freq_table = {
        }  # list of the probabilities with which each VDJ combo appears in data
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        # first read info that doesn't depend on which person we're looking at
        self.all_seqs = utils.read_germlines(self.args.datadir)
        with opener('r')(
                self.args.datadir + '/v-meta.json'
        ) as json_file:  # get location of <begin> cysteine in each v region
            self.cyst_positions = json.load(json_file)
        with opener('r')(
                self.args.datadir + '/j_tryp.csv'
        ) as csv_file:  # get location of <end> tryptophan in each j region (TGG)
            tryp_reader = csv.reader(csv_file)
            self.tryp_positions = {
                row[0]: row[1]
                for row in tryp_reader
            }  # WARNING: this doesn't filter out the header line

        # then read stuff that's specific to each person
        self.read_vdj_version_freqs(self.args.parameter_dir + '/' +
                                    utils.get_parameter_fname('all'))
        self.read_insertion_content()
        if self.args.naivety == 'M':  # read shm info if non-naive is requested
            # NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
            with opener('r')(
                    self.args.gtrfname) as gtrfile:  # read gtr parameters
                reader = csv.DictReader(gtrfile)
                for line in reader:
                    parameters = line['parameter'].split('.')
                    region = parameters[0][3].lower()
                    assert region == 'v' or region == 'd' or region == 'j'
                    model = parameters[1].lower()
                    parameter_name = parameters[2]
                    assert model in self.mute_models[region]
                    self.mute_models[region][model][parameter_name] = line[
                        'value']
            treegen = treegenerator.TreeGenerator(args,
                                                  self.args.parameter_dir,
                                                  seed=seed)
            self.treefname = self.workdir + '/trees.tre'
            treegen.generate_trees(seed, self.treefname)
            with opener('r')(
                    self.treefname
            ) as treefile:  # read in the trees (and other info) that we just generated
                self.treeinfo = treefile.readlines()
            if not self.args.no_clean:
                os.remove(self.treefname)

        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(
                self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))
Esempio n. 53
0
 def clean_plots(self, plotdir):
     for substr in self.subplotdirs:
         utils.prep_dir(plotdir + '/' + substr,
                        wildlings=('*.csv', '*.svg'))
Esempio n. 54
0
                    default='partis')
parser.add_argument('--overwrite', action='store_true')
parser.add_argument('--igbdir', default='./packages/ncbi-igblast-1.6.1/bin')
parser.add_argument('--glfo-dir')
parser.add_argument('--simulation-germline-dir')
parser.add_argument('--locus', default='igh')
parser.add_argument('--region', default='v')
parser.add_argument('--species', default='human')
parser.add_argument('--slurm', action='store_true')
parser.add_argument('--changeo-path', default=os.getenv('HOME') + '/.local')
parser.add_argument('--condapath',
                    default=os.getenv('HOME') + '/miniconda3/bin')
args = parser.parse_args()

# ----------------------------------------------------------------------------------------
outdir = os.path.dirname(
    args.outfname
)  # kind of annoying having <args.workdir> and <outdir>, but the former is for stuff we don't want to keep (not much...  maybe just .cmd file), and the latter is for stuff we do
assert outdir.split('/')[-1] == args.locus
outdir = outdir.rstrip('/' + args.locus)

utils.prep_dir(args.workdir, wildlings=['*.cmd', '*.fa', '*.sh'])  #'*.fmt7'])
utils.prep_dir(outdir, allow_other_files=True)

initialize_germline_info(outdir)  # deal with igblast germline crap
outfname = run_alignment(
    args, outdir)  # get the alignments, either with igblast or partis
run_tigger(outfname, args.outfname, outdir)

os.rmdir(args.workdir)
Esempio n. 55
0
    utils.print_reco_event(annotations[uid])

n_above_cutoff = len(
    [_ for cfo in chfo.values() if cfo['max_abs_diff'] > args.cutoff])
chimeric_fraction = n_above_cutoff / float(len(chfo))
print '  %d / %d = %.3f above chimeric cutoff' % (n_above_cutoff, len(chfo),
                                                  chimeric_fraction)

hmaxval = Hist(45, 0., 0.65)
for uid in annotations:
    hmaxval.fill(chfo[uid]['max_abs_diff'])
himax = Hist(75, 0., 400)
for uid in annotations:
    himax.fill(chfo[uid]['imax'])

utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv'])

import matplotlib
from matplotlib import pyplot as plt
fig, ax = plotting.mpl_init()
xvals, yvals = zip(*[(v['imax'], v['max_abs_diff']) for v in chfo.values()])
plt.scatter(xvals, yvals, alpha=0.4)

print 'writing to %s' % args.plotdir
plotting.mpl_finish(ax,
                    args.plotdir,
                    'hexbin',
                    title=args.title,
                    xlabel='break point',
                    ylabel='abs mfreq diff')
Esempio n. 56
0
    def __init__(
        self, args, glfo, seed, workdir, outfname
    ):  # NOTE <gldir> is not in general the same as <args.initial_germline_dir> # rm workdir
        self.args = args
        self.glfo = glfo

        # NOTE in general *not* the same as <self.args.workdir> and <self.args.outfname>
        self.workdir = tempfile.mkdtemp()
        self.outfname = outfname
        utils.prep_dir(self.workdir)

        # set <self.parameter_dir> (note that this is in general *not* the same as self.args.parameter_dir)
        if self.args.rearrange_from_scratch:  # currently not allowed to mutate from scratch without also rearranging from scratch (enforced in bin/partis)
            if self.args.mutate_from_scratch:
                self.parameter_dir = None
            else:
                self.parameter_dir = self.args.scratch_mute_freq_dir  # if you make up mute freqs from scratch, unless you're really careful you tend to get nonsense results for a lot of things (e.g. allele finding). So it's easier to copy over a reasonable set of mut freq parameters from somewhere.
        else:
            self.parameter_dir = self.args.parameter_dir + '/' + self.args.parameter_type

        self.index_keys = {
        }  # this is kind of hackey, but I suspect indexing my huge table of freqs with a tuple is better than a dict
        self.mute_models = {}
        # self.treeinfo = []  # list of newick-formatted tree strings with region-specific branch info tacked at the end
        for region in utils.regions:
            self.mute_models[region] = {}
            for model in ['gtr', 'gamma']:
                self.mute_models[region][model] = {}

        self.allele_prevalence_freqs = glutils.read_allele_prevalence_freqs(
            args.allele_prevalence_fname
        ) if args.allele_prevalence_fname is not None else {}
        self.version_freq_table = self.read_vdj_version_freqs(
        )  # list of the probabilities with which each VDJ combo (plus other rearrangement parameters) appears in data (none if rearranging from scratch)
        self.insertion_content_probs = self.read_insertion_content(
        )  # dummy/uniform if rearranging from scratch
        self.all_mute_freqs = {}

        # read shm info NOTE I'm not inferring the gtr parameters a.t.m., so I'm just (very wrongly) using the same ones for all individuals
        with open(self.args.gtrfname, 'r') as gtrfile:  # read gtr parameters
            reader = csv.DictReader(gtrfile)
            for line in reader:
                parameters = line['parameter'].split('.')
                region = parameters[0][3].lower()
                assert region == 'v' or region == 'd' or region == 'j'
                model = parameters[1].lower()
                parameter_name = parameters[2]
                assert model in self.mute_models[region]
                self.mute_models[region][model][parameter_name] = line['value']
        treegen = treegenerator.TreeGenerator(args,
                                              self.parameter_dir,
                                              seed=seed)
        self.treefname = self.workdir + '/trees.tre'
        treegen.generate_trees(
            seed, self.treefname
        )  # NOTE not really a newick file, since I hack on the per-region branch length info at the end of each line
        with open(
                self.treefname, 'r'
        ) as treefile:  # read in the trees (and other info) that we just generated
            self.treeinfo = treefile.readlines()
        os.remove(self.treefname)

        if os.path.exists(self.outfname):
            os.remove(self.outfname)
        elif not os.path.exists(os.path.dirname(os.path.abspath(
                self.outfname))):
            os.makedirs(os.path.dirname(os.path.abspath(self.outfname)))

        self.validation_values = {
            'heights':
            {t: {
                'in': [],
                'out': []
            }
             for t in ['all'] + utils.regions}
        }
Esempio n. 57
0
def get_gls_gen_annotation_performance_plots(args, region, baseoutdir):
    assert region == 'v'  # needs to be implemented
    import plotting
    import plotconfig
    methcolors = {  # NOTE started from scolors in bin/plot-gl-set-trees.py htmlcolorcods.com, and slide each one a little rightward
        'tigger-default' : '#dd4d39',
        'igdiscover' : '#55ab7a', #60ac84',
        'partis' : '#6b83ca', #758bcd',
        'full' : '#858585',
    }
    lstyledict = {}  # 'tigger-default' : '--'}
    lwdict = {'full' : 9, 'igdiscover' : 8, 'partis' : 5, 'tigger-default' : 2}  # methods are sorted below, so it's always [full, igdiscover, partis, tigger]
    linewidths = [lwdict[m] for m in args.methods]
    colors = [methcolors[meth] for meth in args.methods]
    linestyles = [lstyledict.get(m, '-') for m in args.methods]
    alphas = [0.8 if m in ['full', 'igdiscover'] else 1 for m in args.methods]

    varname = args.action
    varval = 'simu'
    plotnames = ['v_hamming_to_true_naive', 'v_muted_bases']
    xtitles = ['V distance to true naive', 'inferred - true']
    meanvals = {pn : {m : [] for m in args.methods} for pn in plotnames}
    print '  annotations: %s' % get_outdir(args, baseoutdir, varname, varval, n_events=args.gls_gen_events)
    all_hists = {pn : [] for pn in plotnames}
    for iproc in range(args.iteststart, args.n_tests):
        outdir = get_outdir(args, baseoutdir, varname, varval, n_events=args.gls_gen_events) + '/' + str(iproc)  # duplicates code in bin/test-germline-inference.py
        plotdir = outdir + '/annotation-performance-plots'
        print '    %s' % plotdir
        if not args.only_print:
            utils.prep_dir(plotdir, wildlings=['*.png', '*.svg', '*.csv'])

        # shenanigans for the six (three easy and thre hard) of 'em that go in the paper pdf
        make_legend = (iproc > 2) or (iproc == 0) # and args.gls_gen_difficulty == 'easy')
        make_xtitle = (iproc > 2) or (iproc == 2)
        make_ytitle = (iproc > 2) or (args.gls_gen_difficulty == 'easy')

        for plotname in plotnames:
            hfnames = {meth : get_gls_fname(region, outdir, meth, args.locus, annotation_performance_plots=True) for meth in args.methods}
            for hfn in hfnames.values():
                if not os.path.exists(hfn):
                    raise Exception('%s d.n.e.: need to first run non-plotting (without --plot) --annotation-performance-plots (which involves re-running partis, I think the difference being partis is now running with --plot-annotation-performance' % hfn)
            hists = {meth : Hist(fname=hfnames[meth] + '/' + plotname + '.csv', title=methstr(meth) if make_legend else None) for meth in args.methods}
            for meth in args.methods:
                if hists[meth].overflow_contents() != 0.0:
                    print '  %s %s non-zero under/overflow %f' % (utils.color('red', 'error'), methstr(meth), hists[meth].overflow_contents())
                meanvals[plotname][meth].append(hists[meth].get_mean())
            if args.only_print:
                continue
            plotting.draw_no_root(hists[args.methods[0]], log='y', plotdir=plotdir, plotname=plotname, more_hists=[hists[m] for m in args.methods[1:]], colors=colors, ytitle='sequences' if make_ytitle else None,
                                  xtitle=xtitles[plotnames.index(plotname)] if make_xtitle else '',
                                  plottitle=gls_sim_str(args.gls_gen_difficulty, iproc),
                                  linewidths=linewidths, linestyles=linestyles, alphas=alphas, remove_empty_bins=True, square_bins=True)
            all_hists[plotname].append(hists)

    print '  total plots'
    plotdir = get_outdir(args, baseoutdir, varname, varval, n_events=args.gls_gen_events) + '/annotation-performance-plots'
    print '    %s' % plotdir
    if not args.only_print:
        utils.prep_dir(plotdir, wildlings=['*.png', '*.svg', '*.csv'])
    for plotname in plotnames:
        total_hists = {}
        for meth in args.methods:
            xmin = min([hdict[meth].xmin for hdict in all_hists[plotname]])
            xmax = max([hdict[meth].xmax for hdict in all_hists[plotname]])
            total_hists[meth] = Hist(xmax - xmin, xmin, xmax, title=all_hists[plotname][0][meth].title)
            for hdict in all_hists[plotname]:
                assert hdict[meth].integral(include_overflows=True) > 100  # make sure it isn't normalized (this is a shitty way to do this)
                bin_centers = hdict[meth].get_bin_centers()
                for ibin in range(len(hdict[meth].low_edges)):
                    xval = bin_centers[ibin]
                    for _ in range(int(hdict[meth].bin_contents[ibin])):
                        total_hists[meth].fill(xval)
        plotting.draw_no_root(total_hists[args.methods[0]], log='y', plotdir=plotdir, plotname='total-' + plotname, more_hists=[total_hists[m] for m in args.methods[1:]], colors=colors, ytitle='sequences' if make_ytitle else None,
                              xtitle=xtitles[plotnames.index(plotname)],
                              plottitle=gls_sim_str(args.gls_gen_difficulty, iproc=''),
                              linewidths=linewidths, linestyles=linestyles, alphas=alphas, remove_empty_bins=True, square_bins=True)

    for plotname in plotnames:
        if 'muted_bases' in plotname:  #  mean value isn't meaningful
            continue
        print plotname
        for meth in args.methods:
            mean = float(sum(meanvals[plotname][meth])) / len(meanvals[plotname][meth])
            err = numpy.std(meanvals[plotname][meth], ddof=1) / math.sqrt(len(meanvals[plotname][meth]))
            print '   %15s  %6.3f / %d = %6.2f +/- %6.2f' % (methstr(meth), sum(meanvals[plotname][meth]), len(meanvals[plotname][meth]), mean, err)
Esempio n. 58
0
 def clean_plots(self, plotdir):
     for substr in ['overall', ] + utils.regions:  # + [r + '-per-base' for r in utils.regions]:
         utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg'))