def plot(self, plotdir, only_csv=False): utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root']) for column in self.values: if self.only_correct_gene_fractions and column not in bool_columns: continue if column in bool_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] errs = fraction_uncertainty.err(right, right+wrong) print ' %s\n correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1]) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv) else: # TODO this is dumb... I should make the integer-valued ones histograms as well hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True) log = '' if column.find('hamming_to_true_naive') >= 0: # TODO why doesn't this just use the config dicts in plotheaders or wherever? hist.title = 'hamming distance' else: hist.title = 'inferred - true' plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) for column in self.hists: plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) if not only_csv: plotting.make_html(plotdir)
def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None): print ' plotting partitions' sys.stdout.flush() start = time.time() for subdir in self.subplotdirs: utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg']) if partition is not None: # one partition assert infiles is None assert annotations is not None csize_hists = {'best' : plotting.get_cluster_size_hist(partition)} self.plot_within_vs_between_hists(partition, annotations, plotdir) elif infiles is not None: # plot the mean of a partition from each file subset_hists = [] for fname in infiles: cp = ClusterPath() cp.readfile(fname) subset_hists.append(plotting.get_cluster_size_hist(cp.partitions[cp.i_best])) csize_hists = {'best' : plotting.make_mean_hist(subset_hists)} for ih in range(len(subset_hists)): subset_hists[ih].write(plotdir + ('/subset-%d-cluster-sizes.csv' % ih)) else: assert False plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x') if not only_csv: for subdir in self.subplotdirs: plotting.make_html(plotdir + '/' + subdir) print '(%.1f sec)' % (time.time()-start)
def plot(self, plotdir, only_csv=False): print ' plotting performance', import fraction_uncertainty import plotting start = time.time() for substr in self.subplotdirs: utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg')) for column in self.values: if column in plotconfig.gene_usage_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] lo, hi = fraction_uncertainty.err(right, right + wrong) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv) else: hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False) if 'hamming_to_true_naive' in column: xtitle = 'hamming distance' tmpplotdir = plotdir + '/mutation' else: xtitle = 'inferred - true' if 'muted' in column: tmpplotdir = plotdir + '/mutation' else: tmpplotdir = plotdir + '/boundaries' plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True) for column in self.hists: if '_vs_mute_freq' in column or '_vs_per_gene_support' in column: # only really care about the fraction, which we plot below continue plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True) # fraction correct vs mute freq for region in utils.regions: hright = self.hists[region + '_gene_right_vs_mute_freq'] hwrong = self.hists[region + '_gene_wrong_vs_mute_freq'] if hright.integral(include_overflows=True) == 0: continue plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True) # per-gene support stuff for region in utils.regions: if 
self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0: continue hright = self.hists[region + '_allele_right_vs_per_gene_support'] hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support'] plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True) if not only_csv: # write html file and fix permissiions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr, n_columns=4) print '(%.1f sec)' % (time.time()-start)
def plot(self, plotdir, only_csv=False): print ' plotting performance', start = time.time() for substr in self.subplotdirs: utils.prep_dir(plotdir + '/' + substr, wildlings=('*.csv', '*.svg')) for column in self.values: if column in bool_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] lo, hi, _ = fraction_uncertainty.err(right, right + wrong) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir + '/gene-call', write_csv=True, stats='0-bin', only_csv=only_csv) else: hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=False) if 'hamming_to_true_naive' in column: xtitle = 'hamming distance' tmpplotdir = plotdir + '/mutation' else: xtitle = 'inferred - true' if 'muted' in column: tmpplotdir = plotdir + '/mutation' else: tmpplotdir = plotdir + '/boundaries' plotting.draw_no_root(hist, plotname=column, plotdir=tmpplotdir, write_csv=True, only_csv=only_csv, xtitle=xtitle, shift_overflows=True) for column in self.hists: if '_vs_mute_freq' in column or '_vs_per_gene_support' in column: # only really care about the fraction, which we plot below continue plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir + '/mutation', write_csv=True, only_csv=only_csv, ytitle='counts', xtitle='inferred - true', shift_overflows=True) # fraction correct vs mute freq for region in utils.regions: hright = self.hists[region + '_gene_right_vs_mute_freq'] hwrong = self.hists[region + '_gene_wrong_vs_mute_freq'] if hright.integral(include_overflows=True) == 0: continue plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_fraction_correct_vs_mute_freq', xlabel='mut freq', ylabel='fraction correct up to allele', xbounds=(0., 0.5), only_csv=only_csv, write_csv=True) # per-gene support stuff for region in utils.regions: if self.hists[region + 
'_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0: continue hright = self.hists[region + '_allele_right_vs_per_gene_support'] hwrong = self.hists[region + '_allele_wrong_vs_per_gene_support'] plotting.make_fraction_plot(hright, hwrong, plotdir + '/gene-call', region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction with correct allele', xbounds=(-0.1, 1.1), only_csv=only_csv, write_csv=True) if not only_csv: # write html file and fix permissiions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr, n_columns=4) print '(%.1f sec)' % (time.time()-start)
def plot(self, plotdir, partition=None, infiles=None, annotations=None, only_csv=None): import plotting print ' plotting partitions' sys.stdout.flush() start = time.time() for subdir in self.subplotdirs: utils.prep_dir(plotdir + '/' + subdir, wildlings=['*.csv', '*.svg']) fnames = [] if partition is not None: # one partition assert infiles is None assert annotations is not None csize_hists = {'best': plotting.get_cluster_size_hist(partition)} # self.plot_within_vs_between_hists(partition, annotations, plotdir) fnames += self.plot_size_vs_shm(partition, annotations, plotdir) elif infiles is not None: # plot the mean of a partition from each file subset_hists = [] for fname in infiles: cp = ClusterPath() cp.readfile(fname) subset_hists.append( plotting.get_cluster_size_hist(cp.partitions[cp.i_best])) csize_hists = {'best': plotting.make_mean_hist(subset_hists)} for ih in range(len(subset_hists)): subset_hists[ih].write(plotdir + ('/subset-%d-cluster-sizes.csv' % ih)) else: assert False plotting.plot_cluster_size_hists(plotdir + '/overall/cluster-sizes.svg', csize_hists, title='', log='x') fnames.append(['cluster-sizes.svg']) if not only_csv: for subdir in self.subplotdirs: plotting.make_html(plotdir + '/' + subdir, fnames=fnames, new_table_each_row=True) print '(%.1f sec)' % (time.time() - start)
def plot(self, plotdir, only_csv=False, only_overall=False): import plotting print ' plotting parameters', sys.stdout.flush() start = time.time() self.clean_plots(plotdir) self.mfreqer.plot(plotdir + '/mute-freqs', only_csv=only_csv, only_overall=only_overall) overall_plotdir = plotdir + '/overall' for column in self.counts: if column == 'all': continue values, gene_values = {}, {} for index, count in self.counts[column].iteritems(): column_val = index[0] if column_val not in values: values[column_val] = 0.0 values[column_val] += count if column in self.columns_to_subset_by_gene: gene = index[1] # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it utils.split_gene(gene) # checks validity of gene if gene not in gene_values: gene_values[gene] = {} if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count var_type = 'string' if column in self.string_columns else 'int' hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True) plotting.draw_no_root(hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv) if column in self.columns_to_subset_by_gene and not only_overall: thisplotdir = plotdir + '/' + column for gene in gene_values: plotname = utils.sanitize_name(gene) + '-' + column hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True) plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get(column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv) if not only_csv: plotting.make_html(thisplotdir) if not only_csv: plotting.make_html(overall_plotdir) print '(%.1f sec)' % (time.time()-start)
def plot(self, plotdir, only_csv=False, only_overall=False): if not self.finalized: self.finalize() overall_plotdir = plotdir + '/overall' for gene in self.freqs: if only_overall: continue freqs = self.freqs[gene] if len(freqs) == 0: if gene not in glutils.dummy_d_genes.values(): print ' %s no mutefreqer obs for %s' % (utils.color('red', 'warning'), utils.color_gene(gene)) continue sorted_positions = sorted(freqs.keys()) genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='position', ytitle='mut freq', title=gene) for position in sorted_positions: hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err']) lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err']) err = 0.5*(hi_diff + lo_diff) genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err) xline = None figsize = [7, 4] if utils.get_region(gene) in utils.conserved_codons[self.glfo['chain']]: codon = utils.conserved_codons[self.glfo['chain']][utils.get_region(gene)] xline = self.glfo[codon + '-positions'][gene] if utils.get_region(gene) == 'v': figsize[0] *= 3.5 elif utils.get_region(gene) == 'j': figsize[0] *= 2 plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True) # per-position plots: plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True) # # per-position, per-base plots: # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info) # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment # make mean mute freq hists for rstr in 
['all', 'cdr3'] + utils.regions: if rstr == 'all': bounds = (0.0, 0.4) else: bounds = (0.0, 0.6 if rstr == 'd' else 0.4) plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr+'_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True) plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr+'_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True) if not only_csv: # write html file and fix permissiions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr)
def compare_directories(args, plotdirlist, outdir):
    """Overlay the hists found in each dir of <plotdirlist>, writing one comparison plot per variable to <outdir>.

    The i-th directory is labeled with args.names[i]. A variable that is missing
    from a directory is represented there by a one-bin hist titled 'null'.
    """
    utils.prep_dir(outdir, wildlings=['*.png', '*.svg', '*.csv'])

    # read the hists from each directory, keeping track of every variable that shows up anywhere
    allhists = OrderedDict()
    allvars = set()
    for idir, dirname in enumerate(plotdirlist):
        label = args.names[idir]
        dirhists = get_hists_from_dir(dirname, label)
        allvars.update(dirhists.keys())
        allhists[label] = dirhists

    # then make one plot for each variable
    for varname in allvars:
        hlist = []
        for dirhists in allhists.values():
            hlist.append(dirhists.get(varname, Hist(1, 0, 1, title='null')))
        plot_single_variable(args, varname, hlist, outdir, pathnameclues=plotdirlist[0])

    plotting.make_html(outdir, n_columns=4)
def plot(self, base_plotdir, cyst_positions=None, tryp_positions=None, only_csv=False): if not self.finalized: self.finalize() plotdir = base_plotdir + '/mute-freqs' overall_plotdir = plotdir + '/overall' utils.prep_dir(overall_plotdir, multilings=('*.csv', '*.svg')) for region in utils.regions: utils.prep_dir(plotdir + '/' + region, multilings=('*.csv', '*.svg')) # utils.prep_dir(plotdir + '/' + region + '-per-base/plots', multilings=('*.csv', '*.png')) if self.tigger: utils.prep_dir(plotdir + '/tigger', multilings=('*.csv', '*.svg')) for gene in self.freqs: freqs = self.freqs[gene] sorted_positions = sorted(freqs.keys()) genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='fixme', ytitle='fixme') #, title=utils.sanitize_name(gene)) for position in sorted_positions: hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err']) lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err']) err = 0.5*(hi_diff + lo_diff) genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err) xline = None figsize = [3, 3] if utils.get_region(gene) == 'v' and cyst_positions is not None: xline = cyst_positions[gene] figsize[0] *= 3.5 elif utils.get_region(gene) == 'j' and tryp_positions is not None: xline = tryp_positions[gene] figsize[0] *= 2 plotting.draw_no_root(genehist, plotdir=plotdir + '/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv) # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info) # needs translation to mpl # make mean mute freq hists plotting.draw_no_root(self.mean_rates['all'], plotname='all-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv) for region in utils.regions: plotting.draw_no_root(self.mean_rates[region], 
plotname=region+'-mean-freq', plotdir=overall_plotdir, stats='mean', bounds=(0.0, 0.4), write_csv=True, only_csv=only_csv) if self.tigger: self.tigger_plot(only_csv) if not only_csv: # write html file and fix permissiions plotting.make_html(overall_plotdir) for region in utils.regions: plotting.make_html(plotdir + '/' + region, n_columns=1)
def plot(self, plotdir, subset_by_gene=False, cyst_positions=None, tryp_positions=None, only_csv=False): print ' plotting parameters', sys.stdout.flush() start = time.time() self.clean_plots(plotdir, subset_by_gene) self.mfreqer.plot(plotdir + '/mute-freqs', cyst_positions, tryp_positions, only_csv=only_csv) #, mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv') # REGION is replace by each region in the three output files overall_plotdir = plotdir + '/overall' for column in self.counts: if column == 'all': continue values, gene_values = {}, {} if len(self.counts[column]) == 0: raise Exception('no counts in %s' % column) for index, count in self.counts[column].iteritems(): gene = None if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene if '_del' in column: region = column[0] else: region = column[1] assert region in utils.regions assert 'IGH' + region.upper() in index[1] # NOTE this is hackey, but it works find now and will fail obviously gene = index[1] # if I ever change the correlations to be incompatible. 
so screw it if gene not in gene_values: gene_values[gene] = {} column_val = index[0] if gene is not None: if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count if column_val not in values: values[column_val] = 0.0 values[column_val] += count try: # figure out whether this is an integer or string (only used outside this loop when we make the plots) int(column_val) var_type = 'int' except: var_type = 'string' if subset_by_gene and ('_del' in column or column == 'vd_insertion' or column == 'dj_insertion'): # option to subset deletion and (real) insertion plots by gene thisplotdir = plotdir + '/' + column for gene in gene_values: plotname = utils.sanitize_name(gene) + '-' + column hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True) plotting.draw_no_root(hist, plotname=plotname, plotdir=thisplotdir, errors=True, write_csv=True, only_csv=only_csv) if not only_csv: plotting.make_html(thisplotdir) plotname = column hist = plotting.make_hist_from_dict_of_counts(values, var_type, plotname, sort=True) plotting.draw_no_root(hist, plotname=plotname, plotdir=overall_plotdir, errors=True, write_csv=True, only_csv=only_csv) if not only_csv: plotting.make_html(overall_plotdir) print '(%.1f sec)' % (time.time()-start)
def plot(self, plotdir, only_csv=False, only_overall=False): import plotting if not self.finalized: self.finalize() overall_plotdir = plotdir + '/overall' for gene in self.freqs: if only_overall: continue freqs = self.freqs[gene] if len(freqs) == 0: if gene not in glutils.dummy_d_genes.values(): print ' %s no mutefreqer obs for %s' % (utils.color( 'red', 'warning'), utils.color_gene(gene)) continue sorted_positions = sorted(freqs.keys()) genehist = Hist(sorted_positions[-1] - sorted_positions[0] + 1, sorted_positions[0] - 0.5, sorted_positions[-1] + 0.5, xtitle='position', ytitle='mut freq', title=gene) for position in sorted_positions: hi_diff = abs(freqs[position]['freq'] - freqs[position]['freq_hi_err']) lo_diff = abs(freqs[position]['freq'] - freqs[position]['freq_lo_err']) err = 0.5 * (hi_diff + lo_diff) genehist.set_ibin(genehist.find_bin(position), freqs[position]['freq'], error=err) xline = None figsize = [7, 4] if utils.get_region(gene) in utils.conserved_codons[ self.glfo['locus']]: xline = utils.cdn_pos(self.glfo, utils.get_region(gene), gene) if utils.get_region(gene) == 'v': figsize[0] *= 3.5 elif utils.get_region(gene) == 'j': figsize[0] *= 2 plotting.draw_no_root(self.per_gene_mean_rates[gene], plotdir=plotdir + '/per-gene/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, only_csv=only_csv, shift_overflows=True) # per-position plots: plotting.draw_no_root(genehist, plotdir=plotdir + '/per-gene-per-position/' + utils.get_region(gene), plotname=utils.sanitize_name(gene), errors=True, write_csv=True, xline=xline, figsize=figsize, only_csv=only_csv, shift_overflows=True) # # per-position, per-base plots: # paramutils.make_mutefreq_plot(plotdir + '/' + utils.get_region(gene) + '-per-base', utils.sanitize_name(gene), plotting_info) # needs translation to mpl UPDATE fcn is fixed, but I can't be bothered uncommenting this at the moment # make mean mute freq hists for rstr in ['all', 'cdr3'] + utils.regions: if rstr 
== 'all': bounds = (0.0, 0.4) else: bounds = (0.0, 0.6 if rstr == 'd' else 0.4) plotting.draw_no_root(self.mean_rates[rstr], plotname=rstr + '_mean-freq', plotdir=overall_plotdir, stats='mean', bounds=bounds, write_csv=True, only_csv=only_csv, shift_overflows=True) plotting.draw_no_root(self.mean_n_muted[rstr], plotname=rstr + '_mean-n-muted', plotdir=overall_plotdir, stats='mean', write_csv=True, only_csv=only_csv, shift_overflows=True) if not only_csv: # write html file and fix permissiions for substr in self.subplotdirs: plotting.make_html(plotdir + '/' + substr)
def plot(self, plotdir, only_csv=False): utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root']) for column in self.values: if self.only_correct_gene_fractions and column not in bool_columns: continue if column in bool_columns: right = self.values[column]['right'] wrong = self.values[column]['wrong'] errs = fraction_uncertainty.err(right, right+wrong) print ' %s\n correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right+wrong, float(right) / (right + wrong), errs[0], errs[1]) hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column) plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv) else: # TODO this is dumb... I should make the integer-valued ones histograms as well hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True) log = '' if column.find('hamming_to_true_naive') >= 0: # TODO why doesn't this just use the config dicts in plotheaders or wherever? hist.title = 'hamming distance' else: hist.title = 'inferred - true' plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) for column in self.hists: plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv) # per-gene support crap for region in utils.regions: if self.hists[region + '_allele_right_vs_per_gene_support'].integral(include_overflows=True) == 0: continue xvals = self.hists[region + '_allele_right_vs_per_gene_support'].get_bin_centers() #ignore_overflows=True) right = self.hists[region + '_allele_right_vs_per_gene_support'].bin_contents wrong = self.hists[region + '_allele_wrong_vs_per_gene_support'].bin_contents yvals = [float(r) / (r + w) if r + w > 0. else 0. for r, w in zip(right, wrong)] # remove values corresponding to bins with no entries while yvals.count(0.) > 0: iv = yvals.index(0.) 
xvals.pop(iv) right.pop(iv) wrong.pop(iv) yvals.pop(iv) tmphilos = [fraction_uncertainty.err(r, r + w) for r, w in zip(right, wrong)] yerrs = [err[1] - err[0] for err in tmphilos] # fitting a line isn't particularly informative, actually # params, cov = numpy.polyfit(xvals, yvals, 1, w=[1./(e*e) if e > 0. else 0. for e in yerrs], cov=True) # slope, slope_err = params[0], math.sqrt(cov[0][0]) # y_icpt, y_icpt_err = params[1], math.sqrt(cov[1][1]) # print '%s slope: %5.2f +/- %5.2f y-intercept: %5.2f +/- %5.2f' % (region, slope, slope_err, y_icpt, y_icpt_err) # print '%s' % region # for iv in range(len(xvals)): # print ' %5.2f %5.0f / %5.0f = %5.2f +/- %.3f' % (xvals[iv], right[iv], right[iv] + wrong[iv], yvals[iv], yerrs[iv]) fig, ax = plotting.mpl_init() ax.errorbar(xvals, yvals, yerr=yerrs, markersize=10, linewidth=1, marker='.') ax.plot((0, 1), (0, 1), color='black', linestyle='--', linewidth=3) # line with slope 1 and intercept 0 # linevals = [slope*x + y_icpt for x in [0] + xvals] # fitted line # ax.plot([0] + xvals, linevals) plotting.mpl_finish(ax, plotdir, region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction correct', xbounds=(-0.1, 1.1), ybounds=(-0.1, 1.1)) if not only_csv: plotting.make_html(plotdir)
def plot(self, plotdir, only_csv=False, only_overall=False): print " plotting parameters", sys.stdout.flush() start = time.time() self.clean_plots(plotdir) self.mfreqer.plot(plotdir + "/mute-freqs", only_csv=only_csv, only_overall=only_overall) overall_plotdir = plotdir + "/overall" for column in self.counts: if column == "all": continue values, gene_values = {}, {} for index, count in self.counts[column].iteritems(): column_val = index[0] if column_val not in values: values[column_val] = 0.0 values[column_val] += count if column in self.columns_to_subset_by_gene: gene = index[ 1 ] # NOTE this is hackey, but it works find now and will fail obviously if I ever change the correlations to be incompatible. so screw it utils.split_gene(gene) # checks validity of gene if gene not in gene_values: gene_values[gene] = {} if column_val not in gene_values[gene]: gene_values[gene][column_val] = 0.0 gene_values[gene][column_val] += count var_type = "string" if column in self.string_columns else "int" hist = plotting.make_hist_from_dict_of_counts(values, var_type, column, sort=True) plotting.draw_no_root( hist, plotname=column, plotdir=overall_plotdir, xtitle=plotconfig.xtitles.get(column, column), plottitle=plotconfig.plot_titles.get(column, column), errors=True, write_csv=True, only_csv=only_csv, ) if column in self.columns_to_subset_by_gene and not only_overall: thisplotdir = plotdir + "/" + column for gene in gene_values: plotname = utils.sanitize_name(gene) + "-" + column hist = plotting.make_hist_from_dict_of_counts(gene_values[gene], var_type, plotname, sort=True) plotting.draw_no_root( hist, plotname=plotname, plotdir=thisplotdir, xtitle=plotconfig.plot_titles.get(column, column), plottitle=gene, errors=True, write_csv=True, only_csv=only_csv, ) if not only_csv: plotting.make_html(thisplotdir) if not only_csv: plotting.make_html(overall_plotdir) print "(%.1f sec)" % (time.time() - start)
args.glfo = tmpglfo else: args.glfo = glutils.get_merged_glfo(args.glfo, tmpglfo) listof_plotdirlists, listof_outdirs = [], [] # first add the main/parent dir, if it has csvs firstdir = args.plotdirs[0] if len(glob.glob(firstdir + '/*.csv')) > 0: listof_plotdirlists.append(args.plotdirs) listof_outdirs.append(args.outdir) else: print ' no csvs in main/parent dir %s' % firstdir # then figure out if there's subdirs we need to deal with added_subds = [] for subdir in [ d for d in os.listdir(firstdir) if os.path.isdir(firstdir + '/' + d) ]: listof_plotdirlists.append([d + '/' + subdir for d in args.plotdirs]) listof_outdirs.append(args.outdir + '/' + subdir) added_subds.append(subdir) if len(added_subds) > 0: print ' added %d subdirs: %s' % (len(added_subds), ' '.join(added_subds)) for dlist, outdir in zip(listof_plotdirlists, listof_outdirs): compare_directories(args, dlist, outdir) if args.make_parent_html: # didn't really test this very well fnoutstr, _ = utils.simplerun('find %s -type f -name *.svg' % args.outdir, return_out_err=True) plotting.make_html(args.outdir, fnames=[fnoutstr.strip().split('\n')])