def get_data_plots(args, baseoutdir, methods, study, dsets):
    """Make a germline-set tree plot comparing either several samples or several methods.

    Exactly one of <methods> and <dsets> may have more than one entry:
      - several dsets, one method: "sample vs sample", labelled by dataset
      - several methods, one dset: "method vs method", labelled by method

    Parameters:
      args: command line namespace; args.label is read here, and the whole object is
        handed on to get_outdir() and make_gls_tree_plot()
      baseoutdir: passed through to get_outdir()
      methods: list of inference method names
      study: study name, used to look up metadata and the processed data dir
      dsets: list of dataset names within <study>

    Raises:
      AssertionError: if the datasets do not all share exactly one locus
      Exception: if neither (or both) of <methods> and <dsets> has length one
    """
    metafos = heads.read_metadata(study)
    # make sure everybody has the same locus (the previous check only verified that the set was non-empty, so mixed loci slipped through)
    loci = set(metafos[ds]['locus'] for ds in dsets)
    assert len(loci) == 1, 'expected exactly one locus among datasets %s, but got %s' % (dsets, sorted(loci))
    mfo = metafos[dsets[0]]
    data_outdirs = [heads.get_datadir(study, 'processed', extra_str='gls-gen-paper-' + args.label) + '/' + ds for ds in dsets]
    outdir = get_outdir(args, baseoutdir, varname='data', varval=study + '/' + '-vs-'.join(dsets))  # for data, only the plots go here, since datascripts puts its output somewhere else
    if len(dsets) > 1 and len(methods) == 1:  # sample vs sample
        glslabels = dsets
        title = get_dset_title([metafos[ds] for ds in dsets])
        if study != 'kate-qrs':
            title += ' %s' % methstr(methods[0])
        title_color = methods[0]
        legends = get_dset_legends([metafos[ds] for ds in dsets])
        legend_title = methstr(methods[0]) if study == 'kate-qrs' else None  # for kate-qrs we need to put the subject _and_ the isotype in the title, so there's no room for the method
        pie_chart_faces = False
        print('%s:' % utils.color('green', methods[0])),  # trailing comma: python 2 print statement without the newline
    elif len(methods) > 1 and len(dsets) == 1:  # method vs method
        glslabels = methods
        title = get_dset_title([mfo])
        title_color = None
        legends = [methstr(m) + ' only' for m in methods]
        legend_title = None
        pie_chart_faces = True
        print('%s:' % utils.color('green', dsets[0])),  # trailing comma: python 2 print statement without the newline
    else:
        raise Exception('one of \'em has to be length 1: %d %d' % (len(methods), len(dsets)))
    print('%s' % (' %s ' % utils.color('light_blue', 'vs')).join(glslabels))
    # one germline set file per (data dir, method) pair, data dirs in the outer loop
    glsfnames = [get_gls_fname(ddir, meth, locus=mfo['locus'], data=True) for ddir in data_outdirs for meth in methods]
    make_gls_tree_plot(args, outdir + '/' + '-vs-'.join(methods) + '/gls-gen-plots', study + '-' + '-vs-'.join(dsets),
                       glsfnames=glsfnames,
                       glslabels=glslabels,
                       locus=mfo['locus'],
                       title=title,
                       title_color=title_color,
                       legends=legends,
                       legend_title=legend_title,
                       pie_chart_faces=pie_chart_faces)
def write_single_zenodo_subdir(zenodo_dir, args, study, dset, method, mfo):
    """Copy one method's inferred germline set for <study>/<dset> (plus method-specific extras) into <zenodo_dir>.

    The germline set is read from the method's germline set dir and rewritten to
    <zenodo_dir>. What else gets written depends on <method>:
      - 'partis': the per-gene allele finding fit dirs, and per-region gene count csvs
      - 'tigger-default': nothing
      - 'igdiscover': the whole work/final dir
    Any other method trips an assertion.
    """
    method_outdir = heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset
    gls_dir = get_gls_dir(method_outdir, method, data=True)
    print(' %s --> %s' % (gls_dir, zenodo_dir))
    glfo = glutils.read_glfo(gls_dir, mfo['locus'], remove_orfs='partis' in method)
    glutils.write_glfo(zenodo_dir, glfo)

    if method == 'igdiscover':
        # copying only selected files (errorhistograms.pdf, V_usage.pdf, V_usage.tab) was considered, but we just write everything: the simulation stuff is already huge, anyway
        subprocess.check_call(['cp', '-r', '%s/work/final' % gls_dir, zenodo_dir + '/'])
    elif method == 'tigger-default':  # doesn't seem to have written anything
        pass
    elif method == 'partis':
        # allele finding plots
        fit_src_dir = gls_dir.replace('hmm/germline-sets', 'plots/sw/allele-finding')
        fit_dest_dir = zenodo_dir + '/fits'
        if not os.path.exists(fit_dest_dir):
            os.makedirs(fit_dest_dir)
        for genedir in glob.glob(fit_src_dir + '/try-0/*'):  # would be nice to copy html, but links will be wrong
            subprocess.check_call(['cp', '-r', genedir, fit_dest_dir + '/'])

        # csv prevalence files
        for tmpreg in utils.regions:
            gene_column = '%s_gene' % tmpreg
            with open(gls_dir.replace('/germline-sets', '/%s_gene-probs.csv' % tmpreg)) as infile:
                countfo = {}
                for line in csv.DictReader(infile):
                    countfo[line[gene_column]] = int(line['count'])
            old_total = sum(countfo.values())

            # this is kind of dangerous... but the genes are read from the same parameter dir that we're reading this prevalence file, so the only way it's gonna be missing is if we just removed it with the read_glfo() line above
            orf_genes = [g for g in countfo if g not in glfo['seqs'][tmpreg]]

            # first hand each removed gene's counts to its nearest remaining gene, and only afterwards drop the removed genes
            # (finding the nearest gene by sequence/cpos can't work for v, since the removed gene of course isn't in glfo any more, so go by name)
            for ogene in orf_genes:
                nearest_gene = glutils.find_nearest_gene_using_names(glfo, ogene)
                countfo[nearest_gene] += countfo[ogene]
            for ogene in orf_genes:
                del countfo[ogene]
            assert old_total == sum(countfo.values())  # moving counts around shouldn't change the total

            with open('%s/%s_gene-probs.csv' % (zenodo_dir, tmpreg), 'w') as outfile:
                writer = csv.DictWriter(outfile, (gene_column, 'count'))
                writer.writeheader()
                for gene, count in countfo.items():
                    writer.writerow({gene_column : gene, 'count' : count})
    else:
        assert False
def get_data_plots(args, region, baseoutdir, methods, study, dsets):
    """Make a germline-set tree plot for <region>, comparing samples, comparing methods, or for a single sample.

    Three cases, chosen by the lengths of <dsets> and <methods>:
      - several dsets, one method: "sample vs sample", labelled by dataset
      - several methods, one dset: "method vs method", labelled by method
      - anything else: treated as a single sample plot, labelled by dataset, with no title or legends

    Parameters:
      args: command line namespace; reads args.label, args.all_regions and
        args.add_gene_counts_to_tree_plots, and is handed on to get_outdir() and
        make_gls_tree_plot()
      region: passed to get_gls_fname() and make_gls_tree_plot(), and appended to the
        plot dir if args.all_regions is set
      baseoutdir: passed through to get_outdir()
      methods: list of inference method names
      study: study name, used to look up metadata and the processed data dir
      dsets: list of dataset names within <study>

    Raises:
      AssertionError: if the datasets do not all share exactly one locus
    """
    metafos = heads.read_metadata(study)
    # make sure everybody has the same locus (the previous check only verified that the set was non-empty, so mixed loci slipped through)
    loci = set(metafos[ds]['locus'] for ds in dsets)
    assert len(loci) == 1, 'expected exactly one locus among datasets %s, but got %s' % (dsets, sorted(loci))
    mfo = metafos[dsets[0]]
    data_outdirs = [heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + ds for ds in dsets]
    outdir = get_outdir(args, baseoutdir, varname='data', varval=study + '/' + '-vs-'.join(dsets))  # for data, only the plots go here, since datascripts puts its output somewhere else
    title, title_color, legends, legend_title = None, None, None, None
    pie_chart_faces = False
    if len(dsets) > 1 and len(methods) == 1:  # sample vs sample
        glslabels = dsets
        title = get_dset_title([metafos[ds] for ds in dsets])
        if study != 'kate-qrs':
            title += ' %s' % methstr(methods[0])
        title_color = methods[0]
        legends = get_dset_legends([metafos[ds] for ds in dsets])
        legend_title = methstr(methods[0]) if study == 'kate-qrs' else None  # for kate-qrs we need to put the subject _and_ the isotype in the title, so there's no room for the method
        print('%s:' % utils.color('green', methods[0])),  # trailing comma: python 2 print statement without the newline
    elif len(methods) > 1 and len(dsets) == 1:  # method vs method
        glslabels = methods
        title = get_dset_title([mfo])
        title_color = None
        legends = [methstr(m) + ' only' for m in methods]
        legend_title = None
        pie_chart_faces = len(methods) > 2  # True
        print('%s:' % utils.color('green', dsets[0])),  # trailing comma: python 2 print statement without the newline
    else:  # single sample plot
        glslabels = dsets
    print('%s' % (' %s ' % utils.color('light_blue', 'vs')).join(glslabels))
    plotdir = outdir + '/' + '-vs-'.join(methods) + '/gls-gen-plots'
    if args.all_regions:  # NOTE not actually checking this by running... but it's the same as the gls-gen one, so it should be ok
        plotdir += '/' + region
    param_dirs = None
    if args.add_gene_counts_to_tree_plots:  # this returns 'None' for non-partis methods, which is ok for now, but I think I do usually have the parameter dir somewhere if I've run the annotation performance stuff
        param_dirs = [get_param_dir(ddir, meth) for ddir in data_outdirs for meth in methods]
    # one germline set file per (data dir, method) pair, data dirs in the outer loop (same ordering as param_dirs)
    glsfnames = [get_gls_fname(region, ddir, meth, locus=mfo['locus'], data=True) for ddir in data_outdirs for meth in methods]
    make_gls_tree_plot(args, region, plotdir, study + '-' + '-vs-'.join(dsets),
                       glsfnames=glsfnames,
                       glslabels=glslabels,
                       locus=mfo['locus'],
                       title=title,
                       title_color=title_color,
                       legends=legends,
                       legend_title=legend_title,
                       pie_chart_faces=pie_chart_faces,
                       param_dirs=param_dirs)
def get_data_plots(args, baseoutdir, method):
    """Make one germline-set tree plot of <method>'s results for each '<study>/<dset>' entry in args.varvals."""
    for study_dset in args.varvals:
        study, dset = study_dset.split('/')  # each entry has to contain exactly one '/'
        locus = heads.read_metadata(study)[dset]['locus']
        processed_dir = heads.get_datadir(study, 'processed', extra_str='gls-gen-paper-' + args.label)
        data_outdir = processed_dir + '/' + dset
        # for data, only the plots go here, since datascripts puts its output somewhere else
        outdir = get_outdir(args, baseoutdir, varname='data', varval=study + '/' + dset)
        plotdir = outdir + '/' + method + '/gls-gen-plots'
        plotname = study + '-' + dset
        glsfname = get_gls_fname(data_outdir, method, locus=locus, data=True)
        make_gls_tree_plot(args, plotdir, plotname, glsfnames=[glsfname], glslabels=['data'])
def print_data_table(dsetfos, method, latex=False, emph_genes=['IGHV1-2*02+G35A', 'IGHD3-10*01', 'IGHJ4*02', 'IGKV3-15*01', 'IGKJ3*01']):
    """Print, for each region, a table of gene prevalences with one column pair per dataset.

    Parameters:
      dsetfos: list of (study, dset) pairs; each one gives a column in the table
      method: inference method whose parameter dir the gene counts are read from
      latex: NOTE currently ignored, since it is unconditionally reset to True below
      emph_genes: genes that get highlighted (red + bold) in latex output, or None for no highlighting

    Within each column the genes are sorted by decreasing prevalence, and the values
    are printed as percentages. Columns that run out of genes are padded with ' - '.
    NOTE <args> is not a parameter, so it has to be available from the enclosing (module) scope.
    """
    latex = True  # overrides whatever was passed in

    # column separator, math mode delimiter, and end-of-row string
    cstr = '&' if latex else ''
    estr = '$' if latex else ''
    rowend = '\\\\' if latex else ''

    def format_cell(gene, val):
        # a cell is blank either if this column has run out of genes, or if it's a d gene for a locus that doesn't have d genes
        blank = gene is None
        if not blank:
            blank = utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene))
        if blank:
            padding = 4 * ' ' if latex else ''
            return '%s %5.2s %s %-16s%s' % (cstr, ' - ', cstr, ' - ', padding)
        if not latex:
            gstr = utils.color_gene(gene, width=18)
        else:
            gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
            if emph_genes is not None and gene in emph_genes:
                gstr = '\\color{red}{\\textbf{%s}}' % gstr
        return '%s %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr)

    def column_title(ds, region):
        # second '-'-separated field of the dataset name; 'g' and 'm' both get mapped to 'h'
        tag = ds.split('-')[1]
        if not latex:
            return ds
        if tag in ['g', 'm']:
            tag = 'h'
        return 'IG%s%s' % (tag.upper(), region.upper())

    all_exhausted = set([(None, None)])  # what a row looks like once every column has run out of genes
    for region in utils.regions:
        param_dirs = []
        for study, dset in dsetfos:
            ddir = heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset
            param_dirs.append(get_param_dir(ddir, method))
        countfos = [utils.read_overall_gene_probs(pdir, normalize=True)[region] for pdir in param_dirs]

        # header line
        pct_str = '\\%' if latex else ''
        header_cells = [' %s %s %-20s' % (pct_str, cstr, column_title(ds, region)) for _, ds in dsetfos]
        header = ' %s %s %s' % (cstr, (' %s ' % cstr).join(header_cells), rowend)
        if latex:
            hstr = '\\hline'
            header = ' %s\n%s\n %s' % (hstr, header, hstr)
        print(header)

        # one (gene, value) list per column, most prevalent gene first
        columns = [sorted(cfo.items(), key=operator.itemgetter(1), reverse=True) for cfo in countfos]
        irow = 0
        while True:
            rfos = [col[irow] if irow < len(col) else (None, None) for col in columns]
            if set(rfos) == all_exhausted:
                break
            print(' %s%s' % (' '.join([format_cell(g, v) for g, v in rfos]), rowend))
            irow += 1
def get_data_pair_plots(args, baseoutdir, method, study, dsets):
    """Make a germline-set tree plot comparing <method>'s results on the datasets in <dsets> (which have to share a locus)."""
    mfo = heads.read_metadata(study)[dsets[0]]
    locus = mfo['locus']
    other_mfo = heads.read_metadata(study)[dsets[1]]
    assert other_mfo['locus'] == locus  # only the first two datasets are compared

    data_outdirs = []
    for ds in dsets:
        processed_dir = heads.get_datadir(study, 'processed', extra_str='gls-gen-paper-' + args.label)
        data_outdirs.append(processed_dir + '/' + ds)

    dset_str = '-vs-'.join(dsets)
    # for data, only the plots go here, since datascripts puts its output somewhere else
    outdir = get_outdir(args, baseoutdir, varname='data', varval=study + '/' + dset_str)
    plotdir = outdir + '/' + method + '/gls-gen-plots'
    glsfnames = [get_gls_fname(dout, method, locus=locus, data=True) for dout in data_outdirs]
    make_gls_tree_plot(args, plotdir, study + '-' + dset_str, glsfnames=glsfnames, glslabels=dsets)