Esempio n. 1
0
def get_data_plots(args, baseoutdir, methods, study, dsets):
    metafos = heads.read_metadata(study)
    assert len(set([metafos[ds]['locus'] for ds in dsets
                    ]))  # make sure everybody has the same locus
    mfo = metafos[dsets[0]]
    data_outdirs = [
        heads.get_datadir(
            study, 'processed', extra_str='gls-gen-paper-' + args.label) +
        '/' + ds for ds in dsets
    ]
    outdir = get_outdir(
        args,
        baseoutdir,
        varname='data',
        varval=study + '/' + '-vs-'.join(dsets)
    )  # for data, only the plots go here, since datascripts puts its output somewhere else
    if len(dsets) > 1 and len(methods) == 1:  # sample vs sample
        glslabels = dsets
        title = get_dset_title([metafos[ds] for ds in dsets])
        if study != 'kate-qrs':
            title += '  %s' % methstr(methods[0])
        title_color = methods[0]
        legends = get_dset_legends([metafos[ds] for ds in dsets])
        legend_title = methstr(
            methods[0]
        ) if study == 'kate-qrs' else None  # for kate-qrs we need to put the subject _and_ the isotype in the title, so there's no room for the method
        pie_chart_faces = False
        print '%s:' % utils.color('green', methods[0]),
    elif len(methods) > 1 and len(dsets) == 1:  # method vs method
        glslabels = methods
        title = get_dset_title([mfo])
        title_color = None
        legends = [methstr(m) + ' only' for m in methods]
        legend_title = None
        pie_chart_faces = True
        print '%s:' % utils.color('green', dsets[0]),
    else:
        raise Exception('one of \'em has to be length 1: %d %d' %
                        (len(methods), len(dsets)))
    print '%s' % (' %s ' % utils.color('light_blue', 'vs')).join(glslabels)
    make_gls_tree_plot(args,
                       outdir + '/' + '-vs-'.join(methods) + '/gls-gen-plots',
                       study + '-' + '-vs-'.join(dsets),
                       glsfnames=[
                           get_gls_fname(ddir,
                                         meth,
                                         locus=mfo['locus'],
                                         data=True) for ddir in data_outdirs
                           for meth in methods
                       ],
                       glslabels=glslabels,
                       locus=mfo['locus'],
                       title=title,
                       title_color=title_color,
                       legends=legends,
                       legend_title=legend_title,
                       pie_chart_faces=pie_chart_faces)
Esempio n. 2
0
def write_single_zenodo_subdir(zenodo_dir, args, study, dset, method, mfo):
    method_outdir = heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset
    gls_dir = get_gls_dir(method_outdir, method, data=True)
    print '            %s --> %s' % (gls_dir, zenodo_dir)
    glfo = glutils.read_glfo(gls_dir, mfo['locus'], remove_orfs='partis' in method)
    glutils.write_glfo(zenodo_dir, glfo)
    if method == 'partis':
        # allele finding plots
        plotdir = gls_dir.replace('hmm/germline-sets', 'plots/sw/allele-finding')
        if not os.path.exists(zenodo_dir + '/fits'):
            os.makedirs(zenodo_dir + '/fits')
        for genedir in glob.glob(plotdir + '/try-0/*'):  # would be nice to copy html, but links will be wrong
            subprocess.check_call(['cp', '-r', genedir, zenodo_dir + '/fits/'])

        # csv prevalence files
        for tmpreg in utils.regions:
            with open(gls_dir.replace('/germline-sets', '/%s_gene-probs.csv' % tmpreg)) as infile:
                reader = csv.DictReader(infile)
                countfo = {line['%s_gene' % tmpreg] : int(line['count']) for line in reader}
                old_total = sum(countfo.values())
                orf_genes = [g for g in countfo if g not in glfo['seqs'][tmpreg]]  # this is kind of dangerous... but the genes are read from the same parameter dir that we're reading this prevalence file, so the only way it's gonna be missing is if we just removed it with the read_glfo() line above
                for ogene in orf_genes:
                    # if tmpreg == 'v':
                    #     _, nearest_gene, _ = glutils.find_nearest_gene_with_same_cpos(glfo, glfo['seqs'][tmpreg][ogene])  # oops, that's dumb... of course it isn't there
                    # else:
                    nearest_gene = glutils.find_nearest_gene_using_names(glfo, ogene)
                    # print '  adding %d to %s from %s' % (countfo[ogene], utils.color_gene(nearest_gene), utils.color_gene(ogene))
                    countfo[nearest_gene] += countfo[ogene]
                for ogene in orf_genes:
                    del countfo[ogene]
                assert old_total == sum(countfo.values())
                with open('%s/%s_gene-probs.csv' % (zenodo_dir, tmpreg), 'w') as outfile:
                    writer = csv.DictWriter(outfile, ('%s_gene' % tmpreg, 'count'))
                    writer.writeheader()
                    for gene in countfo:
                        writer.writerow({'%s_gene' % tmpreg : gene, 'count' : countfo[gene]})
    elif method == 'tigger-default':
        # doesn't seem to have written anything
        pass
    elif method == 'igdiscover':
        # for fname in ['errorhistograms.pdf', 'V_usage.pdf', 'V_usage.tab']:
        #     subprocess.check_call(['cp', '%s/work/final/%s' % (gls_dir, fname), zenodo_dir + '/'])
        subprocess.check_call(['cp', '-r', '%s/work/final' % gls_dir, zenodo_dir + '/'])  # aw, screw it, just write everything. The simulation stuff is already huge, anyway
    else:
        assert False
Esempio n. 3
0
def get_data_plots(args, region, baseoutdir, methods, study, dsets):
    metafos = heads.read_metadata(study)
    assert len(set([metafos[ds]['locus'] for ds in dsets]))  # make sure everybody has the same locus
    mfo = metafos[dsets[0]]
    data_outdirs = [heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + ds for ds in dsets]
    outdir = get_outdir(args, baseoutdir, varname='data', varval=study + '/' + '-vs-'.join(dsets))  # for data, only the plots go here, since datascripts puts its output somewhere else
    title, title_color, legends, legend_title = None, None, None, None
    pie_chart_faces = False
    if len(dsets) > 1 and len(methods) == 1:  # sample vs sample
        glslabels = dsets
        title = get_dset_title([metafos[ds] for ds in dsets])
        if study != 'kate-qrs':
            title += '  %s' % methstr(methods[0])
        title_color = methods[0]
        legends = get_dset_legends([metafos[ds] for ds in dsets])
        legend_title = methstr(methods[0]) if study == 'kate-qrs' else None  # for kate-qrs we need to put the subject _and_ the isotype in the title, so there's no room for the method
        print '%s:' % utils.color('green', methods[0]),
    elif len(methods) > 1 and len(dsets) == 1:  # method vs method
        glslabels = methods
        title = get_dset_title([mfo])
        title_color = None
        legends = [methstr(m) + ' only' for m in methods]
        legend_title = None
        pie_chart_faces = len(methods) > 2  # True
        print '%s:' % utils.color('green', dsets[0]),
    else:  # single sample plot
        glslabels = dsets
    print '%s' % (' %s ' % utils.color('light_blue', 'vs')).join(glslabels)
    plotdir = outdir + '/' + '-vs-'.join(methods) + '/gls-gen-plots'
    if args.all_regions:  # NOTE not actually checking this by running... but it's the same as the gls-gen one, so it should be ok
        plotdir += '/' + region
    param_dirs = None
    if args.add_gene_counts_to_tree_plots:  # this returns 'None' for non-partis methods, which is ok for now, but I think I do usually have the parameter dir somewhere if I've run the annotation performance stuff
        param_dirs = [get_param_dir(ddir, meth) for ddir in data_outdirs for meth in methods]
    make_gls_tree_plot(args, region, plotdir, study + '-' + '-vs-'.join(dsets),
                       glsfnames=[get_gls_fname(region, ddir, meth, locus=mfo['locus'], data=True) for ddir in data_outdirs for meth in methods],
                       glslabels=glslabels,
                       locus=mfo['locus'],
                       title=title,
                       title_color=title_color,
                       legends=legends,
                       legend_title=legend_title,
                       pie_chart_faces=pie_chart_faces,
                       param_dirs=param_dirs)
Esempio n. 4
0
def get_data_plots(args, baseoutdir, method):
    """Make a single-sample germline-set tree plot for every entry in <args.varvals>.

    Args:
        args: command line namespace; <args.varvals> (strings of the form
            'study/dset') and <args.label> are read here, and the whole object
            is handed on to get_outdir() and make_gls_tree_plot().
        baseoutdir: handed on to get_outdir().
        method: method name whose germline set gets plotted.
    """
    for varval in args.varvals:
        study, dset = varval.split('/')
        mfo = heads.read_metadata(study)[dset]
        processed_dir = heads.get_datadir(study, 'processed', extra_str='gls-gen-paper-' + args.label)
        data_outdir = processed_dir + '/' + dset
        # for data, only the plots go here, since datascripts puts its output somewhere else
        outdir = get_outdir(args, baseoutdir, varname='data', varval=study + '/' + dset)
        plotdir = outdir + '/' + method + '/gls-gen-plots'
        plotname = study + '-' + dset
        gls_fname = get_gls_fname(data_outdir, method, locus=mfo['locus'], data=True)
        make_gls_tree_plot(args, plotdir, plotname, glsfnames=[gls_fname], glslabels=['data'])
Esempio n. 5
0
def print_data_table(dsetfos, method, latex=False, emph_genes=['IGHV1-2*02+G35A', 'IGHD3-10*01', 'IGHJ4*02', 'IGKV3-15*01', 'IGKJ3*01']):
    latex = True
    def getvalstr(gene, val):
        if gene is None or (utils.get_region(gene) == 'd' and not utils.has_d_gene(utils.get_locus(gene))):
            return '%s  %5.2s  %s  %-16s%s' % (cstr, ' - ', cstr, ' - ', 4 * ' ' if latex else '')
        else:
            if latex:
                gstr = utils.shorten_gene_name(gene, use_one_based_indexing=True, n_max_mutstrs=5)
                if emph_genes is not None and gene in emph_genes:
                    gstr = '\\color{red}{\\textbf{%s}}' % gstr
            else:
                gstr = utils.color_gene(gene, width=18)
            return '%s  %s%5.2f%s %s %-20s' % (cstr, estr, 100 * val, estr, cstr, gstr)
    def print_line(rfos):
        print '  %s%s'  % ('   '.join([getvalstr(g, v) for g, v in rfos]), lstr)
    def ds_str(ds, region):
        lstr = ds.split('-')[1]
        return ('IG%s%s' % (('h' if lstr in ['g', 'm'] else lstr).upper(), region.upper())) if latex else ds

    cstr = '&' if latex else ''
    estr = '$' if latex else ''
    lstr = '\\\\' if latex else ''
    for region in utils.regions:
        param_dirs = [get_param_dir(heads.get_datadir(study, 'processed', extra_str=args.label) + '/' + dset, method) for study, dset in dsetfos]
        countfos = [utils.read_overall_gene_probs(pdir, normalize=True)[region] for pdir in param_dirs]
        gene_val_str = (' %s   ' % cstr).join([('  %s   %s   %-20s' % ('\\%' if latex else '', cstr, ds_str(ds, region))) for _, ds in dsetfos])
        tmpline = '  %s   %s  %s' % (cstr, gene_val_str, lstr)
        if latex:
            hstr = '\\hline'
            tmpline = '  %s\n%s\n  %s' % (hstr, tmpline, hstr)
        print tmpline
        rowfos = [sorted(cfo.items(), key=operator.itemgetter(1), reverse=True) for cfo in countfos]
        irow = 0
        while True:
            rfos = [rfo[irow] if irow < len(rfo) else (None, None) for rfo in rowfos]
            if set(rfos) == set([(None, None)]):
                break
            print_line(rfos)
            irow += 1
Esempio n. 6
0
def get_data_pair_plots(args, baseoutdir, method, study, dsets):
    """Make one germline-set tree plot comparing the data sets in <dsets> for a single method.

    Args:
        args: command line namespace; <args.label> is read here, and the whole
            object is handed on to get_outdir() and make_gls_tree_plot().
        baseoutdir: handed on to get_outdir().
        method: method name whose germline sets get compared.
        study: study name, used to look up metadata and the data directory.
        dsets: data set names within <study>; only the first two have their loci compared.

    Raises:
        AssertionError: if the first two data sets have different loci.
    """
    metafos = heads.read_metadata(study)  # read once and reuse (this used to be read a second time just for the locus check)
    mfo = metafos[dsets[0]]
    assert metafos[dsets[1]]['locus'] == mfo['locus'], \
        'data sets %s and %s have different loci' % (dsets[0], dsets[1])
    data_outdirs = [
        heads.get_datadir(
            study, 'processed', extra_str='gls-gen-paper-' + args.label) +
        '/' + ds for ds in dsets
    ]
    outdir = get_outdir(
        args,
        baseoutdir,
        varname='data',
        varval=study + '/' + '-vs-'.join(dsets)
    )  # for data, only the plots go here, since datascripts puts its output somewhere else
    make_gls_tree_plot(args,
                       outdir + '/' + method + '/gls-gen-plots',
                       study + '-' + '-vs-'.join(dsets),
                       glsfnames=[
                           get_gls_fname(dout,
                                         method,
                                         locus=mfo['locus'],
                                         data=True) for dout in data_outdirs
                       ],  # one file per data set, in the same order as <glslabels>
                       glslabels=dsets)