コード例 #1
0
def make_gls_tree_plot(args, region, plotdir, plotname, glsfnames, glslabels, locus, ref_label=None, title=None, title_color=None, legends=None, legend_title=None, pie_chart_faces=False, param_dirs=None):
    """Draw germline set trees by running ./bin/plot-gl-set-trees.py in a subprocess.

    ete3 requires its own python version, hence the subprocess: <args.ete_path> is
    prepended to PATH and the script is wrapped in xvfb-run. <glsfnames>, <glslabels>,
    <legends> and <param_dirs> are iterables of strings that get colon-joined. Each
    optional argument is only put on the command line if it was set.
    """
    pieces = ['export PATH=%s:$PATH && xvfb-run -a ./bin/plot-gl-set-trees.py' % args.ete_path]
    pieces.append(' --plotdir ' + plotdir)
    pieces.append(' --plotname ' + plotname)
    pieces.append(' --glsfnames ' + ':'.join(glsfnames))
    pieces.append(' --glslabels ' + ':'.join(glslabels))
    pieces.append(' --region ' + region)

    # optional plot decorations
    if ref_label is not None:
        pieces.append(' --ref-label ' + ref_label)
    if title is not None:
        pieces.append(' --title="%s"' % title)
    if title_color is not None:
        pieces.append(' --title-color %s' % title_color)
    if legends is not None:
        pieces.append(' --legends=' + ':'.join('"%s"' % lgd for lgd in legends))  # each legend gets its own double quotes
    if legend_title is not None:
        pieces.append(' --legend-title="%s"' % legend_title)
    if pie_chart_faces:
        pieces.append(' --pie-chart-faces')
    if param_dirs is not None:
        pieces.append(' --param-dirs %s' % ':'.join(param_dirs))

    pieces.append(' --locus ' + locus)

    # flags that come straight from the command line args
    if args.plotcache:
        pieces.append(' --use-cache')
    if args.only_print:
        pieces.append(' --only-print')

    utils.simplerun(''.join(pieces), shell=True, debug=args.dryrun, dryrun=args.dryrun)
コード例 #2
0
def run_other_method(args, method):
    """Run ./test/<method>-run.py (tigger or igdiscover) on a fasta version of the simulation file.

    Returns without doing anything if utils.output_exists() reports that the
    method's output is already there.
    """
    if method not in ('tigger', 'igdiscover'):  # really just to make it easier to search for this fcn
        assert False

    outfname = get_outfname(args, method)
    if utils.output_exists(args, outfname):
        return

    # the external methods read fasta, so convert the simulation csv
    simfasta = utils.getprefix(args.simfname) + '.fa'
    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)

    pieces = ['./test/%s-run.py' % method]
    pieces.append(' --infname ' + simfasta)
    pieces.append(' --outfname ' + outfname)
    if args.overwrite:
        pieces.append(' --overwrite')
    if args.gls_gen:
        pieces.append(' --gls-gen')
        # the partis methods have this as the default internally, but we want/have to set it explicitly here
        pieces.append(' --glfo-dir ' + partis_dir + '/data/germlines/human')
    else:
        pieces.append(' --glfo-dir ' + args.inf_glfo_dir)
    if method != 'igdiscover':
        # for now we're saving all the igdiscover output/intermediate files, so it writes to an output dir rather than a work dir
        pieces.append(' --workdir ' + args.workdir + '/' + method)
    pieces.append(' --n-procs ' + str(args.n_procs))

    utils.simplerun(''.join(pieces), dryrun=args.dry_run)
コード例 #3
0
ファイル: igdiscover-run.py プロジェクト: virologist/partis
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the resulting germline set so that it ends up at <outfname>.

    Steps:
      - return immediately if utils.output_exists() says <outfname> is already there
      - prepare <outdir> with prepare_igdiscover_outdir()
      - if --n-random-queries is set, write (or reuse) a subsampled fasta in <outdir> and use that as input
      - write and run a shell script (<outdir>/run.sh) that activates the conda env and runs igdiscover
      - convert igdiscover's final database fasta for this region to a glfo, and write it

    Reads the module-level <args> (n_random_queries, region, env_label, yamlfname, glfo_dir, locus, simulation_germline_dir).
    Raises AssertionError if <outfname> is not where glutils would put the file for this locus/region.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        sub_infname = outdir + '/' + sub_basename
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = getpathcmd()
    cmds += ['conda activate %s' % args.env_label]
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir  # if args.glfo_dir is not None else 'data/germlines/ XXX human'  # can probably delete this now that --glfo-dir is required (but leaving for now, to show how it used to be in case it comes up)
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Remove exactly one trailing '/<locus>' component. NOTE str.rstrip() treats its argument as a *set of characters*,
    # so rstrip('/' + locus) would also eat any further trailing characters of the parent dir that happen to be in that set.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
コード例 #4
0
def run_partis(infname, outfname):
    """Annotate <infname> with partis (smith-waterman only, presto output), writing to <outfname>.

    The aligned germline fastas for all regions are merged (dropping duplicate sequences)
    into a temporary file in <args.workdir>, which is removed after partis has run.
    Does nothing if utils.output_exists() reports that <outfname> is already there.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    # deduplicate before passing to partis: keyed by seq so the first name seen for each sequence wins
    name_by_seq = {}
    for region in utils.regions:
        for sfo in utils.read_fastx(get_glfname(region, aligned=True)):
            if sfo['seq'] not in name_by_seq:
                name_by_seq[sfo['seq']] = '|'.join(sfo['infostrs'])

    aligned_germline_fname = args.workdir + '/all-aligned-gl-seqs.fa'
    with open(aligned_germline_fname, 'w') as merged_file:
        merged_file.writelines('>%s\n%s\n' % (name, seq) for seq, name in name_by_seq.items())

    pieces = [
        './bin/partis cache-parameters',
        ' --infname ' + infname,
        ' --leave-default-germline',
        ' --presto-output --only-smith-waterman',
        ' --outfname ' + outfname,
    ]
    if args.glfo_dir is not None:
        pieces.append(' --initial-germline-dir ' + args.glfo_dir)
    pieces.append(' --aligned-germline-fname ' + aligned_germline_fname)
    pieces.append(' --n-procs ' + str(args.n_procs))

    utils.simplerun(''.join(pieces), print_time='partis annotation')

    os.remove(aligned_germline_fname)  # clean up the merged temporary fasta
コード例 #5
0
def run_igdiscover(infname, outfname, outdir):
    """Write a bash script to <outdir>/run.sh that runs igdiscover on <infname>, then execute it.

    Does nothing if utils.output_exists() reports that <outfname> is already there.
    The script puts <args.condapath> at the front of PATH, cd's to <outdir>, initializes
    an igdiscover work dir from <infname>, copies in the yaml config, and runs igdiscover.

    NOTE the input file used to be taken from the global args.infname, silently ignoring the
    <infname> argument; it is now taken from the argument.
    """
    if utils.output_exists(args, outfname):
        return

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']

    cmdfname = outdir + '/run.sh'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(cmds) + '\n')
    subprocess.check_call(['chmod', '+x', cmdfname])  # the script gets run directly, so it has to be executable

    utils.simplerun(cmdfname, shell=True, print_time='igdiscover')
コード例 #6
0
def run_other_method(args, method):
    """Run one of the non-partis methods (tigger-default, tigger-tuned, igdiscover) via its ./test/*-run.py script.

    The script name comes from the part of <method> before the first '-'. Returns
    without running anything if utils.output_exists() reports the output is there.
    Raises AssertionError for an unknown <method>, or if --n-max-queries is set.
    """
    if method not in ('tigger-default', 'tigger-tuned', 'igdiscover'):  # really just to make it easier to search for this fcn
        assert False
    assert args.n_max_queries is None

    outfname = get_outfname(args, method)
    if utils.output_exists(args, outfname):
        return

    # convert the simulation csv to fasta for the external method
    simfasta = utils.getprefix(args.simfname) + '.fa'
    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)

    pieces = ['./test/%s-run.py' % method.split('-')[0]]
    if method == 'tigger-tuned':
        pieces.append(' --tuned-tigger-params')
    pieces.append(' --infname ' + simfasta)
    pieces.append(' --outfname ' + outfname)
    if args.species != 'human':
        pieces.append(' --species %s' % args.species)
    if args.overwrite:
        pieces.append(' --overwrite')
    if args.gls_gen:
        pieces.append(' --gls-gen')
        # the partis methods have this as the default internally, but we want/have to set it explicitly here
        pieces.append(' --glfo-dir ' + partis_dir + '/' + args.default_germline_dir)
    else:
        pieces.append(' --glfo-dir ' + args.inf_glfo_dir)
    # alleleclusterer is the only one that really uses this, but for now its dbg output should have the sim info
    pieces.append(' --simulation-germline-dir ' + args.outdir + '/germlines/simulation')
    if method != 'igdiscover':
        # for now we're saving all the igdiscover output/intermediate files, so it writes to an output dir rather than a work dir
        pieces.append(' --workdir ' + args.workdir + '/' + method)
    pieces.append(' --n-procs ' + str(args.n_procs))
    if args.slurm:
        pieces.append(' --slurm')

    utils.simplerun(''.join(pieces), dryrun=args.dry_run)
コード例 #7
0
def run_changeo(infname, igblast_outfname, outfname):
    """Run changeo's MakeDb.py on the igblast output <igblast_outfname> for the sequences in <infname>.

    <outfname> is only used for the utils.output_exists() check: nothing is run if it is already there.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    # aligned germline fastas for every region, space-separated for the -r option
    glfnames = ' '.join(get_glfname(region, aligned=True) for region in utils.regions)
    makedb = args.changeo_path + '/bin/MakeDb.py igblast'
    cmd = makedb + ' -i %s -s %s -r %s --regions --scores' % (igblast_outfname, infname, glfnames)
    utils.simplerun(cmd, print_time='changeo')
コード例 #8
0
def partition():
    """Cache parameters on the simulation file, then partition it (with tree metrics), both with one process.

    Input and output locations come from simfname() and infdir() for <args.stype>.
    """
    n_procs = 1
    sim_file = simfname(args.stype)
    inf_dir = infdir(args.stype)

    # step 1: parameter caching
    cache_cmd = './bin/partis cache-parameters --infname %s --parameter-dir %s/params --n-procs %d --seed %d' % (sim_file, inf_dir, n_procs, args.seed)
    utils.simplerun(cache_cmd, debug=True)  #, dryrun=True)

    # step 2: partitioning, using the parameters from step 1
    partition_fmt = ('./bin/partis partition --n-final-clusters 1 --write-additional-cluster-annotations 0:5'
                     ' --lb-tau %f --is-simu --get-tree-metrics --infname %s --parameter-dir %s/params'
                     ' --plotdir %s --n-procs %d --outfname %s/partition.yaml --seed %d')
    partition_cmd = partition_fmt % (args.lb_tau, sim_file, inf_dir, inf_dir + '/plots', n_procs, inf_dir, args.seed)
    utils.simplerun(partition_cmd, debug=True)  #, dryrun=True)
コード例 #9
0
ファイル: bcr-phylo-run.py プロジェクト: linchunjen/partis
def cache_parameters():
    """Run partis cache-parameters on the simulation file, unless the hmm files are already in the parameter dir."""
    pdir = param_dir()
    if utils.output_exists(args, pdir + '/hmm/hmms', outlabel='parameters', offset=4):
        return

    pieces = [
        './bin/partis cache-parameters',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % pdir,
        '--n-procs %d' % args.n_procs,
        '--seed %d' % args.seed,
    ]
    utils.simplerun(' '.join(pieces), debug=True)  #, dryrun=True)
コード例 #10
0
def run_partis_parameter_cache(args, method):
    """Generate a germline set and cache parameters with partis, for <method> 'partis' or 'full'.

    Parameters go to <args.outdir>/<method>, plots to its 'plots' subdir. Old
    smith-waterman cache files (and their glfo dirs) in the parameter dir are removed
    first. Returns without doing anything if utils.output_exists() reports that the
    output is already there. Raises AssertionError for any other <method> (note that
    this happens after the old cache files have been removed).
    """
    if utils.output_exists(args, get_outfname(args, method)):
        return

    paramdir = args.outdir + '/' + method
    plotdir = paramdir + '/plots'

    # remove any old sw cache files
    for cachefname in glob.glob(paramdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, args.locus)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    pieces = [args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --only-smith-waterman']
    pieces.append(' --initial-germline-dir %s' % args.default_germline_dir)
    if method == 'partis':
        pieces.append(' --debug-allele-finding')  # --always-find-new-alleles'
        # alleleclusterer is the only one that really uses this, but for now its dbg output should have the sim info
        pieces.append(' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation')
        if args.allele_cluster:
            pieces.append(' --allele-cluster')
            if args.kmeans_allele_cluster:
                pieces.append(' --kmeans-allele-cluster')
    elif method == 'full':
        pieces.append(' --leave-default-germline')
    else:
        assert False

    if args.species != 'human':
        pieces.append(' --species %s' % args.species)

    pieces.append(' --n-procs ' + str(args.n_procs))
    if args.n_max_queries is not None:
        # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
        pieces.append(' --n-max-queries ' + str(args.n_max_queries))
    if args.slurm:
        pieces.append(' --batch-system slurm')

    if not args.gls_gen:  # otherwise it uses the default (full) germline dir
        pieces.append(' --initial-germline-dir ' + args.inf_glfo_dir)  # --dont-remove-unlikely-alleles

    pieces.append(' --parameter-dir ' + paramdir)
    pieces.append(' --plotdir ' + plotdir)
    if args.seed is not None:
        pieces.append(' --seed ' + str(args.seed))
    if args.plot_and_fit_absolutely_everything is not None:
        pieces.append(' --plot-and-fit-absolutely-everything ' + str(args.plot_and_fit_absolutely_everything))

    utils.simplerun(''.join(pieces), dryrun=args.dryrun)
コード例 #11
0
def make_gls_tree_plot(args, plotdir, plotname, glsfnames, glslabels):
    """Plot germline set trees by running ./bin/plot-gl-set-trees.py in a subprocess.

    ete3 requires its own python version, hence the subprocess, with <args.ete_path>
    prepended to PATH. <glsfnames> and <glslabels> are passed as colon-separated lists.
    """
    pieces = [
        'export PATH=%s:$PATH && xvfb-run -a ./bin/plot-gl-set-trees.py' % args.ete_path,
        ' --plotdir ' + plotdir,
        ' --plotname ' + plotname,
        ' --glsfnames ' + ':'.join(glsfnames),
        ' --glslabels ' + ':'.join(glslabels),
    ]
    if args.plotcache:
        pieces.append(' --use-cache')
    utils.simplerun(''.join(pieces), shell=True)
コード例 #12
0
ファイル: bcr-phylo-run.py プロジェクト: linchunjen/partis
def rearrange():
    """Simulate the naive rearrangements with partis simulate, unless the naive simulation file already exists."""
    naive_outfname = naive_fname()
    if utils.output_exists(args, naive_outfname, outlabel='naive simu', offset=4):
        return

    # the mutation multiplier is tiny rather than 0. since it tends to get in an infinite loop if you actually pass 0. (yes, I should fix this)
    base_cmd = './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves'
    run_opts = ' --debug %d --seed %d --outfname %s --n-sim-events %d' % (int(args.debug), args.seed, naive_outfname, args.n_sim_events)
    utils.simplerun(base_cmd + run_opts, debug=True)
コード例 #13
0
ファイル: bcr-phylo-run.py プロジェクト: Xiujia-Yang/partis
def cache_parameters():
    """Run partis cache-parameters (with indels forbidden) on the simulation file, unless the hmm files already exist."""
    pdir = param_dir()
    if utils.output_exists(args, pdir + '/hmm/hmms', outlabel='parameters', offset=4):
        return

    # forbid indels because in the very rare cases when we call them, they're always wrong, and then they screw up the simultaneous true clonal seqs option
    pieces = ['./bin/partis cache-parameters --infname %s --parameter-dir %s --seed %d --no-indels' % (simfname(), pdir, args.seed)]
    if args.n_procs > 1:
        pieces.append(' --n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append(' --batch-system slurm')
    if args.n_max_queries is not None:
        pieces.append(' --n-max-queries %d' % args.n_max_queries)
    utils.simplerun(''.join(pieces), debug=True)  #, dryrun=True)
コード例 #14
0
ファイル: bcr-phylo-run.py プロジェクト: eharkins/partis
def partition():
    """Partition the simulation file with partis (getting tree metrics), unless the partition file already exists."""
    outfname = partition_fname()
    if utils.output_exists(args, outfname, outlabel='partition', offset=4):
        return

    pieces = [
        './bin/partis partition --n-final-clusters 1 --write-additional-cluster-annotations 0:5 --is-simu --get-tree-metrics',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--plotdir %s' % (infdir() + '/plots'),
        '--n-procs %d' % args.n_procs,
        '--outfname %s' % outfname,
        '--seed %d' % args.seed,
    ]
    if args.lb_tau is not None:  # only passed along if it was set
        pieces.append('--lb-tau %f' % args.lb_tau)
    utils.simplerun(' '.join(pieces), debug=True)  #, dryrun=True)
コード例 #15
0
ファイル: bcr-phylo-run.py プロジェクト: Xiujia-Yang/partis
def rearrange():
    """Simulate the naive rearrangements with partis simulate, unless the naive simulation file already exists.

    Optionally restricts the available genes, and forwards --n-procs (if more than one) and slurm.
    """
    naive_outfname = naive_fname()
    if utils.output_exists(args, naive_outfname, outlabel='naive simu', offset=4):
        return

    # the mutation multiplier is tiny rather than 0. since it tends to get in an infinite loop if you actually pass 0. (yes, I should fix this)
    pieces = ['./bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves']
    pieces.append(' --debug %d --seed %d --outfname %s --n-sim-events %d' % (int(args.debug), args.seed, naive_outfname, args.n_sim_events))
    if args.restrict_available_genes:
        pieces.append(' --only-genes IGHV1-18*01:IGHJ1*01')
    if args.n_procs > 1:
        pieces.append(' --n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append(' --batch-system slurm')
    utils.simplerun(''.join(pieces), debug=True)
コード例 #16
0
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the resulting germline set so that it ends up at <outfname>.

    Steps:
      - return immediately if utils.output_exists() says <outfname> is already there
      - prepare <outdir> with prepare_igdiscover_outdir()
      - if --n-random-queries is set, write (or reuse) a subsampled fasta in <outdir> and use that as input
      - write and run a bash script (<outdir>/run.sh) that sets up the conda path and runs igdiscover
      - convert igdiscover's final database fasta for this region to a glfo, and write it

    Reads the module-level <args> (n_random_queries, region, condapath, yamlfname, glfo_dir, locus, simulation_germline_dir).
    Raises AssertionError if <outfname> is not where glutils would put the file for this locus/region.
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        sub_infname = outdir + '/' + sub_basename
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Remove exactly one trailing '/<locus>' component. NOTE str.rstrip() treats its argument as a *set of characters*,
    # so rstrip('/' + locus) would also eat any further trailing characters of the parent dir that happen to be in that set.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
コード例 #17
0
ファイル: bcr-phylo-run.py プロジェクト: linchunjen/partis
def partition():
    """Partition the simulation file with partis (true clonal seqs together), unless the partition file already exists.

    Tree metrics (and their plots) are included unless --dont-get-tree-metrics was set.
    """
    outfname = partition_fname()
    if utils.output_exists(args, outfname, outlabel='partition', offset=4):
        return

    pieces = [
        './bin/partis partition --simultaneous-true-clonal-seqs --is-simu',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--n-procs %d' % args.n_procs,
        '--outfname %s' % outfname,
        '--seed %d' % args.seed,
    ]
    #  --write-additional-cluster-annotations 0:5  # I don't think there was really a good reason for having this
    if not args.dont_get_tree_metrics:
        pieces.append('--get-tree-metrics --plotdir %s' % (infdir() + '/plots'))
    if args.lb_tau is not None:
        pieces.append('--lb-tau %f' % args.lb_tau)
    utils.simplerun(' '.join(pieces), debug=True)  #, dryrun=True)
コード例 #18
0
def run_single_test(args, baseoutdir, val, n_events, method):
    """Build and run the command for one test, i.e. one value <val> of the variable being scanned by <args.action>.

    Starts from get_base_cmd() and adds the options that depend on the action.
    Raises AssertionError if <args.action> is not one of the handled actions.
    """
    cmd = get_base_cmd(args, n_events, method)
    outdir = get_outdir(args, baseoutdir, args.action, val, n_events=n_events)
    action = args.action

    # defaults, overridden by some of the actions below
    sim_v_genes = [args.v_genes[0]]
    nsnpstr = '1'
    nindelstr = ''

    if action == 'mfreq':
        cmd += ' --mut-mult ' + str(val)
    elif action == 'nsnp':
        nsnpstr = str(val)
    elif action == 'multi-nsnp':
        nsnpstr = ':'.join(str(n) for n in val)
        sim_v_genes = sim_v_genes * len(val)  # one copy of the gene for each entry in <val>
    elif action == 'prevalence':
        # i.e. previously-known allele has 1 - p, and new allele has p
        cmd += ' --allele-prevalence-freqs ' + str(1. - val) + ':' + str(val)
    elif action in ('n-leaves', 'weibull'):
        # NOTE default of 1 leaf (for other tests) is set in test-allele-finding.py
        n_leaves_str = str(val) if action == 'n-leaves' else '5'
        cmd += ' --n-leaves ' + n_leaves_str
        cmd += ' --n-leaf-distribution geometric'
        # i.e. we simulate <n_events> rearrangement events, but then only use <n_events> sequences for inference
        cmd += ' --n-max-queries ' + str(n_events)
    elif action == 'alcluster':
        nsnpstr, nindelstr = val['snp'], val['indel']
        sim_v_genes = sim_v_genes * len(val['snp'].split(':'))
    elif action == 'gls-gen':
        nsnpstr = '1:1:2:3:100:100'
        nindelstr = '0:0:0:0:3:3'
        cmd += ' --gls-gen'
    else:
        assert False

    if action != 'gls-gen':
        cmd += ' --sim-v-genes ' + ':'.join(sim_v_genes)
    if '--nosim' not in cmd:  # the snp/indel lists are only added if the command doesn't contain --nosim
        if nsnpstr != '':
            cmd += ' --nsnp-list ' + nsnpstr
        if nindelstr != '':
            cmd += ' --nindel-list ' + nindelstr
    cmd += ' --outdir ' + outdir

    utils.simplerun(cmd, dryrun=args.dry_run)
コード例 #19
0
def rearrange():
    """Simulate the naive rearrangements with partis simulate, unless the naive simulation file already exists.

    --n-procs is only forwarded when it is larger than one and divides --n-sim-events evenly.
    """
    naive_outfname = naive_fname()
    if utils.output_exists(args, naive_outfname, outlabel='naive simu', offset=4):
        return

    # the mutation multiplier is tiny rather than 0. since it tends to get in an infinite loop if you actually pass 0. (yes, I should fix this)
    pieces = ['./bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves']
    pieces.append(' --debug %d --seed %d --outfname %s --n-sim-events %d' % (int(args.debug), args.seed, naive_outfname, args.n_sim_events))

    # if --n-sim-events is not divisible by --n-procs, partis simulate doesn't give you exactly the number you asked for
    evenly_divisible = args.n_procs > 1 and args.n_sim_events % args.n_procs == 0
    if evenly_divisible:
        pieces.append(' --n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append(' --batch-system slurm')
    utils.simplerun(''.join(pieces), debug=True)
コード例 #20
0
def simulate(args):
    """Build the germline set to simulate from, write it to disk, and run partis simulate.

    Returns without doing anything if utils.output_exists() reports <args.simfname> is there.
    With --gls-gen a whole germline set is generated; otherwise only the requested genes
    are read and new alleles are added to them. The resulting set is written to
    <args.outdir>/germlines/simulation and passed to partis as the initial germline dir.
    Raises Exception if --allele-prevalence-freqs is not normalized or has the wrong length.
    """
    if utils.output_exists(args, args.simfname):
        return

    sim_gldir = args.outdir + '/germlines/simulation'
    allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    pieces = [
        args.partis_path,
        ' simulate --n-sim-events ' + str(args.n_sim_events),
        ' --outfname ' + args.simfname,
        ' --n-leaves ' + str(args.n_leaves),
        ' --rearrange-from-scratch --shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters',
    ]
    if args.n_leaf_distribution is None:
        pieces.append(' --constant-number-of-leaves')
    else:
        pieces.append(' --n-leaf-distribution ' + args.n_leaf_distribution)
    if args.mut_mult is not None:
        pieces.append(' --mutation-multiplier ' + str(args.mut_mult))
    if args.root_mrca_weibull_parameter is not None:
        pieces.append(' --root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter))

    pieces.append(' --n-procs ' + str(args.n_procs))
    if args.slurm:
        pieces.append(' --batch-system slurm --subsimproc')

    # figure what genes we're using
    if args.gls_gen:
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None

        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sglfo)
        glutils.generate_germline_set(sglfo, args.n_genes_per_region, args.n_sim_alleles_per_gene, args.min_allele_prevalence_freq, allele_prevalence_fname,
                                      new_allele_info=args.new_allele_info, dont_remove_template_genes=args.dont_remove_template_genes, debug=True)
        pieces.append(' --allele-prevalence-fname ' + allele_prevalence_fname)
    else:
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        # NOTE template gene removal is the default for glutils.generate_germline_set
        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))

        freqs = args.allele_prevalence_freqs
        if freqs is not None:
            if not utils.is_normed(freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % freqs)
            n_v_genes = len(sglfo['seqs']['v'])
            if len(freqs) != n_v_genes:  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(freqs), n_v_genes))
            if len(added_snp_names) == 0:
                gene_list = sorted(sglfo['seqs']['v'])
            else:
                gene_list = list(set(args.sim_v_genes)) + added_snp_names
            # pair each gene with its frequency (only v has any entries)
            prevalence_freqs = {'v' : dict(zip(gene_list, freqs)), 'd' : {}, 'j' : {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, allele_prevalence_fname)
            pieces.append(' --allele-prevalence-fname ' + allele_prevalence_fname)

    glutils.write_glfo(sim_gldir, sglfo)
    pieces.append(' --initial-germline-dir ' + sim_gldir)
    # glutils.print_glfo(sglfo)

    # run simulation
    if args.seed is not None:
        pieces.append(' --seed ' + str(args.seed))
    utils.simplerun(''.join(pieces), dryrun=args.dry_run)
コード例 #21
0
def run_tigger(infname, outfname, outdir):
    """Run tigger (in R) on the annotations in <infname>, and write the inferred genotype as a germline set at <outfname>.

    Steps:
      - return immediately if utils.output_exists() says <outfname> is already there
      - write an R command file to <args.workdir> and run it; tigger writes its genotype to <outdir>/tigger.fasta
      - read the template glfo, add any alleles from tigger's fasta that aren't in it, and remove any of this
        region's genes that tigger didn't report
      - write the resulting glfo, and remove the R command file

    Raises AssertionError if <outfname> is not where glutils would put the file for this locus/region.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]  #
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene' : seqfo['name'], 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:  # the part of the name before the '+' is used as the template gene
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n    %s\n    %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq']))
    # remove them afterwards so we can use existing ones to get codon info
    # NOTE iterate over a snapshot of the gene names, since remove_gene() presumably deletes from the container we'd otherwise be iterating over
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # Remove exactly one trailing '/<locus>' component. NOTE str.rstrip() treats its argument as a *set of characters*,
    # so rstrip('/' + locus) would also eat any further trailing characters of the parent dir that happen to be in that set.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
コード例 #22
0
def run_igblast(infname, outfname):
    """Run igblastn (from within <args.igbdir>) on the sequences in <infname>, writing to <outfname>.

    Does nothing if utils.output_exists() reports that <outfname> is already there.
    The human germline databases in the igblast dir are always used: --glfo-dir
    only triggers a warning, since using it would need a rebuilt igblast db.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    if args.glfo_dir is not None:
        print("%s --glfo-dir isn't getting plugged in to igblast/changeo (would need to rebuild igblast db)" % utils.color('red', 'warning'))

    cmd = './igblastn'
    # NOTE the D database used to be set to human_gl_V, i.e. D genes were searched for in the V database
    cmd += ' -germline_db_V human_gl_V -germline_db_D human_gl_D -germline_db_J human_gl_J'
    cmd += ' -auxiliary_data optional_file/human_gl.aux'
    cmd += " -domain_system imgt -ig_seqtype Ig -organism human -outfmt '7 std qseq sseq btop'"
    cmd += ' -num_threads %d' % args.n_procs
    cmd += ' -query ' + infname + ' -out ' + outfname

    cmd = 'cd %s; %s' % (args.igbdir, cmd)  # the database and auxiliary file paths above are relative to the igblast dir
    utils.simplerun(cmd, shell=True, print_time='igblast')
コード例 #23
0
ファイル: bcr-phylo-run.py プロジェクト: AkselObdrup/partis
def cache_parameters():
    """Run partis cache-parameters (with indels forbidden) on the mutated simulation, unless the parameters already exist.

    With --paired-loci the paired input/output dir options are used instead of --infname/--parameter-dir.
    """
    if utils.output_exists(args, ifname('params'), outlabel='parameters', offset=4):
        return

    # forbid indels because in the very rare cases when we call them, they're always wrong, and then they screw up the simultaneous true clonal seqs option
    pieces = ['./bin/partis cache-parameters --seed %d --no-indels' % args.seed]

    if args.paired_loci:
        io_fmt = ' --paired-loci --paired-indir %s --paired-outdir %s'
    else:
        io_fmt = ' --infname %s --parameter-dir %s'
    pieces.append(io_fmt % (spath('mutated'), ipath('params')))

    if args.n_procs > 1:
        pieces.append(' --n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append(' --batch-system slurm')
    if args.n_max_queries is not None:
        pieces.append(' --n-max-queries %d' % args.n_max_queries)
    utils.simplerun(''.join(pieces), debug=True, dryrun=args.dry_run)
コード例 #24
0
def multiple_tests(args):
    """Rerun the current command line once for each test index in [args.iteststart, args.n_tests).

    Each sub-run gets its own --outdir (<args.outdir>/<iproc>) and --seed (<args.seed> + iproc),
    with --n-tests and --iteststart removed from its command line. With --dry-run the commands
    are only passed to utils.simplerun() with dryrun=True; otherwise existing log files are
    moved out of the way and all commands are handed to utils.run_cmds().
    """
    def getlogdir(iproc):
        logdir = args.outdir + '/' + str(iproc) + '/logs'
        if args.plot_annotation_performance:
            logdir += '/annotation-performance-plots'
        return logdir + '/' + '-'.join(args.methods)

    def cmd_str(iproc):
        # start from our own command line, then swap in the per-test values
        clist = copy.deepcopy(sys.argv)
        utils.remove_from_arglist(clist, '--n-tests', has_arg=True)
        utils.remove_from_arglist(clist, '--iteststart', has_arg=True)
        utils.replace_in_arglist(clist, '--outdir', args.outdir + '/' + str(iproc))
        utils.replace_in_arglist(clist, '--seed', str(args.seed + iproc))
        # clist.append('--slurm')
        return ' '.join(clist)

    iprocs = list(range(args.iteststart, args.n_tests))

    cmdfos = [{
        'cmd_str': cmd_str(iproc),
        'workdir': args.workdir + '/' + str(iproc),
        'logdir': getlogdir(iproc),
        'outfname': args.outdir + '/' + str(iproc)
    } for iproc in iprocs]

    if args.dry_run:
        for cmdfo in cmdfos:
            utils.simplerun(cmdfo['cmd_str'], dryrun=True)
        return

    # don't overwrite old log files... need to eventually fix this so it isn't necessary
    for iproc in iprocs:
        logd = getlogdir(iproc)
        if os.path.exists(logd + '/log'):
            ilog = 0
            while os.path.exists(logd + '/log.' + str(ilog)):  # find the first unused numeric suffix
                ilog += 1
            check_call(['mv', '-v', logd + '/log', logd + '/log.' + str(ilog)])

    print('  look for logs in %s' % args.outdir)
    utils.run_cmds(cmdfos, debug='write')
コード例 #25
0
ファイル: igdiscover-run.py プロジェクト: tylernstarr/partis
def update_igdiscover():
    """Update the dev IgDiscover checkout and print its version.

    Builds a small shell script (path setup from getpathcmd(), cd into
    <partis_dir>/packages/IgDiscover, git pull, activate the conda env, print
    the version) and runs it through utils.simplerun() via /tmp/tmprun.sh.
    Sets args.env_label to 'igdiscover-dev' as a side effect, and creates
    <partis_dir>/packages if it doesn't exist yet.
    """
    cmds = getpathcmd()

    # ----------------------------------------------------------------------------------------
    # For reference, the non-dev (conda package) version would use
    # args.env_label = 'igdiscover' and these commands:
    #   install:
    #     conda config --add channels defaults
    #     conda config --add channels conda-forge
    #     conda config --add channels bioconda
    #     conda create -n <env_label> igdiscover
    #     conda activate <env_label>
    #   update:
    #     conda activate <env_label>
    #     igdiscover --version
    #     conda update igdiscover
    #     igdiscover --version

    # ----------------------------------------------------------------------------------------
    # dev version:
    args.env_label = 'igdiscover-dev'
    install_dir = partis_dir + '/packages'
    if not os.path.exists(install_dir):
        os.makedirs(install_dir)

    # For reference, a first-time install of the dev version (run from <install_dir>) would be:
    #   git clone https://github.com/NBISweden/IgDiscover.git
    #   cd IgDiscover
    #   conda env create -n <env_label> -f environment.yml
    #   source activate <env_label>
    #   python3 -m pip install -e .
    #   igdiscover --version

    # update dev version:
    cmds += [
        'cd %s' % install_dir,
        'cd IgDiscover',
        'git pull',
        'source activate %s' % args.env_label,
        'igdiscover --version',
    ]

    script = '\n'.join(cmds) + '\n'
    utils.simplerun(script, cmdfname='/tmp/tmprun.sh', debug=True)
コード例 #26
0
ファイル: treegenerator.py プロジェクト: Xiujia-Yang/partis
    def read_input_tree_file(self, outfname):
        """Read user-supplied trees, relabel and rescale them, and return (ages, treestrs).

        Copies self.args.input_simulation_treefname to <outfname>, then treats
        each non-empty line of the copy as one tree string. For every tree the
        leaves are renamed t1, t2, ..., a new depth is drawn from
        self.choose_full_sequence_branch_length(), and the tree is rescaled to
        that depth. Resets self.args.n_trees to the number of trees read.

        Returns:
            ages: the depth chosen for each tree
            treestrs: the corresponding rescaled newick strings

        Raises:
            Exception: if any chosen depth is greater than 1.
        """
        src_fname = self.args.input_simulation_treefname
        if self.args.debug:
            print('  reading trees from %s' % src_fname)
        utils.simplerun('cp %s %s' % (src_fname, outfname), debug=False)

        ages, treestrs = [], []
        with open(outfname) as treefile:
            for raw_line in treefile:
                newick = raw_line.strip()
                if not newick:  # skip empty lines
                    continue
                dtree = treeutils.get_dendro_tree(treestr=newick, suppress_internal_node_taxa=True)

                # make sure root edge length is set (otherwise bppseqgen barfs)
                if dtree.seed_node.edge_length is None:
                    dtree.seed_node.edge_length = 0.

                # rename the leaves to t1, t2, etc. (it would be nice to not have to do this, but a bunch of
                # stuff in recombinator uses this to check that e.g. bppseqgen didn't screw up the ordering)
                relabels = []
                for ileaf, leaf in enumerate(dtree.leaf_node_iter()):
                    relabels.append((leaf.taxon.label, 't%d' % (ileaf + 1)))
                treeutils.translate_labels(dtree, relabels)

                age = self.choose_full_sequence_branch_length()
                if self.args.debug > 1:  # has to happen before rescaling so we can still print the old height
                    old_height = treeutils.get_mean_leaf_height(tree=dtree)
                    print('    input tree %d (rescaled depth %.3f --> %.3f):' % (len(ages), old_height, age))

                # I think this gets rescaled again for each event, so we could probably in principle avoid this
                # rescaling, but if the input depth is greater than one stuff starts breaking, so may as well do it now
                treeutils.rescale_tree(age, dtree=dtree)

                ages.append(age)
                treestrs.append(dtree.as_string(schema='newick').strip())
                if self.args.debug > 1:
                    print(utils.pad_lines(treeutils.get_ascii_tree(dtree)))

        if any(depth > 1. for depth in ages):
            raise Exception(
                "tree depths must be less than 1., but trees read from %s don't satisfy this: %s"
                % (src_fname, ages))

        n_read = len(ages)
        if n_read != self.args.n_trees:
            print('    resetting --n-trees from %d to %d to match trees read from %s' % (self.args.n_trees, n_read, src_fname))
        self.args.n_trees = n_read

        return ages, treestrs
コード例 #27
0
def run_data(args, baseoutdir, study, dset, method):
    """Build and run the ./datascripts/run.py cache-parameters command for one data set.

    Args:
        args: parsed command line arguments (reads label, no_slurm,
            n_procs_per_test, n_random_queries, check, dry_run)
        baseoutdir: not used in this function
        study: value passed to --study
        dset: value passed to --dsets
        method: inference method; anything other than 'partis' is passed
            on with --other-method
    """
    pieces = ['./datascripts/run.py cache-parameters']
    pieces.append(' --study ' + study)
    pieces.append(' --dsets ' + dset)

    assert args.label is not None  # it's got a default now, so it shouldn't anymore be None
    pieces.append(' --extra-str gls-gen-paper-' + args.label)

    if args.no_slurm:
        pieces.append(' --no-slurm')
    pieces.append(' --n-procs ' + str(args.n_procs_per_test))

    if args.n_random_queries is not None:
        assert method == 'partis'  # I don't think it works for any others a.t.m.
        pieces.append(' --n-random-queries ' + str(args.n_random_queries))
    if args.check:
        pieces.append(' --check')
    if method != 'partis':
        pieces.append(' --other-method ' + method)

    utils.simplerun(''.join(pieces), dryrun=args.dry_run)
コード例 #28
0
def run_performance_plot(args, method):
    """Run partis on the simulation file to make annotation performance plots for <method>.

    Returns without doing anything if utils.output_exists() reports that the
    performance plot dir is already there. Otherwise builds a partis
    cache-parameters command that uses the germline set inferred by <method>
    as its initial germline dir, and runs it with utils.simplerun().
    """
    perf_outdir = get_outfname(args, method, annotation_performance_plots=True)
    if utils.output_exists(args, perf_outdir):
        return

    opts = [
        args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --plot-annotation-performance',
        ' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation',
        # i.e. use the inferred glfo from <method>
        ' --initial-germline-dir ' + get_outfname(args, method, return_parent_gl_dir=True),
        ' --parameter-dir ' + perf_outdir + '/dummy-parameter-dir',
        ' --only-overall-plots --plotdir ' + perf_outdir,
        # i.e. we really want to annotate, not cache parameters, but then it'd look for a parameter dir
        ' --only-smith-waterman --leave-default-germline --dont-write-parameters',
        ' --n-procs ' + str(args.n_procs),
    ]
    if args.n_max_queries is not None:
        # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
        opts.append(' --n-max-queries ' + str(args.n_max_queries))
    if args.slurm:
        opts.append(' --batch-system slurm')
    if args.seed is not None:
        opts.append(' --seed ' + str(args.seed))

    utils.simplerun(''.join(opts), dryrun=args.dry_run)
コード例 #29
0
ファイル: bcr-phylo-run.py プロジェクト: CollinJ0/partis
def _bcr_phylo_cmd(tmpdir, naive_line, outdir, ievent):
    """Return the shell command that runs bcr-phylo's simulator.py for one event."""
    prof_cmds = ''  # '-m cProfile -s tottime -o prof.out'
    cmd = 'export TMPDIR=%s && export PATH=%s:$PATH && xvfb-run -a python %s %s/bin/simulator.py' % (tmpdir, ete_path, prof_cmds, bcr_phylo_path)

    if args.run_help:
        cmd += ' --help'
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        cmd += ' --lambda %f --lambda0 %f' % (1.5, 0.365)
        cmd += ' --n_final_seqs %d' % args.n_sim_seqs_per_generation
    elif args.stype == 'selection':
        cmd += ' --selection'
        cmd += ' --lambda %f' % args.branching_parameter
        cmd += ' --lambda0 %f' % args.base_mutation_rate
        cmd += ' --obs_times %s' % ' '.join(['%d' % t for t in args.obs_times])
        cmd += ' --n_to_sample %d' % args.n_sim_seqs_per_generation
        cmd += ' --metric_for_target_dist %s' % args.metric_for_target_distance
        cmd += ' --target_dist %d' % args.target_distance
        cmd += ' --target_count %d' % args.target_count
        cmd += ' --carry_cap %d' % args.carry_cap
        cmd += ' --observe_common_ancestors'

        # cmd += ' --n_target_clusters 1'
        # cmd += ' --target_cluster_distance 1'

        # cmd += ' --observe_based_on_affinity'  # implementation in bcr-phylo needs some work
    else:
        assert False

    cmd += ' --debug 1'
    cmd += ' --no_context'
    cmd += ' --no_plot'
    cmd += ' --outbase %s/%s' % (outdir, args.extrastr)
    cmd += ' --naive_seq %s' % naive_line['naive_seq']
    cmd += ' --random_seed %d' % (args.seed + ievent)
    return cmd


def run_bcr_phylo(naive_line, outdir, ievent):
    """Run the bcr-phylo simulator for one event, starting from <naive_line>'s naive sequence.

    Args:
        naive_line: dict with a 'naive_seq' key, passed to --naive_seq
        outdir: output directory (created if missing); output is written
            with the base name <outdir>/<args.extrastr>
        ievent: event index, added to args.seed to get this event's random seed

    A scratch directory under /tmp/<USER> is exported as TMPDIR for the
    duration of the run and removed afterwards. If anything fails (or the run
    is interrupted) the scratch directory is also removed, provided it is
    empty, so failed runs don't leave stray directories behind.
    """
    tmpdir = utils.choose_random_subdir('/tmp/%s' % os.getenv('USER'))  # this is I think just for xvfb-run
    os.makedirs(tmpdir)
    try:
        cmd = _bcr_phylo_cmd(tmpdir, naive_line, outdir, ievent)
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        utils.simplerun(cmd, shell=True, extra_str='        ', debug=True)  # , dryrun=True)
    except BaseException:
        # only remove it if that can't fail with "directory not empty", so we never hide the real error
        if not os.listdir(tmpdir):
            os.rmdir(tmpdir)
        raise
    os.rmdir(tmpdir)
コード例 #30
0
ファイル: bcr-phylo-run.py プロジェクト: AkselObdrup/partis
def rearrange():
    """Run `partis simulate` to make the naive rearrangements.

    Returns without doing anything if utils.output_exists() reports that the
    igh naive file is already there. Which options get added depends on
    args.paired_loci, args.restrict_available_genes, args.n_procs and
    args.slurm.
    """
    # just look for the merged igh file, since it's about the last to be written (and both paired subdirs may not be there)
    if utils.output_exists(args, naive_fname('igh'), outlabel='naive simu', offset=4):
        return

    # tends to get in infinite loop if you actually pass 0. for the mutation multiplier (yes, I should fix this)
    cmd_parts = ['./bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves']
    cmd_parts.append(' --debug %d --seed %d --n-sim-events %d' % (int(args.debug), args.seed, args.n_sim_events))

    out_opt = ' --paired-loci --paired-outdir %s' if args.paired_loci else ' --outfname %s'
    cmd_parts.append(out_opt % spath('naive'))

    if args.restrict_available_genes:
        assert not args.paired_loci
        cmd_parts.append(' --only-genes IGHV1-18*01:IGHJ1*01')
    if args.n_procs > 1:
        cmd_parts.append(' --n-procs %d' % args.n_procs)
    if args.slurm:
        cmd_parts.append(' --batch-system slurm')

    utils.simplerun(''.join(cmd_parts), dryrun=args.dry_run, debug=True)