Exemple #1
0
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the germline set it infers so that <outfname> exists.

    Returns immediately if utils.output_exists() says <outfname> is already there.
    If --n-random-queries is set, igdiscover is run on a subsampled fasta in <outdir>
    (an existing subsample file is reused rather than rewritten).
    The igdiscover commands are written to <outdir>/run.sh and run via utils.simplerun().
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_infname = outdir + '/' + os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = getpathcmd()
    cmds += ['conda activate %s' % args.env_label]
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir  # if args.glfo_dir is not None else 'data/germlines/ XXX human'  # can probably delete this now that --glfo-dir is required (but leaving for now, to show how it used to be in case it comes up)
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Trim the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() takes a *set of characters*, not a suffix, so rstrip('/' + locus) would
    # keep eating any trailing '/', or letters of the locus name, from the parent dir as well.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Exemple #2
0
def run_other_method(args, method):
    """Build and run the ./test/<tool>-run.py command for one of the non-partis methods.

    <method> must be 'tigger-default', 'tigger-tuned' or 'igdiscover'; the script name is the
    part of <method> before the first '-'. Does nothing if the method's output already exists.
    """
    assert method in ['tigger-default', 'tigger-tuned', 'igdiscover']  # really just to make it easier to search for this fcn
    assert args.n_max_queries is None
    if utils.output_exists(args, get_outfname(args, method)):
        return

    simfasta = utils.getprefix(args.simfname) + '.fa'
    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)

    pieces = ['./test/%s-run.py' % method.split('-')[0]]
    if method == 'tigger-tuned':
        pieces.append('--tuned-tigger-params')
    pieces.append('--infname ' + simfasta)
    pieces.append('--outfname ' + get_outfname(args, method))
    if args.species != 'human':
        pieces.append('--species %s' % args.species)
    if args.overwrite:
        pieces.append('--overwrite')
    if args.gls_gen:
        pieces.append('--gls-gen')
        # the partis methods default to this internally, but here it has to be passed explicitly
        pieces.append('--glfo-dir ' + partis_dir + '/' + args.default_germline_dir)
    else:
        pieces.append('--glfo-dir ' + args.inf_glfo_dir)
    # alleleclusterer is the only one that really uses this, but its dbg output should have the sim info
    pieces.append('--simulation-germline-dir ' + args.outdir + '/germlines/simulation')
    if method != 'igdiscover':  # igdiscover's output/intermediate files are all kept, so they go to an output dir instead
        pieces.append('--workdir ' + args.workdir + '/' + method)
    pieces.append('--n-procs ' + str(args.n_procs))
    if args.slurm:
        pieces.append('--slurm')

    utils.simplerun(' '.join(pieces), dryrun=args.dry_run)
Exemple #3
0
def simulate():
    """Run the simulation chain: rearrange(), one bcr-phylo run per naive event, then parse, write and plot.

    Each naive event gets its own <simdir>/event-<i> directory. If the mutated simulation
    file already exists, parsing, writing and plotting are skipped.
    """
    rearrange()

    glfo, naive_event_list, _cpath = utils.read_output(naive_fname())
    assert len(naive_event_list) == args.n_sim_events

    event_dirs = ['%s/event-%d' % (simdir(), ievent) for ievent, _ in enumerate(naive_event_list)]
    indexed_events = list(enumerate(zip(naive_event_list, event_dirs)))

    for ievent, (naive_line, event_dir) in indexed_events:
        run_bcr_phylo(naive_line, event_dir, ievent)

    # i guess if it crashes during the plotting just below, this'll get confused
    if utils.output_exists(args, simfname(), outlabel='mutated simu', offset=4):
        return

    mutated_events = [
        parse_bcr_phylo_output(glfo, naive_line, event_dir, ievent)
        for ievent, (naive_line, event_dir) in indexed_events
    ]

    print('  writing annotations to %s' % simfname())
    utils.write_annotations(simfname(), glfo, mutated_events, utils.simulation_headers)

    import plotting
    for event_dir, mutated_event in zip(event_dirs, mutated_events):
        plotting.plot_bcr_phylo_simulation(event_dir, mutated_event, args.extrastr, args.metric_for_target_distance)
Exemple #4
0
def run_other_method(args, method):
    """Build and run the ./test/<method>-run.py command for 'tigger' or 'igdiscover'.

    Converts the simulation csv to fasta first (without overwriting an existing fasta),
    and does nothing at all if the method's output already exists.
    """
    assert method in ['tigger', 'igdiscover']  # really just to make it easier to search for this fcn
    if utils.output_exists(args, get_outfname(args, method)):
        return

    simfasta = utils.getprefix(args.simfname) + '.fa'
    utils.csv_to_fasta(args.simfname, outfname=simfasta, overwrite=False, remove_duplicates=True)

    pieces = ['./test/%s-run.py' % method]
    pieces.append('--infname ' + simfasta)
    pieces.append('--outfname ' + get_outfname(args, method))
    if args.overwrite:
        pieces.append('--overwrite')
    if args.gls_gen:
        pieces.append('--gls-gen')
        # the partis methods default to this internally, but here it has to be passed explicitly
        pieces.append('--glfo-dir ' + partis_dir + '/data/germlines/human')
    else:
        pieces.append('--glfo-dir ' + args.inf_glfo_dir)
    if method != 'igdiscover':  # igdiscover's output/intermediate files are all kept, so they go to an output dir instead
        pieces.append('--workdir ' + args.workdir + '/' + method)
    pieces.append('--n-procs ' + str(args.n_procs))

    utils.simplerun(' '.join(pieces), dryrun=args.dry_run)
Exemple #5
0
def run_partis(infname, outfname):
    """Run `partis cache-parameters` (smith-waterman only, presto output) on <infname>.

    The aligned germline fastas for all regions are first merged into one temporary file in
    --workdir, keeping only the first entry seen for each distinct sequence; that file is
    removed again after partis has run. Does nothing if <outfname> already exists.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    # deduplicate before passing to partis (keyed by seq so duplicates are easy to spot)
    name_for_seq = {}
    for region in utils.regions:
        for sfo in utils.read_fastx(get_glfname(region, aligned=True)):
            if sfo['seq'] not in name_for_seq:
                name_for_seq[sfo['seq']] = '|'.join(sfo['infostrs'])

    aligned_germline_fname = args.workdir + '/all-aligned-gl-seqs.fa'
    with open(aligned_germline_fname, 'w') as merged_file:
        merged_file.write(''.join('>%s\n%s\n' % (name, seq) for seq, name in name_for_seq.items()))

    pieces = [
        './bin/partis cache-parameters',
        '--infname ' + infname,
        '--leave-default-germline',
        '--presto-output --only-smith-waterman',
        '--outfname ' + outfname,
    ]
    if args.glfo_dir is not None:
        pieces.append('--initial-germline-dir ' + args.glfo_dir)
    pieces.append('--aligned-germline-fname ' + aligned_germline_fname)
    pieces.append('--n-procs ' + str(args.n_procs))

    utils.simplerun(' '.join(pieces), print_time='partis annotation')

    os.remove(aligned_germline_fname)
Exemple #6
0
def run_igdiscover(infname, outfname, outdir):
    """Write an igdiscover run script to <outdir>/run.sh, make it executable, and run it.

    The script puts --condapath on PATH, then runs `igdiscover init` on <infname> and
    `igdiscover run` inside <outdir>/work. Does nothing if <outfname> already exists.
    """
    if utils.output_exists(args, outfname):
        return

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    # Use the <infname> argument (not args.infname), so the function runs on the file its caller asked for.
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']

    cmdfname = outdir + '/run.sh'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(cmds) + '\n')
    subprocess.check_call(['chmod', '+x', cmdfname])
    utils.simplerun(cmdfname, shell=True, print_time='igdiscover')
Exemple #7
0
def run_bcr_phylo(naive_line, outdir, ievent, n_total_events, uid_str_len=None):
    """Assemble the bcr-phylo simulator.py command for one event, then run it or hand it back.

    Returns None if the event's output fasta already exists, or if the command was run right
    here (--n-procs 1). Otherwise returns a dict with keys 'cmd_str', 'workdir' and 'outfname'
    describing the command. <n_total_events> is not used in this function.
    """
    if utils.output_exists(args, bcr_phylo_fasta_fname(outdir), outlabel='bcr-phylo', offset=4):
        return None

    opts = ['%s/bin/simulator.py' % bcr_phylo_path]
    if args.run_help:
        opts.append('--help')
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        opts.append('--lambda %f --lambda0 %f' % (1.5, 0.365))
        opts.append('--n_final_seqs %d' % args.n_sim_seqs_per_generation)
    elif args.stype == 'selection':
        opts += [
            '--selection',
            '--lambda %f' % args.branching_parameter,
            '--lambda0 %f' % args.base_mutation_rate,
            '--selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength),
            '--obs_times %s' % ' '.join('%d' % get_vpar_val('obs-times', t) for t in args.obs_times),
            '--n_to_sample %s' % ' '.join('%d' % get_vpar_val('n-sim-seqs-per-generation', n) for n in args.n_sim_seqs_per_generation),
            '--metric_for_target_dist %s' % args.metric_for_target_distance,
        ]
        if args.paratope_positions is not None:
            opts.append('--paratope_positions %s' % args.paratope_positions)
        opts += [
            '--target_dist %d' % args.target_distance,
            '--target_count %d' % args.target_count,
            '--carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap),
        ]
        if not args.dont_observe_common_ancestors:
            opts.append('--observe_common_ancestors')
        if args.leaf_sampling_scheme is not None:
            opts.append('--leaf_sampling_scheme %s' % args.leaf_sampling_scheme)
        if args.n_target_clusters is not None:
            opts.append('--n_target_clusters %d' % args.n_target_clusters)
        # opts.append('--target_cluster_distance 1')
        if args.min_target_distance is not None:
            opts.append('--min_target_distance %d' % args.min_target_distance)
    else:
        assert False

    opts.append('--debug %d' % args.debug)
    opts.append('--n_tries 1000')
    if args.context_depend == 0:
        opts.append('--no_context')
    opts.append('--no_plot')
    if args.only_csv_plots:
        opts.append('--dont_write_hists')
    opts.append('--outbase %s/%s' % (outdir, args.extrastr))
    opts.append('--random_seed %d' % (args.seed + ievent))  # a different seed for each event
    if uid_str_len is not None:
        opts.append('--uid_str_len %d' % uid_str_len)
    opts.append('--naive_seq %s' % naive_line['naive_seq'])
    cmd = ' '.join(opts)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if args.n_procs == 1:
        utils.run_ete_script(cmd, ete_path)  # NOTE kind of hard to add a --dry-run option, since we have to loop over the events we made in rearrange()
        return None

    # with more than one proc, don't run here: return the command info to the caller instead
    ete_cmd, _ = utils.run_ete_script(cmd, ete_path, return_for_cmdfos=True, tmpdir=outdir)
    return {'cmd_str' : ete_cmd, 'workdir' : outdir, 'outfname' : bcr_phylo_fasta_fname(outdir)}
Exemple #8
0
def run_changeo(infname, igblast_outfname, outfname):
    """Run changeo's `MakeDb.py igblast` on the igblast output for <infname>.

    The aligned germline fasta of every region in utils.regions is passed via -r.
    Does nothing if <outfname> already exists; <outfname> is not otherwise used here.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    aligned_glfnames = ' '.join(get_glfname(region, aligned=True) for region in utils.regions)
    makedb = args.changeo_path + '/bin/MakeDb.py igblast'
    options = '-i %s -s %s -r %s --regions --scores' % (igblast_outfname, infname, aligned_glfnames)
    utils.simplerun(makedb + ' ' + options, print_time='changeo')
Exemple #9
0
def cache_parameters():
    """Run `partis cache-parameters` on the simulation file, unless <param_dir>/hmm/hmms already exists."""
    hmm_dir = param_dir() + '/hmm/hmms'
    if utils.output_exists(args, hmm_dir, outlabel='parameters', offset=4):
        return

    pieces = [
        './bin/partis cache-parameters',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--n-procs %d' % args.n_procs,
        '--seed %d' % args.seed,
    ]
    utils.simplerun(' '.join(pieces), debug=True)  #, dryrun=True)
Exemple #10
0
def run_partis_parameter_cache(args, method):
    """Run `partis cache-parameters` (smith-waterman only) on the simulation file for <method>.

    <method> must be 'partis' (with allele finding debug output and simulation info) or 'full'
    (leaves the default germline untouched). Parameters go to <outdir>/<method> and plots to
    <outdir>/<method>/plots. Any old sw cache files in the parameter dir are removed first.
    Does nothing if the method's output already exists.
    """
    if utils.output_exists(args, get_outfname(args, method)):
        return

    paramdir = args.outdir + '/' + method
    plotdir = paramdir + '/plots'

    # remove any old sw cache files
    for cachefname in glob.glob(paramdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, args.locus)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    pieces = [
        args.partis_path + ' cache-parameters',
        '--infname ' + args.simfname,
        '--only-smith-waterman',
        '--initial-germline-dir %s' % args.default_germline_dir,
    ]
    if method == 'partis':
        pieces.append('--debug-allele-finding')  # --always-find-new-alleles
        # alleleclusterer is the only one that really uses this, but its dbg output should have the sim info
        pieces.append('--is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation')
        if args.allele_cluster:
            pieces.append('--allele-cluster')
            if args.kmeans_allele_cluster:
                pieces.append('--kmeans-allele-cluster')
    elif method == 'full':
        pieces.append('--leave-default-germline')
    else:
        assert False

    if args.species != 'human':
        pieces.append('--species %s' % args.species)

    pieces.append('--n-procs ' + str(args.n_procs))
    if args.n_max_queries is not None:
        # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
        pieces.append('--n-max-queries ' + str(args.n_max_queries))
    if args.slurm:
        pieces.append('--batch-system slurm')

    if not args.gls_gen:  # otherwise it uses the default (full) germline dir
        # NOTE(review): this is the second --initial-germline-dir on the command line; presumably the later one wins -- confirm
        pieces.append('--initial-germline-dir ' + args.inf_glfo_dir)  # --dont-remove-unlikely-alleles

    pieces.append('--parameter-dir ' + paramdir)
    pieces.append('--plotdir ' + plotdir)
    if args.seed is not None:
        pieces.append('--seed ' + str(args.seed))
    if args.plot_and_fit_absolutely_everything is not None:
        pieces.append('--plot-and-fit-absolutely-everything ' + str(args.plot_and_fit_absolutely_everything))

    # NOTE(review): other functions here read args.dry_run rather than args.dryrun -- confirm which one this script defines
    utils.simplerun(' '.join(pieces), dryrun=args.dryrun)
Exemple #11
0
def rearrange():
    """Run `partis simulate` from scratch to make the naive (nearly unmutated) events, unless they already exist."""
    if utils.output_exists(args, naive_fname(), outlabel='naive simu', offset=4):
        return

    pieces = [
        './bin/partis simulate',
        '--simulate-from-scratch',
        '--mutation-multiplier 0.0001',  # tends to get in infinite loop if you actually pass 0. (yes, I should fix this)
        '--n-leaves 1',
        '--constant-number-of-leaves',
        '--debug %d' % int(args.debug),
        '--seed %d' % args.seed,
        '--outfname %s' % naive_fname(),
        '--n-sim-events %d' % args.n_sim_events,
    ]
    utils.simplerun(' '.join(pieces), debug=True)
Exemple #12
0
def run_bcr_phylo(naive_line, outdir, ievent, n_total_events):
    """Assemble and run the bcr-phylo simulator.py command for one naive event.

    Output goes under <outdir> (created if missing), and the random seed is --seed plus <ievent>.
    Longer uids are requested when <n_total_events> is more than one.
    Does nothing if the event's output fasta already exists.
    """
    if utils.output_exists(args, bcr_phylo_fasta_fname(outdir), outlabel='bcr-phylo', offset=4):
        return

    opts = ['%s/bin/simulator.py' % bcr_phylo_path]
    if args.run_help:
        opts.append('--help')
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        opts.append('--lambda %f --lambda0 %f' % (1.5, 0.365))
        opts.append('--n_final_seqs %d' % args.n_sim_seqs_per_generation)
    elif args.stype == 'selection':
        opts += [
            '--selection',
            '--lambda %f' % args.branching_parameter,
            '--lambda0 %f' % args.base_mutation_rate,
            '--selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength),
            '--obs_times %s' % ' '.join('%d' % get_vpar_val('obs-times', t) for t in args.obs_times),
            '--n_to_sample %s' % ' '.join('%d' % get_vpar_val('n-sim-seqs-per-generation', n) for n in args.n_sim_seqs_per_generation),
            '--metric_for_target_dist %s' % args.metric_for_target_distance,
            '--target_dist %d' % args.target_distance,
            '--target_count %d' % args.target_count,
            '--carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap),
        ]
        if not args.dont_observe_common_ancestors:
            opts.append('--observe_common_ancestors')

        # opts.append('--n_target_clusters 1')
        # opts.append('--target_cluster_distance 1')

        # opts.append('--observe_based_on_affinity')  # implementation in bcr-phylo needs some work
    else:
        assert False

    opts += [
        '--debug %d' % args.debug,
        '--n_tries 30',
        '--no_context',
        '--no_plot',
        '--outbase %s/%s' % (outdir, args.extrastr),
        '--random_seed %d' % (args.seed + ievent),
    ]
    if n_total_events > 1:  # if the final sample's going to contain many trees, it's worth making the uids longer so there's fewer collisions/duplicates
        opts.append('--uid_str_len 7')
    opts.append('--naive_seq %s' % naive_line['naive_seq'])

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # NOTE kind of hard to add a --dry-run option, since we have to loop over the events we made in rearrange()
    utils.run_ete_script(' '.join(opts), ete_path)
Exemple #13
0
def cache_parameters():
    """Run `partis cache-parameters --no-indels` on the simulation file, unless <param_dir>/hmm/hmms already exists."""
    if utils.output_exists(args, param_dir() + '/hmm/hmms', outlabel='parameters', offset=4):
        return

    # forbid indels because in the very rare cases when we call them, they're always wrong, and then
    # they screw up the simultaneous true clonal seqs option
    pieces = [
        './bin/partis cache-parameters',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--seed %d' % args.seed,
        '--no-indels',
    ]
    if args.n_procs > 1:
        pieces.append('--n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append('--batch-system slurm')
    if args.n_max_queries is not None:
        pieces.append('--n-max-queries %d' % args.n_max_queries)
    utils.simplerun(' '.join(pieces), debug=True)  #, dryrun=True)
Exemple #14
0
def partition():
    """Run `partis partition` (with tree metrics and plots) on the simulation file, unless the partition file already exists."""
    if utils.output_exists(args, partition_fname(), outlabel='partition', offset=4):
        return

    pieces = [
        './bin/partis partition',
        '--n-final-clusters 1',
        '--write-additional-cluster-annotations 0:5',
        '--is-simu',
        '--get-tree-metrics',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--plotdir %s' % (infdir() + '/plots'),
        '--n-procs %d' % args.n_procs,
        '--outfname %s' % partition_fname(),
        '--seed %d' % args.seed,
    ]
    if args.lb_tau is not None:
        pieces.append('--lb-tau %f' % args.lb_tau)
    utils.simplerun(' '.join(pieces), debug=True)  #, dryrun=True)
Exemple #15
0
def rearrange():
    """Run `partis simulate` from scratch to make the naive (nearly unmutated) events, unless they already exist.

    Optionally restricts the available genes, and passes --n-procs and slurm options through.
    """
    if utils.output_exists(args, naive_fname(), outlabel='naive simu', offset=4):
        return

    # the mutation multiplier is tiny rather than zero, since it tends to get in infinite loop if you actually pass 0. (yes, I should fix this)
    base_cmd = './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves'
    run_opts = '--debug %d --seed %d --outfname %s --n-sim-events %d' % (int(args.debug), args.seed, naive_fname(), args.n_sim_events)

    pieces = [base_cmd, run_opts]
    if args.restrict_available_genes:
        pieces.append('--only-genes IGHV1-18*01:IGHJ1*01')
    if args.n_procs > 1:
        pieces.append('--n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append('--batch-system slurm')
    utils.simplerun(' '.join(pieces), debug=True)
Exemple #16
0
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the germline set it infers so that <outfname> exists.

    Returns immediately if utils.output_exists() says <outfname> is already there.
    If --n-random-queries is set, igdiscover is run on a subsampled fasta in <outdir>
    (an existing subsample file is reused rather than rewritten).
    The igdiscover commands are written to <outdir>/run.sh and run via utils.simplerun().
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        suffix = utils.getsuffix(infname)
        sub_infname = outdir + '/' + os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print('    --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Trim the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() takes a *set of characters*, not a suffix, so rstrip('/' + locus) would
    # keep eating any trailing '/', or letters of the locus name, from the parent dir as well.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
Exemple #17
0
def write_locus_file(locus, ofos, lpair=None, extra_str='  '):
    """Write the sequences in <ofos> as fasta to the paired-locus file for <locus> (and <lpair>) under --outdir.

    Each entry of <ofos> needs 'name' and 'seq' keys. If <ofos> is empty, an empty file is
    written and nothing is printed. Does nothing if the output file already exists.
    """
    ofn = utils.paired_fn(args.outdir, locus=locus, lpair=lpair)
    n_seqs = len(ofos)
    # NOTE not really sure this does anything (or if i want it) now that I'm cleaning/looking for the whole dir at the start of this script
    if utils.output_exists(args, ofn, leave_zero_len=(n_seqs == 0), offset=4):
        return

    if not os.path.exists(os.path.dirname(ofn)):
        os.makedirs(os.path.dirname(ofn))

    if n_seqs == 0:
        with open(ofn, 'w'):  # just create (or truncate) the file
            pass
        return

    print('%s%s: %d to %s/%s' % (extra_str, locus, n_seqs, os.path.basename(os.path.dirname(ofn)), os.path.basename(ofn)))
    with open(ofn, 'w') as lfile:
        lfile.writelines('>%s\n%s\n' % (sfo['name'], sfo['seq']) for sfo in ofos)
Exemple #18
0
def rearrange():
    """Run `partis simulate` from scratch to make the naive (nearly unmutated) events, unless they already exist."""
    if utils.output_exists(args, naive_fname(), outlabel='naive simu', offset=4):
        return

    pieces = [
        # the mutation multiplier is tiny rather than zero, since it tends to get in infinite loop if you actually pass 0. (yes, I should fix this)
        './bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves',
        '--debug %d --seed %d --outfname %s --n-sim-events %d' % (int(args.debug), args.seed, naive_fname(), args.n_sim_events),
    ]
    # only parallelize when --n-sim-events is a multiple of --n-procs: otherwise partis simulate
    # doesn't give you exactly the number of events you asked for
    can_split_evenly = args.n_sim_events % args.n_procs == 0 if args.n_procs > 1 else False
    if can_split_evenly:
        pieces.append('--n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append('--batch-system slurm')
    utils.simplerun(' '.join(pieces), debug=True)
Exemple #19
0
def partition():
    """Run `partis partition` on the simulation file, unless the partition file already exists.

    Tree metrics (with plots under <infdir>/plots) are included unless --dont-get-tree-metrics is set.
    """
    if utils.output_exists(args, partition_fname(), outlabel='partition', offset=4):
        return

    pieces = [
        './bin/partis partition',
        '--simultaneous-true-clonal-seqs',
        '--is-simu',
        '--infname %s' % simfname(),
        '--parameter-dir %s' % param_dir(),
        '--n-procs %d' % args.n_procs,
        '--outfname %s' % partition_fname(),
        '--seed %d' % args.seed,
    ]
    #  --write-additional-cluster-annotations 0:5  # I don't think there was really a good reason for having this
    if not args.dont_get_tree_metrics:
        pieces.append('--get-tree-metrics --plotdir %s' % (infdir() + '/plots'))
    if args.lb_tau is not None:
        pieces.append('--lb-tau %f' % args.lb_tau)
    utils.simplerun(' '.join(pieces), debug=True)  #, dryrun=True)
Exemple #20
0
def simulate(args):
    """Write the simulation germline set to <outdir>/germlines/simulation and run `partis simulate` with it.

    With --gls-gen a whole germline set is generated from the default germline dir; otherwise
    only --sim-v-genes plus --dj-genes are read, new alleles are added, and allele prevalence
    frequencies are written if they were specified. Does nothing if the simulation file exists.
    """
    if utils.output_exists(args, args.simfname):
        return

    pieces = [
        args.partis_path + ' simulate',
        '--n-sim-events ' + str(args.n_sim_events),
        '--outfname ' + args.simfname,
        '--n-leaves ' + str(args.n_leaves),
        '--rearrange-from-scratch',
        '--shm-parameter-dir ' + partis_dir + '/data/recombinator/scratch-parameters',
    ]
    if args.n_leaf_distribution is None:
        pieces.append('--constant-number-of-leaves')
    else:
        pieces.append('--n-leaf-distribution ' + args.n_leaf_distribution)
    if args.mut_mult is not None:
        pieces.append('--mutation-multiplier ' + str(args.mut_mult))
    if args.root_mrca_weibull_parameter is not None:
        pieces.append('--root-mrca-weibull-parameter ' + str(args.root_mrca_weibull_parameter))

    pieces.append('--n-procs ' + str(args.n_procs))
    if args.slurm:
        pieces.append('--batch-system slurm --subsimproc')

    allele_prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'

    # figure what genes we're using
    if args.gls_gen:
        assert args.sim_v_genes is None and args.allele_prevalence_freqs is None

        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus)
        glutils.remove_v_genes_with_bad_cysteines(sglfo)
        glutils.generate_germline_set(
            sglfo,
            args.n_genes_per_region,
            args.n_sim_alleles_per_gene,
            args.min_allele_prevalence_freq,
            allele_prevalence_fname,
            new_allele_info=args.new_allele_info,
            dont_remove_template_genes=args.dont_remove_template_genes,
            debug=True,
        )
        pieces.append('--allele-prevalence-fname ' + allele_prevalence_fname)
    else:
        sglfo = glutils.read_glfo(args.default_germline_dir, locus=args.locus, only_genes=(args.sim_v_genes + args.dj_genes))
        # NOTE template gene removal is the default for glutils.generate_germline_set
        added_snp_names = glutils.generate_new_alleles(sglfo, args.new_allele_info, debug=True, remove_template_genes=(not args.dont_remove_template_genes))

        if args.allele_prevalence_freqs is not None:
            if not utils.is_normed(args.allele_prevalence_freqs):
                raise Exception('--allele-prevalence-freqs %s not normalized' % args.allele_prevalence_freqs)
            if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                raise Exception('--allele-prevalence-freqs %d not the same length as sglfo %d' % (len(args.allele_prevalence_freqs), len(sglfo['seqs']['v'])))
            if len(added_snp_names) == 0:
                gene_list = sorted(sglfo['seqs']['v'])
            else:
                # NOTE(review): the order of list(set(...)) is what pairs genes with frequencies below -- confirm this is intended
                gene_list = list(set(args.sim_v_genes)) + added_snp_names
            prevalence_freqs = {'v' : dict(zip(gene_list, args.allele_prevalence_freqs)), 'd' : {}, 'j' : {}}
            glutils.write_allele_prevalence_freqs(prevalence_freqs, allele_prevalence_fname)
            pieces.append('--allele-prevalence-fname ' + allele_prevalence_fname)

    glutils.write_glfo(args.outdir + '/germlines/simulation', sglfo)
    pieces.append('--initial-germline-dir ' + args.outdir + '/germlines/simulation')
    # glutils.print_glfo(sglfo)

    # run simulation
    if args.seed is not None:
        pieces.append('--seed ' + str(args.seed))
    utils.simplerun(' '.join(pieces), dryrun=args.dry_run)
Exemple #21
0
def run_tigger(infname, outfname, outdir):
    """Run tigger (through an R command file) on the annotations in <infname>, and write the resulting germline set.

    The R commands read <infname> as a tab-separated file, find novel alleles, infer the genotype,
    and write it to <outdir>/tigger.fasta. That fasta is then merged into the germline info read
    from --glfo-dir (or 'data/germlines/human'): alleles tigger reports that are missing from it
    are added, and alleles of --region that tigger does not report are removed. The result is
    written so that <outfname> exists. Does nothing if <outfname> is already there.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = ['library(tigger)', 'library(dplyr)']
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]  #
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene' : seqfo['name'], 'seq' : seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:  # names with a '+' are treated as <template gene>+<something>
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n    %s\n    %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq']))
    # remove them afterwards so we can use existing ones to get codon info
    # NOTE iterate over a snapshot of the gene names, since glutils.remove_gene() presumably deletes
    # from the very dict we'd otherwise be looping over
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # Trim the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() takes a *set of characters*, not a suffix, so rstrip('/' + locus) would
    # keep eating any trailing '/', or letters of the locus name, from the parent dir as well.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
Exemple #22
0
def cache_parameters():
    """Run `partis cache-parameters --no-indels` on the mutated simulation, unless the parameters already exist.

    With --paired-loci the input and output are passed as paired dirs; otherwise as
    --infname and --parameter-dir.
    """
    if utils.output_exists(args, ifname('params'), outlabel='parameters', offset=4):
        return

    # forbid indels because in the very rare cases when we call them, they're always wrong, and then
    # they screw up the simultaneous true clonal seqs option
    pieces = ['./bin/partis cache-parameters --seed %d --no-indels' % args.seed]
    if args.paired_loci:
        io_fmt = '--paired-loci --paired-indir %s --paired-outdir %s'
    else:
        io_fmt = '--infname %s --parameter-dir %s'
    pieces.append(io_fmt % (spath('mutated'), ipath('params')))
    if args.n_procs > 1:
        pieces.append('--n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append('--batch-system slurm')
    if args.n_max_queries is not None:
        pieces.append('--n-max-queries %d' % args.n_max_queries)
    utils.simplerun(' '.join(pieces), debug=True, dryrun=args.dry_run)
Exemple #23
0
def run_igblast(infname, outfname):
    """Run igblastn on <infname>, writing its output (outfmt 7) to <outfname>.

    Returns without doing anything if utils.output_exists() says <outfname> is already there.
    The command is run after cd'ing to args.igbdir, with the human germline databases and aux file hard coded (so --glfo-dir is ignored, hence the warning).

    NOTE since we cd to args.igbdir before running, <infname> and <outfname> are interpreted relative to that dir unless they're absolute paths.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    if args.glfo_dir is not None:
        print('%s --glfo-dir isn\'t getting plugged in to igblast/changeo (would need to rebuild igblast db)' % utils.color('red', 'warning'))

    igblast_args = [
        './igblastn',
        # each segment gets its own database: this used to pass the V database (human_gl_V) for -germline_db_D, which would mean D calls were made against V genes
        '-germline_db_V human_gl_V -germline_db_D human_gl_D -germline_db_J human_gl_J',
        '-auxiliary_data optional_file/human_gl.aux',
        '-domain_system imgt -ig_seqtype Ig -organism human -outfmt \'7 std qseq sseq btop\'',
        '-num_threads %d' % args.n_procs,
        '-query ' + infname + ' -out ' + outfname,
    ]

    cmd = 'cd %s; %s' % (args.igbdir, ' '.join(igblast_args))
    utils.simplerun(cmd, shell=True, print_time='igblast')
Exemple #24
0
def run_performance_plot(args, method):
    """Make annotation performance plots for the germline set inferred by <method>.

    Runs partis cache-parameters with --plot-annotation-performance on args.simfname, using the germline dir from get_outfname(..., return_parent_gl_dir=True) as the initial germline dir, and writing plots to the dir from get_outfname(..., annotation_performance_plots=True).
    Returns without doing anything if utils.output_exists() says that plot dir is already there.
    """
    perf_outdir = get_outfname(args, method, annotation_performance_plots=True)
    if utils.output_exists(args, perf_outdir):
        return

    words = [
        args.partis_path,
        'cache-parameters --infname',
        args.simfname,
        '--plot-annotation-performance',
        '--is-simu --simulation-germline-dir',
        args.outdir + '/germlines/simulation',
        '--initial-germline-dir',
        get_outfname(args, method, return_parent_gl_dir=True),  # i.e. use the inferred glfo from <method>
        '--parameter-dir',
        perf_outdir + '/dummy-parameter-dir',
        '--only-overall-plots --plotdir',
        perf_outdir,
        '--only-smith-waterman --leave-default-germline --dont-write-parameters',  # i.e. we really want to annotate, not cache parameters, but then it'd look for a parameter dir
        '--n-procs',
        str(args.n_procs),
    ]
    if args.n_max_queries is not None:
        words += ['--n-max-queries', str(args.n_max_queries)]  # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
    if args.slurm:
        words += ['--batch-system slurm']
    if args.seed is not None:
        words += ['--seed', str(args.seed)]

    utils.simplerun(' '.join(words), dryrun=args.dry_run)
Exemple #25
0
def rearrange():
    """Run partis simulate (from scratch) to make the naive rearrangement events, written to spath('naive').

    Returns without doing anything if utils.output_exists() says that naive_fname('igh') is already there.
    """
    # just look for the merged igh file, since it's about the last to be written (and both paired subdirs may not be there)
    if utils.output_exists(args, naive_fname('igh'), outlabel='naive simu', offset=4):
        return

    # tends to get in infinite loop if you actually pass 0. for the mutation multiplier (yes, I should fix this)
    pieces = ['./bin/partis simulate --simulate-from-scratch --mutation-multiplier 0.0001 --n-leaves 1 --constant-number-of-leaves']
    pieces.append(' --debug %d --seed %d --n-sim-events %d' % (int(args.debug), args.seed, args.n_sim_events))

    out_template = ' --paired-loci --paired-outdir %s' if args.paired_loci else ' --outfname %s'
    pieces.append(out_template % spath('naive'))

    if args.restrict_available_genes:
        assert not args.paired_loci
        pieces.append(' --only-genes IGHV1-18*01:IGHJ1*01')
    if args.n_procs > 1:
        pieces.append(' --n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append(' --batch-system slurm')

    utils.simplerun(''.join(pieces), dryrun=args.dry_run, debug=True)
Exemple #26
0
def simulate():
    """Simulate naive rearrangements, run bcr-phylo on each of them, and write the resulting mutated events to simfname().

    Steps:
      - rearrange() makes the naive events (it skips itself if they're already there), which are then read from naive_fname()
      - run_bcr_phylo() is called for each naive event, with output going to <simdir()>/event-<i>; if it returns a command info dict (rather than None), the command is collected, and the collected commands are run together with utils.run_cmds() when args.n_procs > 1
      - unless simfname() already exists, the bcr-phylo output for each event is parsed and all events are written to simfname()
      - unless args.only_csv_plots is set, lbplotting.plot_bcr_phylo_simulation() is called for each event
    """
    rearrange()

    glfo, naive_event_list, _ = utils.read_output(naive_fname())
    assert len(naive_event_list) == args.n_sim_events
    n_events = len(naive_event_list)

    outdirs = ['%s/event-%d' % (simdir(), i) for i in range(n_events)]

    start = time.time()
    cmdfos = []
    if args.n_procs > 1:
        print('    starting %d events' % n_events)
    # if the final sample's going to contain many trees, it's worth making the uids longer so there's fewer collisions/duplicates
    # NOTE uses log10() rather than log(x, 10), since the latter is inexact for some powers of ten (e.g. math.log(1000, 10) is 2.9999999999999996, which truncates to 2)
    uid_str_len = 6 + int(math.log10(n_events))
    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        if args.n_sim_events > 1 and args.n_procs == 1:
            print('  %s %d' % (utils.color('blue', 'ievent'), ievent))
        cfo = run_bcr_phylo(naive_line, outdir, ievent, n_events, uid_str_len=uid_str_len)  # if n_procs > 1, doesn't run, just returns cfo
        if cfo is not None:
            print('      %s %s' % (utils.color('red', 'run'), cfo['cmd_str']))
            cmdfos.append(cfo)
    if args.n_procs > 1 and len(cmdfos) > 0:
        utils.run_cmds(cmdfos, shell=True, n_max_procs=args.n_procs, batch_system='slurm' if args.slurm else None, allow_failure=True, debug='print')
    print('  bcr-phylo run time: %.1fs' % (time.time() - start))

    if utils.output_exists(args, simfname(), outlabel='mutated simu', offset=4):  # i guess if it crashes during the plotting just below, this'll get confused
        return

    start = time.time()
    mutated_events = []
    for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)):
        mutated_events.append(parse_bcr_phylo_output(glfo, naive_line, outdir, ievent))
    print('  parsing time: %.1fs' % (time.time() - start))

    print('  writing annotations to %s' % simfname())
    utils.write_annotations(simfname(), glfo, mutated_events, utils.simulation_headers)

    if not args.only_csv_plots:
        import lbplotting  # only imported when we're actually going to make these plots
        for outdir, event in zip(outdirs, mutated_events):
            lbplotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr, lbplotting.metric_for_target_distance_labels[args.metric_for_target_distance])
Exemple #27
0
def run_igblast(infname, outfname):
    """Run igblastn on <infname> (or on a random subset of it), writing its output (outfmt 7) to <outfname>.

    Returns without doing anything if utils.output_exists() says <outfname> is already there.
    If args.n_random_queries is set, igblast is instead run on a fasta file with that many queries, in the same dir as <outfname> (written here if it doesn't already exist).
    The commands are written to <args.workdir>/run.sh, and are run from <args.igbdir>/<args.locus> with args.condapath prepended to PATH.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    if args.glfo_dir is not None:
        print('%s --glfo-dir isn\'t getting plugged in to igblast/changeo (would need to rebuild igblast db)' % utils.color('red', 'warning'))

    n_random = args.n_random_queries
    if n_random is not None:
        # same name as the input file, but with the number of queries inserted before the suffix
        suffix = utils.getsuffix(infname)
        sub_basename = os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (n_random, suffix)))
        sub_infname = os.path.dirname(outfname) + '/' + sub_basename
        if os.path.exists(sub_infname):
            print('    --n-random-queries: leaving existing fasta for igblast (hopefully it has %d queries)' % n_random)
        else:
            print('    --n-random-queries: writing new fasta for igblast (%d queries)' % n_random)
            seqfos = utils.read_fastx(infname, n_random_queries=n_random)
            with open(sub_infname, 'w') as sub_infile:
                sub_infile.writelines('>%s\n%s\n' % (sfo['name'], sfo['seq']) for sfo in seqfos)
        infname = sub_infname

    # one germline db argument for each region
    db_args = ''.join(' -germline_db_%s %s%s-unaligned.fasta' % (region.upper(), args.locus, region) for region in utils.regions)
    igblast_cmd = 'igblastn' + db_args
    igblast_cmd += ' -auxiliary_data optional_file/%s_gl.aux' % args.species
    igblast_cmd += ' -domain_system imgt -ig_seqtype Ig -organism %s -outfmt \'7 std qseq sseq btop\'' % args.species
    igblast_cmd += ' -num_threads %d' % utils.auto_n_procs()
    igblast_cmd += ' -query ' + infname + ' -out ' + outfname

    script_lines = [
        '#!/bin/bash',
        'cd %s/%s' % (args.igbdir, args.locus),
        'export PATH=%s:$PATH' % args.condapath,
        igblast_cmd,
    ]
    utils.simplerun('\n'.join(script_lines) + '\n', cmdfname=args.workdir + '/run.sh')
Exemple #28
0
def partition():
    """Run partis partition on spath('mutated'), writing the result to ipath('partition').

    Returns without doing anything if utils.output_exists() says that ifname('partition') is already there.
    With args.paired_loci set, the input and output are passed as paired dirs, otherwise as --infname/--outfname together with the parameter dir ipath('params').
    Selection metrics (and plots) are requested unless args.dont_get_tree_metrics is set.
    """
    if utils.output_exists(args, ifname('partition'), outlabel='partition', offset=4):
        return

    pieces = ['./bin/partis partition --simultaneous-true-clonal-seqs --is-simu --seed %d' % args.seed]

    if args.paired_loci:
        io_template = ' --paired-loci --paired-indir %s --paired-outdir %s'
    else:
        # the parameter dir gets filled in now, leaving two %s for the input and output paths
        io_template = ' --infname %%s --parameter-dir %s --outfname %%s' % ipath('params')
    pieces.append(io_template % (spath('mutated'), ipath('partition')))

    #  --write-additional-cluster-annotations 0:5  # I don't think there was really a good reason for having this
    if not args.dont_get_tree_metrics:
        plotdir = 'paired-outdir' if args.paired_loci else ipath('plots')
        pieces.append(' --get-selection-metrics --plotdir %s' % plotdir)
    if args.lb_tau is not None:
        pieces.append(' --lb-tau %f' % args.lb_tau)
    if args.n_procs > 1:
        pieces.append(' --n-procs %d' % args.n_procs)
    if args.slurm:
        pieces.append(' --batch-system slurm')
    if args.n_max_queries is not None:
        pieces.append(' --n-max-queries %d' % args.n_max_queries)

    utils.simplerun(''.join(pieces), debug=True, dryrun=args.dry_run)
Exemple #29
0
def run_tigger(infname, outfname, outdir):
    """Run tigger (via an R script) on the annotations in <infname>, and write the germline set that it infers such that it ends up at <outfname>.

    Returns without doing anything if utils.output_exists() says <outfname> is already there. Otherwise:
      - writes an R command file to <args.workdir>/tigger-in.cmd, which reads <infname> as a tab-separated table, then runs findNovelAlleles(), inferGenotype(), and genotypeFasta(), and writes the resulting fasta to <outdir>/tigger.fasta
      - runs that with R, then prints and removes the 'out' and 'err' files in args.workdir (presumably written by utils.run_cmd(), since args.workdir is passed as its logdir)
      - converts the tigger fasta to a glfo with glutils.create_glfo_from_fasta() and writes it with glutils.write_glfo()
      - removes the R command file

    If R exits with non-zero status because not enough sequences were assigned to any germline, an empty fasta is used instead; for any other failure we print R's out and err files and exit with R's return code.
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    rcmds = [
        'library(ggplot2)',
        'library(tigger, warn.conflicts=FALSE)',
        'library(dplyr, warn.conflicts=FALSE)',
    ]
    # rcmds += ['data(sample_db, germline_ighv)']

    db_name = 'annotations'
    gls_name = 'gls'
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]  # NOTE this puts a literal tab character in the R script
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    tigger_outfname = outdir + '/tigger.fasta'
    find_novel_argstr = '%s, %s, nproc=%d' % (db_name, gls_name, utils.auto_n_procs())
    if args.tuned_tigger_params:
        germline_min = 5  # only analyze genes which correspond to at least this many V calls (default 200)
        min_seqs = 5  # minimum number of total sequences
        j_max = 0.95  # of sequences which align perfectly (i.e. zero mutation?) to a new allele, no more than this fraction can correspond to each junction length + j gene combination (default 0.15)
        find_novel_argstr += ', germline_min=%d, min_seqs=%d, j_max=%f' % (germline_min, min_seqs, j_max)
    rcmds += ['novel_df = findNovelAlleles(%s)' % find_novel_argstr]
    # rcmds += ['sessionInfo()']
    rcmds += ['print(novel_df)']
    rcmds += ['geno = inferGenotype(%s, find_unmutated = TRUE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]
    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname

    cmdfo = {'cmd_str': cmdstr, 'logdir': args.workdir, 'env': os.environ}
    proc = utils.run_cmd(cmdfo)
    while proc.poll() is None:  # wait for R to finish
        time.sleep(0.01)
    if proc.returncode != 0:  # damn thing crashes if it thinks the sample size is small
        with open(args.workdir + '/err') as ferr:
            errstr = ''.join(ferr.readlines())
        if 'Not enough sample sequences were assigned to any germline' in errstr:
            # write an empty fasta, so that everything below still has a file to read
            with open(tigger_outfname, 'w') as dummy_outfasta:
                dummy_outfasta.write('')
        else:
            subprocess.check_call(['cat', args.workdir + '/out'])
            subprocess.check_call(['cat', args.workdir + '/err'])
            sys.exit(proc.returncode)

    for oe in ['err', 'out']:
        with open(args.workdir + '/' + oe) as oefile:
            print(''.join(oefile.readlines()))
        os.remove(args.workdir + '/' + oe)

    # post-process tigger .fa
    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(
        tigger_outfname,
        args.locus,
        args.region,
        template_gldir,
        simulation_germline_dir=args.simulation_germline_dir)

    # Remove the trailing '/<locus>' from the dir that <outfname> is in.
    # NOTE can't use str.rstrip() for this: it strips a set of *characters* rather than a suffix, so it also eats the end of any parent dir name that happens to end in those characters (e.g. '.../partis-igh/igh' --> '.../partis-'), which then trips the assertion below.
    locus_subdir = '/' + args.locus
    out_gldir = os.path.dirname(outfname)
    if out_gldir.endswith(locus_subdir):
        out_gldir = out_gldir[:-len(locus_subdir)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
Exemple #30
0
    '--droplet-id-separator',
    default='_',
    help=
    'everything in the sequence id before this character is treated as the droplet id, e.g. for the default, the uid AAACGGGCAAGCGAGT-1_contig_2 has a droplet id of AAACGGGCAAGCGAGT-1'
)
# if this isn't set and the output already exists, we exit without doing anything (see the utils.output_exists() check below, which gets passed <args>)
parser.add_argument('--overwrite', action='store_true')
# NOTE(review): the default here is -1 rather than None -- presumably utils.read_fastx() treats that as "no limit", but confirm
parser.add_argument(
    '--n-max-queries',
    type=int,
    default=-1,
    help=
    'Maximum number of query sequences to read from input file, starting from beginning of file'
)
args = parser.parse_args()

# bail out early (with exit status 0) if the output's already there
if utils.output_exists(args, args.outfname, offset=4, debug=False):
    print '  extract-pairing-info.py output exists and --overwrite was not set, so not doing anything: %s' % args.outfname
    sys.exit(0)

# read the input sequences, then group their names by droplet id: <droplet_ids> maps each droplet id to the list of names of the sequences with that id (in the order they appear in <seqfos>)
seqfos = utils.read_fastx(args.infname, n_max_queries=args.n_max_queries)
droplet_ids = {}
for sfo in seqfos:
    # NOTE(review): the value of --droplet-id-separator isn't passed to utils.get_droplet_id() here, so a non-default separator may not actually get used -- confirm
    did = utils.get_droplet_id(sfo['name'])
    if did not in droplet_ids:
        droplet_ids[did] = []
    droplet_ids[did].append(sfo['name'])

print '  read %d sequences with %d droplet ids' % (len(seqfos),
                                                   len(droplet_ids))
count_info = {}
for dlist in droplet_ids.values():