Example #1
0
def make_gls_tree_plot(args, region, plotdir, plotname, glsfnames, glslabels, locus, ref_label=None, title=None, title_color=None, legends=None, legend_title=None, pie_chart_faces=False, param_dirs=None):
    """Build the ./bin/plot-gl-set-trees.py command line and pass it to utils.run_ete_script().

    NOTE currently disabled: the first statement raises unconditionally, so nothing below it runs until the switch to
    utils.run_ete_script() has been tested.
    """
    raise Exception("needs to be tested for switch to utils.run_ete_script() (should work ok, I just don't want to run it now)")

    # collect the command as a list of pieces (each one carries its own leading space) and join once at the end
    pieces = [
        './bin/plot-gl-set-trees.py',
        ' --plotdir ' + plotdir,
        ' --plotname ' + plotname,
        ' --glsfnames ' + ':'.join(glsfnames),
        ' --glslabels ' + ':'.join(glslabels),
        ' --region ' + region,
    ]

    # optional decorations, only added if the caller set them
    if ref_label is not None:
        pieces.append(' --ref-label ' + ref_label)
    if title is not None:
        pieces.append(' --title="%s"' % title)
    if title_color is not None:
        pieces.append(' --title-color %s' % title_color)
    if legends is not None:
        pieces.append(' --legends=' + ':'.join(['"%s"' % lgd for lgd in legends]))
    if legend_title is not None:
        pieces.append(' --legend-title="%s"' % legend_title)
    if pie_chart_faces:
        pieces.append(' --pie-chart-faces')
    if param_dirs is not None:
        pieces.append(' --param-dirs %s' % ':'.join(param_dirs))

    pieces.append(' --locus ' + locus)

    # flags forwarded from the command line args
    if args.plotcache:
        pieces.append(' --use-cache')
    if args.only_print:
        pieces.append(' --only-print')

    utils.run_ete_script(''.join(pieces), args.ete_path, debug=args.dryrun, dryrun=args.dryrun, extra_str='        ')
Example #2
0
def run_bcr_phylo(naive_line, outdir, ievent, n_total_events, uid_str_len=None):
    """Run bcr-phylo's simulator.py for one event, or prepare the command so the caller can run it in a batch.

    Returns None if utils.output_exists() says the output fasta is already there, or if the command was run right here
    (args.n_procs == 1). Otherwise returns a dict with 'cmd_str', 'workdir' and 'outfname' describing the job.
    NOTE <n_total_events> isn't used in this function.
    """
    fasta_fname = bcr_phylo_fasta_fname(outdir)
    if utils.output_exists(args, fasta_fname, outlabel='bcr-phylo', offset=4):
        return None

    # the command is collected as a list of pieces (each with its own leading space) and joined once at the end
    pieces = ['%s/bin/simulator.py' % bcr_phylo_path]
    if args.run_help:
        pieces.append(' --help')
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        pieces.append(' --lambda %f --lambda0 %f' % (1.5, 0.365))
        pieces.append(' --n_final_seqs %d' % args.n_sim_seqs_per_generation)
    elif args.stype == 'selection':
        pieces += [
            ' --selection',
            ' --lambda %f' % args.branching_parameter,
            ' --lambda0 %f' % args.base_mutation_rate,
            ' --selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength),
            ' --obs_times %s' % ' '.join('%d' % get_vpar_val('obs-times', otime) for otime in args.obs_times),
            ' --n_to_sample %s' % ' '.join(['%d' % get_vpar_val('n-sim-seqs-per-generation', nseqs) for nseqs in args.n_sim_seqs_per_generation]),
            ' --metric_for_target_dist %s' % args.metric_for_target_distance,
        ]
        if args.paratope_positions is not None:
            pieces.append(' --paratope_positions %s' % args.paratope_positions)
        pieces += [
            ' --target_dist %d' % args.target_distance,
            ' --target_count %d' % args.target_count,
            ' --carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap),
        ]
        if not args.dont_observe_common_ancestors:
            pieces.append(' --observe_common_ancestors')
        if args.leaf_sampling_scheme is not None:
            pieces.append(' --leaf_sampling_scheme %s' % args.leaf_sampling_scheme)
        if args.n_target_clusters is not None:
            pieces.append(' --n_target_clusters %d' % args.n_target_clusters)
        # pieces.append(' --target_cluster_distance 1')
        if args.min_target_distance is not None:
            pieces.append(' --min_target_distance %d' % args.min_target_distance)
    else:
        assert False

    # options that apply whichever branch we took above
    pieces.append(' --debug %d' % args.debug)
    pieces.append(' --n_tries 1000')
    if args.context_depend == 0:
        pieces.append(' --no_context')
    pieces.append(' --no_plot')
    if args.only_csv_plots:
        pieces.append(' --dont_write_hists')
    pieces.append(' --outbase %s/%s' % (outdir, args.extrastr))
    pieces.append(' --random_seed %d' % (args.seed + ievent))  # offset the seed by the event index
    if uid_str_len is not None:
        pieces.append(' --uid_str_len %d' % uid_str_len)
    pieces.append(' --naive_seq %s' % naive_line['naive_seq'])
    cmd = ''.join(pieces)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if args.n_procs == 1:  # run it right now
        utils.run_ete_script(cmd, ete_path)  # NOTE kind of hard to add a --dry-run option, since we have to loop over the events we made in rearrange()
        return None

    # otherwise just get the full command string back, and let the caller run it
    cmd, _ = utils.run_ete_script(cmd, ete_path, return_for_cmdfos=True, tmpdir=outdir)
    return {'cmd_str' : cmd, 'workdir' : outdir, 'outfname' : fasta_fname}
Example #3
0
def run_bcr_phylo(naive_line, outdir, ievent, n_total_events):
    """Build the bcr-phylo simulator.py command line for one event and run it with utils.run_ete_script().

    Does nothing (returns None) if utils.output_exists() says the output fasta for <outdir> is already there.
    """
    if utils.output_exists(args, bcr_phylo_fasta_fname(outdir), outlabel='bcr-phylo', offset=4):
        return

    # gather the command as a list of pieces (each with its own leading space), joined once at the end
    pieces = ['%s/bin/simulator.py' % bcr_phylo_path]
    if args.run_help:
        pieces.append(' --help')
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        pieces.append(' --lambda %f --lambda0 %f' % (1.5, 0.365))
        pieces.append(' --n_final_seqs %d' % args.n_sim_seqs_per_generation)
    elif args.stype == 'selection':
        pieces += [
            ' --selection',
            ' --lambda %f' % args.branching_parameter,
            ' --lambda0 %f' % args.base_mutation_rate,
            ' --selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength),
            ' --obs_times %s' % ' '.join('%d' % get_vpar_val('obs-times', otime) for otime in args.obs_times),
            ' --n_to_sample %s' % ' '.join(['%d' % get_vpar_val('n-sim-seqs-per-generation', nseqs) for nseqs in args.n_sim_seqs_per_generation]),
            ' --metric_for_target_dist %s' % args.metric_for_target_distance,
            ' --target_dist %d' % args.target_distance,
            ' --target_count %d' % args.target_count,
            ' --carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap),
        ]
        if not args.dont_observe_common_ancestors:
            pieces.append(' --observe_common_ancestors')

        # pieces.append(' --n_target_clusters 1')
        # pieces.append(' --target_cluster_distance 1')

        # pieces.append(' --observe_based_on_affinity')  # implementation in bcr-phylo needs some work
    else:
        assert False

    # options that don't depend on the simulation type
    pieces += [
        ' --debug %d' % args.debug,
        ' --n_tries 30',
        ' --no_context',
        ' --no_plot',
        ' --outbase %s/%s' % (outdir, args.extrastr),
        ' --random_seed %d' % (args.seed + ievent),  # offset the seed by the event index
    ]
    if n_total_events > 1:  # if the final sample's going to contain many trees, it's worth making the uids longer so there's fewer collisions/duplicates
        pieces.append(' --uid_str_len 7')
    pieces.append(' --naive_seq %s' % naive_line['naive_seq'])

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    utils.run_ete_script(''.join(pieces), ete_path)  # NOTE kind of hard to add a --dry-run option, since we have to loop over the events we made in rearrange()
Example #4
0
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Turn the bcr-phylo output in <outdir> into a single multi-sequence annotation line.

    Every sequence in the output fasta gets its own copy of <naive_line> (with that sequence's uid and seq), and the
    copies are merged with utils.synthesize_multi_seq_line_from_reco_info(). If args.stype is 'selection', the kd
    values, lambdas and target indices from the kd csv are added, along with the newick tree (with edges rescaled by
    the mean sequence length). The target sequences are always added. Returns the merged line.
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    n_naive_ids = len(naive_line['unique_ids'])
    assert n_naive_ids == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    naive_indelfo = naive_line['indelfos'][0]
    assert not indelutils.has_indels(naive_indelfo)  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    # make a single-sequence line for each simulated sequence, starting from the naive line
    reco_info = collections.OrderedDict()
    uids = []
    for seqfo in seqfos:
        uid, seq = seqfo['name'], seqfo['seq']
        single_line = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(single_line)
        del single_line['tree']
        single_line['unique_ids'] = [uid]
        # it's really important to set both the seqs (since they're both already in there from the naive line)
        single_line['seqs'] = [seq]
        single_line['input_seqs'] = [seq]
        single_line['duplicates'] = [[]]
        reco_info[uid] = single_line
        utils.add_implicit_info(glfo, single_line)
        uids.append(uid)
    final_line = utils.synthesize_multi_seq_line_from_reco_info(uids, reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)

        # read the per-node info that the script just wrote
        nodefo = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            for row in csv.DictReader(kdfile):
                nodefo[row['uid']] = {
                    'kd': float(row['kd']),
                    'relative_kd': float(row['relative_kd']),
                    'lambda': row.get('lambda', None),
                    'target_index': int(row['target_index']),
                }

        # warn about uids that are only in one of the kd file and the line
        line_uids = final_line['unique_ids']
        only_in_kd_file = set(nodefo) - set(line_uids)
        if only_in_kd_file:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print('        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % only_in_kd_file)
        only_in_line = set(line_uids) - set(nodefo)
        if only_in_line:
            print('        in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        # per-sequence values, in the same order as the uids in the line (affinity is just the inverse of kd)
        per_seq_info = [nodefo[u] for u in line_uids]
        final_line['affinities'] = [1. / nfo['kd'] for nfo in per_seq_info]
        final_line['relative_affinities'] = [1. / nfo['relative_kd'] for nfo in per_seq_info]
        final_line['lambdas'] = [nfo['lambda'] for nfo in per_seq_info]
        final_line['nearest_target_indices'] = [nfo['target_index'] for nfo in per_seq_info]

        # read the tree, and divide its edge lengths by the mean sequence length
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        mean_seq_len = numpy.mean([len(s) for s in final_line['seqs']])
        tree.scale_edges(1. / mean_seq_len)
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_fname = '%s/%s_targets.fa' % (outdir, args.extrastr)
    final_line['target_seqs'] = [tfo['seq'] for tfo in utils.read_fastx(target_fname)]

    return final_line
Example #5
0
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Turn the bcr-phylo output in <outdir> into a single multi-sequence annotation line.

    Every sequence in the output fasta gets its own copy of <naive_line> (with that sequence's uid and seq), and the
    copies are merged with utils.synthesize_multi_seq_line_from_reco_info(). If args.stype is 'selection', the kd
    values, lambdas and target indices from the kd csv are added, along with the newick tree (with edges rescaled by
    the mean sequence length); the kd/newick files are made by ./bin/read-bcr-phylo-trees.py unless
    utils.output_exists() says the kd file is already there. The target sequences are always added.

    <ievent> is used in the failure message below and passed to set_reco_id() as <irandom>.
    Returns the merged line.
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    # make a single-sequence line for each simulated sequence, starting from the naive line
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['seqs'] = [sfo['seq']]
        mline['input_seqs'] = [sfo['seq']]
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except Exception:  # only catch real errors (a bare except would also swallow KeyboardInterrupt/SystemExit and keep looping). TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
            lines = traceback.format_exception(*sys.exc_info())
            print(utils.pad_lines(''.join(lines)))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd and nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)

        # read per-node info from the kd file
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),  # left as whatever the csv reader gives us (None if there's no such column)
                    'target_index' : int(line['target_index']),
                }

        # warn about uids that are only in one of the kd file and the line
        only_in_kd_file = set(nodefo) - set(final_line['unique_ids'])
        if len(only_in_kd_file) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print('        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % only_in_kd_file)
        only_in_line = set(final_line['unique_ids']) - set(nodefo)
        if len(only_in_line) > 0:  # NOTE the lookups just below will raise KeyError for these uids
            print('        in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        # per-sequence values, in the same order as the uids in the line (affinity is just the inverse of kd)
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]

        # read the tree, and divide its edge lengths by the mean sequence length
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
Example #6
0
def parse_bcr_phylo_output(glfos, naive_events, outdir, ievent):
    """Convert the bcr-phylo output in <outdir> for event <ievent> into mature annotation line(s).

    Reads the kd csv and newick tree (first running ./bin/read-bcr-phylo-trees.py to make them, unless
    utils.output_exists() says the kd file is already there), plus the simulated and target sequence fasta files.

    If args.paired_loci is set, naive_events[ievent] is treated as a pair of naive lines, each simulated sequence is
    split into two pieces, and a list with one mature line per naive line is returned. Otherwise naive_events[ievent]
    is treated as a single naive line, glfos[0] is used, and a single mature line is returned.
    """
    # ----------------------------------------------------------------------------------------
    def split_seqfos(seqfos):
        """Split each sequence in <seqfos> into its two per-locus pieces, returning the two lists of seqfos.

        The first piece is the first len(<first naive seq>) characters, the second piece is everything after the
        length of the first naive seq once padded with utils.pad_nuc_seq() (so any padding is dropped).
        """
        hline, lline = naive_events[ievent]  # presumably heavy and light chain lines, going by the names
        # these only depend on the naive lines, so work them out once rather than for every sequence
        h_len = len(hline['naive_seq'])
        padded_h_len = len(utils.pad_nuc_seq(hline['naive_seq']))
        expected_len = padded_h_len + len(lline['naive_seq'])
        hseqfos, lseqfos = [], []
        for sfo in seqfos:
            assert len(sfo['seq']) == expected_len
            hseqfos.append({'name': sfo['name'], 'seq': sfo['seq'][:h_len]})
            lseqfos.append({'name': sfo['name'], 'seq': sfo['seq'][padded_h_len:]})
        return hseqfos, lseqfos

    # ----------------------------------------------------------------------------------------
    def read_kdvals(kdfname):
        """Read the csv <kdfname>, returning a dict from uid to that row's kd, relative_kd, lambda and target_index."""
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),  # left as whatever the csv reader gives us (None if there's no such column)
                    'target_index': int(line['target_index']),
                }
        return nodefo

    # ----------------------------------------------------------------------------------------
    def get_mature_line(sfos, naive_line, glfo, nodefo, dtree, target_sfos, locus=None):
        """Return a multi-sequence line made from <naive_line> and the sequences in <sfos>.

        Adds affinities, lambdas and target indices from <nodefo>, a rescaled copy of <dtree> (as a newick string), and
        the sequences from <target_sfos>. If <locus> is set, '-<locus>' is appended to each uid (in the line, the kd
        info and the tree), and the paired uids are updated to match.
        """
        assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
        assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
        if args.debug:
            utils.print_reco_event(naive_line)

        # make a single-sequence line for each simulated sequence, starting from the naive line
        reco_info = collections.OrderedDict()
        for sfo in sfos:
            mline = utils.get_non_implicit_copy(naive_line)
            del mline['tree']
            mline['unique_ids'] = [sfo['name']]
            # it's really important to set both the seqs (since they're both already in there from the naive line)
            mline['seqs'] = [sfo['seq']]
            mline['input_seqs'] = [sfo['seq']]
            mline['duplicates'] = [[]]
            reco_info[sfo['name']] = mline
            try:
                utils.add_implicit_info(glfo, mline)
            except Exception:  # only catch real errors (a bare except would also swallow KeyboardInterrupt/SystemExit and keep looping). TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
                print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
                lines = traceback.format_exception(*sys.exc_info())
                print(utils.pad_lines(''.join(lines)))  # NOTE this will still crash on the next line if implicit info adding failed
        final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in sfos], reco_info)

        ftree = copy.deepcopy(dtree)  # the same <dtree> gets passed in for each locus, so don't modify it
        if locus is not None:

            def ltr(u):
                return u + '-' + locus

            # add the locus to the uids in the kd info, the tree, and the line
            new_nodefo = {}
            for u_old in nodefo:
                new_nodefo[ltr(u_old)] = nodefo[u_old]
            nodefo = new_nodefo
            treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']])
            final_line['unique_ids'] = [ltr(u) for u in final_line['unique_ids']]

            # the paired uids still start with the naive uid's base string, so swap in each sequence's own name
            assert len(sfos) == len(final_line['unique_ids'])
            for iseq, sfo in enumerate(sfos):
                naive_id = naive_line['unique_ids'][0]
                assert naive_id.count('-') == 1
                bstr = naive_id.replace('-' + locus, '')
                pids = final_line['paired-uids'][iseq]
                assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci  # if uid is xxx-igh, paired id should be e.g. xxx-igk
                final_line['paired-uids'][iseq] = [p.replace(bstr, sfo['name']) for p in pids]

        if args.debug:
            utils.print_reco_event(final_line)

        # warn about uids that are only in one of the kd file and the line
        only_in_kd_file = set(nodefo) - set(final_line['unique_ids'])
        if len(only_in_kd_file) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print('        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % only_in_kd_file)
        only_in_line = set(final_line['unique_ids']) - set(nodefo)
        if len(only_in_line) > 0:  # NOTE the lookups just below will raise KeyError for these uids
            print('        in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        # per-sequence values, in the same order as the uids in the line (affinity is just the inverse of kd)
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]

        # divide the tree's edge lengths by the mean sequence length
        ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12))
        final_line['tree'] = ftree.as_string(schema='newick')

        tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything
        final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]
        return final_line

    # ----------------------------------------------------------------------------------------
    assert args.stype == 'selection'  # i don't know that non-'selection' is possible or has any point at this point (can just set selection strength to zero)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
    if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd and nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
        utils.run_ete_script(cmd, ete_path, debug=args.n_procs == 1)
    nodefo = read_kdvals(kdfname)
    dtree = treeutils.get_dendro_tree(treefname=nwkfname)
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))

    if args.paired_loci:
        mevents = []
        # NOTE(review): <tsfos> (the per-locus target seqfos) isn't used -- the unsplit <target_seqfos> is what gets passed
        # on, so each line's 'target_seqs' holds the full joint target sequences. Looks like it may have been meant to
        # be <tsfos>, but confirm against whatever reads 'target_seqs' before changing it.
        for tline, sfos, tsfos in zip(naive_events[ievent], split_seqfos(seqfos), split_seqfos(target_seqfos)):
            locus = tline['loci'][0]
            mevents.append(get_mature_line(sfos, tline, glfos[locus], nodefo, dtree, target_seqfos, locus=locus))
        return mevents
    else:
        return get_mature_line(seqfos, naive_events[ievent], glfos[0], nodefo, dtree, target_seqfos)