Esempio n. 1
0
    def read_input_tree_file(self, outfname):
        """Read newick trees (one per line) from self.args.input_simulation_treefname, then relabel and rescale them.

        The input file is first copied to <outfname>, and the trees are read from that copy.
        Returns (ages, treestrs): the depth chosen for each tree, and the rescaled newick string for each tree.
        Raises Exception if any chosen depth is greater than 1.
        Side effect: sets self.args.n_trees to the number of trees that were read.
        """
        infname = self.args.input_simulation_treefname
        if self.args.debug:
            print('  reading trees from %s' % infname)
        utils.simplerun('cp %s %s' % (infname, outfname), debug=False)

        ages, treestrs = [], []
        with open(outfname) as treefile:
            for tstr in (rawline.strip() for rawline in treefile):
                if not tstr:  # nothing on this line
                    continue
                dtree = treeutils.get_dendro_tree(treestr=tstr, suppress_internal_node_taxa=True)
                root = dtree.seed_node
                if root.edge_length is None:  # the root edge length has to be set (otherwise bppseqgen barfs)
                    root.edge_length = 0.
                # rename the leaves to t1, t2, etc. (a bunch of stuff in recombinator uses these names to check that e.g. bppseqgen didn't screw up the ordering)
                relabels = [(leaf.taxon.label, 't%d' % ileaf)
                            for ileaf, leaf in enumerate(dtree.leaf_node_iter(), 1)]
                treeutils.translate_labels(dtree, relabels)
                age = self.choose_full_sequence_branch_length()
                if self.args.debug > 1:  # has to be printed before rescaling, since it needs the original depth
                    old_depth = treeutils.get_mean_leaf_height(tree=dtree)
                    print('    input tree %d (rescaled depth %.3f --> %.3f):' % (len(ages), old_depth, age))
                # rescale now, since things start breaking if the input depth is greater than one
                treeutils.rescale_tree(age, dtree=dtree)
                ages.append(age)
                treestrs.append(dtree.as_string(schema='newick').strip())
                if self.args.debug > 1:
                    print(utils.pad_lines(treeutils.get_ascii_tree(dtree)))

        if any(depth > 1. for depth in ages):
            raise Exception(
                "tree depths must be less than 1., but trees read from %s don't satisfy this: %s"
                % (self.args.input_simulation_treefname, ages))

        n_read = len(ages)
        if n_read != self.args.n_trees:
            print('    resetting --n-trees from %d to %d to match trees read from %s' % (
                self.args.n_trees, n_read, self.args.input_simulation_treefname))
        self.args.n_trees = n_read

        return ages, treestrs
Esempio n. 2
0
    def generate_trees(self, seed, outfname, workdir):
        """Generate trees (or read them from a file) and write them to <outfname>, one per line.

        Each output line is the newick string followed by the per-region branch length ratios,
        e.g. (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87
        so the output file is not valid newick.
        Raises Exception if a tree string doesn't contain exactly one ';', at its end.
        """
        if self.args.input_simulation_treefname is not None:  # read trees from the file that was specified in the args
            ages, treestrs = self.read_input_tree_file(outfname)
        else:  # default: generate our own trees
            ages, treestrs = self.run_treesim(seed, outfname, workdir)
        os.remove(outfname)  # removed here to make it clear that the file is *re*written below, so that recombinator can later read it

        if self.args.debug:
            depths = [treeutils.get_mean_leaf_height(treestr=ts) for ts in treestrs]
            leaf_counts = [
                treeutils.get_n_leaves(treeutils.get_dendro_tree(treestr=ts, suppress_internal_node_taxa=True))
                for ts in treestrs
            ]
            print('    mean over %d trees:   depth %.5f   leaves %.2f' % (
                len(depths), numpy.mean(depths), numpy.mean(leaf_counts)))

        # The branch lengths in each tree are means over the whole sequence, so the ratio of each region's mean to the
        # overall mean is tacked onto the right of each newick string.
        region_ratios = []
        for region in utils.regions:
            ratio = self.branch_lengths[region]['mean'] / self.branch_lengths['all']['mean']
            region_ratios.append('%s:%f' % (region, ratio))
        ratio_str = ','.join(region_ratios)

        for itree, _ in enumerate(ages):
            newick = treestrs[itree]
            if not (newick.count(';') == 1 and newick.endswith(';')):
                raise Exception('malformatted newick string:\n  %s' % newick)
            treestrs[itree] = newick + ratio_str  # the single ';' is the last character, so this goes right after it

        # then write the modified lines for recombinator to read
        with open(outfname, 'w') as treefile:
            treefile.writelines('%s\n' % ts for ts in treestrs)
Esempio n. 3
0
    def generate_trees(self, seed, outfname, workdir):
        """Generate trees (or read them from a file) and write them to <outfname>.

        The output format is chosen by the suffix of <outfname>:
          .yaml: a dict with keys 'branch-length-ratios' (per-region mean / overall mean) and 'trees' (newick strings)
          .nwk:  the newick strings, one per line
        Raises Exception for any other suffix. This is checked before any work is done, so an unsupported
        suffix can no longer leave an empty output file behind.
        """
        suffix = utils.getsuffix(outfname)
        if suffix not in ('.yaml', '.nwk'):
            raise Exception('unhandled tree output file suffix \'%s\' for %s (expected .yaml or .nwk)' % (suffix, outfname))

        if self.args.input_simulation_treefname is None:  # default: generate our own trees
            ages, treestrs = self.run_treesim(seed, outfname, workdir)
        else:  # read trees from the file that was specified in the args
            ages, treestrs = self.read_input_tree_file(outfname)
        os.remove(outfname)  # removed here to make it clear that the file is *re*written below, so that recombinator can later read it

        if self.args.debug or suffix == '.nwk':
            dtreelist = [
                treeutils.get_dendro_tree(treestr=tstr, suppress_internal_node_taxa=True)
                for tstr in treestrs
            ]
            mean_leaf_height_list = [treeutils.get_mean_leaf_height(tree=dt) for dt in dtreelist]
            n_leaf_list = [treeutils.get_n_leaves(dt) for dt in dtreelist]
            print('    mean over %d trees:   depth %.5f   leaves %.2f' % (
                len(mean_leaf_height_list), numpy.mean(mean_leaf_height_list),
                numpy.mean(n_leaf_list)))

        # each tree is written with branch length the mean branch length over the whole sequence (which is different for each tree), but recombinator also needs the relative length for each region (which is the same, it's an average over the whole repertoire)
        with open(outfname, 'w') as yfile:
            if suffix == '.yaml':
                yamlfo = {
                    'branch-length-ratios': {
                        r: self.branch_lengths[r]['mean'] / self.branch_lengths['all']['mean']
                        for r in utils.regions
                    },
                    'trees': treestrs,
                }
                json.dump(yamlfo, yfile)  # NOTE the '.yaml' file is written with the json module
            else:  # '.nwk' (any other suffix was rejected above)
                print('    writing trees to %s' % outfname)
                for treestr in treestrs:
                    yfile.write(treestr + '\n')
0
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Combine the bcr-phylo output in <outdir> with <naive_line> into one multi-sequence annotation, and return it.

    Each sequence in the bcr-phylo fasta gets a copy of <naive_line> with its own uid and sequence, and these are then
    synthesized into a single line. If args.stype is 'selection', kd values and the tree are also read (after
    converting them with a separate script) and added to the line. The target sequences are always added.
    <naive_line> has to have exactly one sequence, with no indels.
    """
    mutated_seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    reco_info = collections.OrderedDict()
    for seqfo in mutated_seqfos:
        uid, mature_seq = seqfo['name'], seqfo['seq']
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [uid]
        # it's really important to set both 'seqs' and 'input_seqs', since both are already in there from the naive line
        mline['seqs'] = [mature_seq]
        mline['input_seqs'] = [mature_seq]
        mline['duplicates'] = [[]]
        reco_info[uid] = mline
        utils.add_implicit_info(glfo, mline)

    all_uids = [seqfo['name'] for seqfo in mutated_seqfos]
    final_line = utils.synthesize_multi_seq_line_from_reco_info(all_uids, reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)

        with open('%s/kd-vals.csv' % outdir) as kdfile:
            nodefo = {
                row['uid']: {
                    'kd': float(row['kd']),
                    'relative_kd': float(row['relative_kd']),
                    'lambda': row.get('lambda', None),
                    'target_index': int(row['target_index']),
                }
                for row in csv.DictReader(kdfile)
            }

        kd_uids, line_uids = set(nodefo), set(final_line['unique_ids'])
        only_in_kdfile = kd_uids - line_uids  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
        if only_in_kdfile:
            print('        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % only_in_kdfile)
        only_in_line = line_uids - kd_uids
        if only_in_line:
            print('        in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        final_line['affinities'] = [1. / nodefo[uid]['kd'] for uid in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[uid]['relative_kd'] for uid in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[uid]['lambda'] for uid in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[uid]['target_index'] for uid in final_line['unique_ids']]

        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        mean_seq_len = numpy.mean([len(seq) for seq in final_line['seqs']])
        tree.scale_edges(1. / mean_seq_len)  # divide the edge lengths by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [targetfo['seq'] for targetfo in target_seqfos]

    return final_line
Esempio n. 5
0
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Combine the bcr-phylo output in <outdir> with <naive_line> into one multi-sequence annotation, and return it.

    Each sequence in the bcr-phylo fasta gets a copy of <naive_line> with its own uid and sequence, and these are then
    synthesized into a single line. If args.stype is 'selection', kd values and the tree are also read (after
    converting them with a separate script) and added to the line. The target sequences are always added, along with
    the index of the target nearest (in amino acid hamming distance) to each sequence.
    <naive_line> has to have exactly one sequence, with no indels.
    """
    fasta_fname = '%s/%s.fasta' % (outdir, args.extrastr)
    mutated_seqfos = utils.read_fastx(fasta_fname)  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    reco_info = collections.OrderedDict()
    for seqfo in mutated_seqfos:
        uid, mature_seq = seqfo['name'], seqfo['seq']
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [uid]
        # it's really important to set both 'seqs' and 'input_seqs', since both are already in there from the naive line
        mline['seqs'] = [mature_seq]
        mline['input_seqs'] = [mature_seq]
        reco_info[uid] = mline
        utils.add_implicit_info(glfo, mline)

    all_uids = [seqfo['name'] for seqfo in mutated_seqfos]
    final_line = utils.synthesize_multi_seq_line_from_reco_info(all_uids, reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (
            ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)

        with open('%s/kd-vals.csv' % outdir) as kdfile:
            kdvals = {row['uid']: float(row['kd']) for row in csv.DictReader(kdfile)}

        kd_uids, line_uids = set(kdvals), set(final_line['unique_ids'])
        only_in_kdfile = kd_uids - line_uids  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
        if only_in_kdfile:
            print('        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % only_in_kdfile)
        only_in_line = line_uids - kd_uids
        if only_in_line:
            print('        in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        final_line['affinities'] = [1. / kdvals[uid] for uid in final_line['unique_ids']]

        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [targetfo['seq'] for targetfo in target_seqfos]

    # for each sequence, find the index of the target with the smallest amino acid hamming distance
    from Bio.Seq import Seq
    aa_targets = [Seq(target).translate() for target in final_line['target_seqs']]
    nearest_indices = []
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        distances = [utils.hamming_distance(aa_target, aa_mseq, amino_acid=True) for aa_target in aa_targets]
        nearest_indices.append(distances.index(min(distances)))  # NOTE takes the first one if there's more than one min
    final_line['nearest_target_indices'] = nearest_indices

    return final_line
Esempio n. 6
0
import utils
import treeutils

class MultiplyInheritedFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter):
    """Help formatter that inherits from both argparse.RawTextHelpFormatter and argparse.ArgumentDefaultsHelpFormatter."""
# Command line: read a newick tree file (and optionally a yaml meta info file), calculate lb values for the tree, and write them to a yaml file.
formatter_class = MultiplyInheritedFormatter
parser = argparse.ArgumentParser(formatter_class=MultiplyInheritedFormatter)
parser.add_argument('infname', help='input newick tree file')
parser.add_argument('outfname', help='output yaml file')
parser.add_argument('--input-metafname', help='optional input yaml meta file with multiplicity information. Example: yaml.dump({n.taxon.label : {\'multiplicity\' : 1} for n in dtree.postorder_node_iter()}, tfile, width=150)')
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()


if not os.path.exists(args.infname):
    raise Exception('infname %s does not exist' % args.infname)
# get_dendro_tree() is passed the file name rather than a file object, so the file is not opened here
dtree = treeutils.get_dendro_tree(treefname=args.infname, debug=args.debug)

input_metafo = None
if args.input_metafname is not None:
    if not os.path.exists(args.input_metafname):
        raise Exception('--input-metafname %s does not exist' % args.input_metafname)
    with open(args.input_metafname) as imfile:
        # NOTE(review): yaml.Loader can construct arbitrary python objects, so this should only be run on trusted meta files (consider yaml.SafeLoader if plain data is all that's needed)
        input_metafo = yaml.load(imfile, Loader=yaml.Loader)

# multiplicities are only used if a meta file was specified
lbvals = treeutils.calculate_lb_values(dtree, input_metafo=input_metafo, use_multiplicities=(input_metafo is not None), debug=args.debug)

with open(args.outfname, 'w') as outfile:
    yaml.dump(lbvals, outfile, width=200)
Esempio n. 7
0
 def set_tree(self, treestr):
     """Pass the string <treestr> to treeutils.get_dendro_tree() and store the result in self.tree."""
     dtree = treeutils.get_dendro_tree(treestr=treestr)
     self.tree = dtree
Esempio n. 8
0
    def add_mutants(self, reco_event, irandom):
        """Choose a tree at random from self.treeinfo and use it to simulate mutated sequences for <reco_event>.

        Each region (v, d, j) is simulated separately with bppseqgen on a copy of the chosen tree rescaled for that
        region, and the results are then concatenated into reco_event.final_seqs. If the mutation multiplier is 0, the
        unmutated recombined sequence is instead added as the only final sequence.
        <irandom> is passed as the seed when preparing bppseqgen, and to reco_event.setline().
        """
        mute_mult = self.args.mutation_multiplier
        if mute_mult is not None and mute_mult == 0.:  # some of the stuff below fails if mut mult is actually 0.
            reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequnce in reco_event
            reco_event.indelfos = [indelutils.get_empty_indel() for _ in reco_event.final_seqs]
            return

        # When generating trees, each tree's number of leaves and total depth are chosen from the specified distributions (a.t.m., by default n-leaves is from a geometric/zipf, and depth is from data)
        # This chosen depth corresponds to the sequence-wide mutation frequency.
        # In order to account for varying mutation rates in v, d, and j we simulate these regions separately, by appropriately rescaling the tree for each region.
        # i.e.: here we get the sequence-wide mute freq from the tree, and rescale it by the repertoire-wide ratios from data (which are stored in the tree file).
        # Each entry looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
        # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the same for all the trees in the file, they're just passed from TreeGenerator to here at the end of each line in the file
        treefostr = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]
        assert treefostr.count(';') == 1
        newick_body, semicolon, ratio_str = treefostr.partition(';')  # the per-region info is everything after the semi-colon
        chosen_tree = newick_body + semicolon  # includes semi-colon
        reco_event.set_tree(chosen_tree)  # leaf names are still just like t<n>

        mean_total_height = treeutils.get_mean_leaf_height(treestr=chosen_tree)
        regional_heights = {}  # per-region height, including <self.args.mutation_multiplier>
        for region_info in ratio_str.split(','):
            region, ratio = region_info.split(':')
            assert region in utils.regions
            ratio = float(ratio)
            if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
                ratio *= self.args.mutation_multiplier
            regional_heights[region] = mean_total_height * ratio

        scaled_trees = {}
        for region in utils.regions:
            scaled_trees[region] = treeutils.rescale_tree(regional_heights[region], treestr=chosen_tree)

        if self.args.debug:
            print('  chose tree with total height %f' % treeutils.get_mean_leaf_height(treestr=chosen_tree))
            height_strs = [
                '%s %.3f  (expected %.3f)' % (region, treeutils.get_mean_leaf_height(treestr=scaled_trees[region]), regional_heights[region])
                for region in utils.regions
            ]
            print('    regional trees rescaled to heights:  %s' % ('   '.join(height_strs)))

        n_leaves = treeutils.get_n_leaves(treeutils.get_dendro_tree(treestr=chosen_tree, schema='newick'))
        cmdfos = []  # one entry per region, in the same order as <utils.regions>
        regional_naive_seqs = {}  # only used for tree checking
        for region in utils.regions:
            naive_seq = reco_event.eroded_seqs[region]
            if region == 'd':  # the insertions are simulated together with d
                naive_seq = reco_event.insertions['vd'] + naive_seq + reco_event.insertions['dj']
            regional_naive_seqs[region] = naive_seq
            cmdfos.append(self.prepare_bppseqgen(naive_seq, scaled_trees[region], n_leaves, reco_event.genes[region], reco_event, seed=irandom))

        utils.run_cmds([cfo for cfo in cmdfos if cfo is not None], sleep=False)  # shenanigan is to handle zero-length regional seqs

        mseqs = {}
        for region, cfo in zip(utils.regions, cmdfos):  # NOTE relies on <cmdfos> having been filled in the same order as <utils.regions> in the loop immediately above
            if cfo is None:
                mseqs[region] = ['' for _ in range(n_leaves)]  # return an empty string for each leaf node
                continue
            tmp_names, tmp_seqs = self.read_bppseqgen_output(cfo, n_leaves)
            if reco_event.leaf_names is None:
                reco_event.leaf_names = tmp_names
            assert reco_event.leaf_names == tmp_names  # enforce different regions having same name + ordering (although this is already enforced when reading bppseqgen output)
            mseqs[region] = tmp_seqs

        assert len(reco_event.final_seqs) == 0

        for iseq in range(n_leaves):
            full_seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]
            full_seq = reco_event.revert_conserved_codons(full_seq, debug=self.args.debug)  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
            reco_event.final_seqs.append(full_seq)  # set final sequnce in reco_event
            reco_event.final_codon_positions.append(copy.deepcopy(reco_event.post_erosion_codon_positions))  # separate codon positions for each sequence, because of shm indels

        self.add_shm_indels(reco_event)
        reco_event.setline(irandom)  # set the line here because we use it when checking tree simulation, and want to make sure the uids are always set at the same point in the workflow
        # self.check_tree_simulation(mean_total_height, regional_heights, chosen_tree, scaled_trees, regional_naive_seqs, mseqs, reco_event)
        # self.print_validation_values()

        if self.args.debug:
            print('    tree passed to bppseqgen:')
            print(treeutils.get_ascii_tree(dendro_tree=reco_event.tree, extra_str='      '))
            utils.print_reco_event(reco_event.line, extra_str='    ')
Esempio n. 9
0
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Combine the bcr-phylo output in <outdir> with <naive_line> into one multi-sequence annotation, and return it.

    Each sequence in the bcr-phylo fasta gets a copy of <naive_line> with its own uid and sequence, and these are then
    synthesized into a single line. If args.stype is 'selection', kd values and the tree are also read and added to the
    line (running the conversion script first if its output doesn't already exist). The target sequences are always added.
    <naive_line> has to have exactly one sequence, with no indels.
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        # it's really important to set both 'seqs' and 'input_seqs', since both are already in there from the naive line
        mline['seqs'] = [sfo['seq']]
        mline['input_seqs'] = [sfo['seq']]
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except Exception:  # deliberately best-effort, in order to report which event failed (it was crashing on unequal naive/mature sequence lengths, because something crashed in the middle of writing a .fa file). Not a bare except, so that KeyboardInterrupt and SystemExit still propagate
            print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
            print(utils.pad_lines(traceback.format_exc()))  # NOTE this will still crash on the next line if implicit info adding failed
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd an nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print('        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids'])))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print('        in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo)))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # divide the edge lengths by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
Esempio n. 10
0
def parse_bcr_phylo_output(glfos, naive_events, outdir, ievent):
    """Convert the bcr-phylo output files in <outdir> into annotation line(s) for event <ievent>.

    Reads the kd values, newick tree, mutated sequences, and target sequences
    from <outdir> (first running the kd/nwk conversion script if its output
    isn't already there), then builds one multi-sequence annotation per locus.

    Args:
        glfos: germline info; indexed by locus name if args.paired_loci is set,
            otherwise only element 0 is used.
        naive_events: naive annotations, one entry per event; entry <ievent> is
            a (heavy, light) pair of lines if args.paired_loci is set,
            otherwise a single line.
        outdir: directory holding the bcr-phylo output for this event.
        ievent: index of the event to parse.

    Returns:
        A list with one annotation per locus if args.paired_loci is set,
        otherwise a single annotation.
    """
    # ----------------------------------------------------------------------------------------
    def split_seqfos(seqfos):
        """Split each joint heavy+light seqfo in <seqfos> into a heavy and a light seqfo (returned as two lists)."""
        hline, lline = naive_events[ievent]
        # none of these lengths depend on the individual sequence, so work them out once
        hlen = len(hline['naive_seq'])
        padlen = len(utils.pad_nuc_seq(hline['naive_seq']))  # the light chain starts after the heavy naive seq as returned by pad_nuc_seq()
        joint_len = padlen + len(lline['naive_seq'])
        hseqfos, lseqfos = [], []
        for sfo in seqfos:
            assert len(sfo['seq']) == joint_len
            hseqfos.append({'name': sfo['name'], 'seq': sfo['seq'][:hlen]})
            lseqfos.append({'name': sfo['name'], 'seq': sfo['seq'][padlen:]})
        return hseqfos, lseqfos

    # ----------------------------------------------------------------------------------------
    def read_kdvals(kdfname):
        """Read the csv <kdfname> into a dict of per-node info, keyed by uid."""
        nodefo = {}
        with open(kdfile_path_check(kdfname) if False else kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),  # optional column, left unconverted (string from the csv, or None if absent)
                    'target_index': int(line['target_index']),
                }
        return nodefo

    # ----------------------------------------------------------------------------------------
    def get_mature_line(sfos,
                        naive_line,
                        glfo,
                        nodefo,
                        dtree,
                        target_sfos,
                        locus=None):
        """Build the multi-sequence annotation for the mutated sequences <sfos> descended from <naive_line>.

        Args:
            sfos: seqfos (dicts with 'name' and 'seq') of the mutated sequences.
            naive_line: single-sequence, indel-free naive annotation to copy for each sequence.
            glfo: germline info passed to utils.add_implicit_info().
            nodefo: per-uid info as returned by read_kdvals().
            dtree: tree for the event; it is deep-copied, so the caller's tree is not modified.
            target_sfos: seqfos whose 'seq' values are stored in 'target_seqs'.
            locus: if set, '-<locus>' is appended to all uids (in the line, the
                nodefo keys, and the tree) and the paired uids are updated.

        Returns:
            The synthesized annotation, with affinity, tree, and target seq info added.
        """
        assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
        assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
        if args.debug:
            utils.print_reco_event(naive_line)
        reco_info = collections.OrderedDict()
        for sfo in sfos:
            mline = utils.get_non_implicit_copy(naive_line)
            del mline['tree']
            mline['unique_ids'] = [sfo['name']]
            mline['seqs'] = [sfo['seq']]
            mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
            mline['duplicates'] = [[]]
            reco_info[sfo['name']] = mline
            try:
                utils.add_implicit_info(glfo, mline)
            except Exception:  # only here to report which event failed; deliberately not a bare except, so ctrl-c and sys.exit() still propagate
                print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
                print(utils.pad_lines(traceback.format_exc()))  # NOTE this will still crash on the next line if implicit info adding failed
        final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in sfos], reco_info)

        ftree = copy.deepcopy(dtree)
        if locus is not None:

            def ltr(u):
                return u + '-' + locus

            nodefo = dict((ltr(u_old), nfo) for u_old, nfo in nodefo.items())  # rebinds the local name only, the caller's dict is untouched
            treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']])
            final_line['unique_ids'] = [ltr(u) for u in final_line['unique_ids']]
            assert len(sfos) == len(final_line['unique_ids'])
            # the naive id (and thus the base string) is the same for every sequence, so get it once
            naive_id = naive_line['unique_ids'][0]
            assert naive_id.count('-') == 1
            bstr = naive_id.replace('-' + locus, '')
            for iseq, sfo in enumerate(sfos):
                pids = final_line['paired-uids'][iseq]
                assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci  # if uid is xxx-igh, paired id shoud be e.g. xxx-igk
                final_line['paired-uids'][iseq] = [p.replace(bstr, sfo['name']) for p in pids]

        if args.debug:
            utils.print_reco_event(final_line)

        # compare the uids in the kd file to those in the annotation
        kd_uids, line_uids = set(nodefo), set(final_line['unique_ids'])
        only_in_kd = kd_uids - line_uids
        if len(only_in_kd) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print('        in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (only_in_kd,))
        only_in_line = line_uids - kd_uids
        if len(only_in_line) > 0:  # NOTE only a warning here, but the lookups just below will raise KeyError for these uids
            print('        in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # divide edge lengths by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12))
        final_line['tree'] = ftree.as_string(schema='newick')

        tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything
        final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]
        return final_line

    # ----------------------------------------------------------------------------------------
    assert args.stype == 'selection'  # i don't know that non-'selection' is possible or has any point at this point (can just set selection strength to zero)
    kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
    # extract kd values and newick tree from the pickle file (use a separate script since it requires ete/anaconda to read)
    if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd and nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
        utils.run_ete_script(cmd, ete_path, debug=args.n_procs == 1)
    nodefo = read_kdvals(kdfname)
    dtree = treeutils.get_dendro_tree(treefname=nwkfname)
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    if not args.paired_loci:
        return get_mature_line(seqfos, naive_events[ievent], glfos[0], nodefo, dtree, target_seqfos)

    mevents = []
    for tline, sfos, tsfos in zip(naive_events[ievent], split_seqfos(seqfos), split_seqfos(target_seqfos)):
        locus = tline['loci'][0]
        # pass the per-locus target seqs <tsfos>, not the joint h+l <target_seqfos>, so 'target_seqs' matches this locus's sequences
        mevents.append(get_mature_line(sfos, tline, glfos[locus], nodefo, dtree, tsfos, locus=locus))
    return mevents