def read_input_tree_file(self, outfname):
    """Read user-supplied simulation trees, relabel their leaves, and rescale each to a chosen depth.

    Copies self.args.input_simulation_treefname to <outfname>, then for each (non-empty) newick line:
    parses the tree, renames leaves to t1, t2, ..., draws a depth from
    self.choose_full_sequence_branch_length(), and rescales the tree to that depth.

    Returns:
        (ages, treestrs): list of chosen depths and the corresponding rescaled newick strings.
    Raises:
        Exception: if any chosen depth exceeds 1.
    Side effects: may overwrite self.args.n_trees to match the number of trees actually read.
    """
    if self.args.debug: print ' reading trees from %s' % self.args.input_simulation_treefname
    utils.simplerun('cp %s %s' % (self.args.input_simulation_treefname, outfname), debug=False)  # copy so downstream code can treat <outfname> as the canonical tree file
    ages, treestrs = [], []
    with open(outfname) as treefile:
        for line in treefile:
            tstr = line.strip()
            if tstr == '':  # skip empty lines
                continue
            dtree = treeutils.get_dendro_tree(treestr=tstr, suppress_internal_node_taxa=True)
            if dtree.seed_node.edge_length is None:  # make sure root edge length is set (otherwise bppseqgen barfs)
                dtree.seed_node.edge_length = 0.
            old_new_label_pairs = [(l.taxon.label, 't%d' % (i + 1)) for i, l in enumerate(dtree.leaf_node_iter())]
            treeutils.translate_labels(dtree, old_new_label_pairs)  # rename the leaves to t1, t2, etc. (it would be nice to not have to do this, but a bunch of stuff in recombinator uses this to check that e.g. bppseqgen didn't screw up the ordering)
            age = self.choose_full_sequence_branch_length()
            if self.args.debug > 1:  # it's easier to keep this debug line separate up here than make a tmp variable to keep track of the old height
                print ' input tree %d (rescaled depth %.3f --> %.3f):' % (len(ages), treeutils.get_mean_leaf_height(tree=dtree), age)
            treeutils.rescale_tree(age, dtree=dtree)  # I think this gets rescaled again for each event, so we could probably in principle avoid this rescaling, but if the input depth is greater than one stuff starts breaking, so may as well do it now
            ages.append(age)
            treestrs.append(dtree.as_string(schema='newick').strip())
            if self.args.debug > 1:
                print utils.pad_lines(treeutils.get_ascii_tree(dtree))
    if any(a > 1. for a in ages):
        raise Exception('tree depths must be less than 1., but trees read from %s don\'t satisfy this: %s' % (self.args.input_simulation_treefname, ages))
    if len(ages) != self.args.n_trees:
        print ' resetting --n-trees from %d to %d to match trees read from %s' % (self.args.n_trees, len(ages), self.args.input_simulation_treefname)
        self.args.n_trees = len(ages)
    return ages, treestrs
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Assemble a multi-sequence annotation line from one bcr-phylo simulation event.

    Reads the mutated sequences bcr-phylo wrote, builds a single-sequence line for each one
    (cloned from <naive_line>), merges them into one multi-seq <final_line>, and -- in selection
    mode -- attaches kd/affinity info and the (rescaled) tree. Also records target sequences.
    Uses module-level globals: args, ete_path, RecombinationEvent, bcr_phylo_fasta_fname.
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)
        nodefo = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd': float(line['kd']),
                    'relative_kd': float(line['relative_kd']),
                    'lambda': line.get('lambda', None),
                    'target_index': int(line['target_index']),
                }
        if len(set(nodefo) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(nodefo)) > 0:
            print ' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo))
        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # presumably converts branch lengths to per-base units by dividing by mean sequence length -- TODO confirm intended units
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything
    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))  # NOTE(review): targets file is presumably only written in selection mode -- confirm this path isn't reached for neutral simulation
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]
    return final_line
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Assemble a multi-sequence annotation line from one bcr-phylo simulation event (older variant).

    Like the other parse_bcr_phylo_output version, but: reads the fasta path directly, extracts only
    plain kd values (via view-trees.py under xvfb), and computes nearest-target indices itself by
    amino-acid hamming distance to the target sequences.
    Uses module-level globals: args, ete_path, RecombinationEvent.
    """
    seqfos = utils.read_fastx('%s/%s.fasta' % (outdir, args.extrastr))  # output mutated sequences from bcr-phylo
    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        mline['seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['input_seqs'] = [sfo['seq']]  # it's really important to set both the seqs (since they're both already in there from the naive line)
        reco_info[sfo['name']] = mline
        utils.add_implicit_info(glfo, mline)
    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = 'export PATH=%s:$PATH && xvfb-run -a python ./bin/view-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (ete_path, outdir, args.extrastr, outdir, outdir)
        utils.simplerun(cmd, shell=True)
        kdvals = {}
        with open('%s/kd-vals.csv' % outdir) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                kdvals[line['uid']] = float(line['kd'])
        if len(set(kdvals) - set(final_line['unique_ids'])) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(kdvals) - set(final_line['unique_ids']))
        if len(set(final_line['unique_ids']) - set(kdvals)) > 0:
            print ' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(kdvals))
        final_line['affinities'] = [1. / kdvals[u] for u in final_line['unique_ids']]
        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        if args.debug:
            print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12)
        final_line['tree'] = tree.as_string(schema='newick')
    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything
    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))  # NOTE(review): targets file is presumably only written in selection mode -- confirm this path isn't reached for neutral simulation
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]
    from Bio.Seq import Seq  # local import since Bio is only needed here
    final_line['nearest_target_indices'] = []
    aa_targets = [Seq(seq).translate() for seq in final_line['target_seqs']]
    for mseq in final_line['input_seqs']:
        aa_mseq = Seq(mseq).translate()
        aa_hdists = [utils.hamming_distance(aa_t, aa_mseq, amino_acid=True) for aa_t in aa_targets]
        imin = aa_hdists.index(min(aa_hdists))  # NOTE doesn't do anything differently if there's more than one min
        final_line['nearest_target_indices'].append(imin)
    return final_line
def make_single_tree(self, partitions, annotations, uid_set, get_fasttrees=False, n_max_cons_seqs=10, debug=False): # NOTE don't call this externally -- if you want a single tree, call make_trees() with <i_only_cluster> set def getline(uidstr, uid_set=None): if uidstr in annotations: # if we have this exact annotation return annotations[uidstr] else: if uid_set is None: uid_set = set(uidstr.split(':')) # should only get called if it's a singleton # note that for internal nodes in a fasttree-derived subtree, the uids will be out of order compared the the annotation keys for line in annotations.values(): # we may actually have the annotation for every subcluster (e.g. if --calculate-alternative-annotations was set), but in case we don't, this is fine if len(uid_set & set(line['unique_ids'])) > 0: # just take the first one with any overlap. Yeah, it's not necessarily the best, but its naive sequence probably isn't that different, and for just getting the fasttree it reeeeeeaaaallly doesn't matter return line raise Exception('couldn\'t find uid %s in annotations' % uid) def getseq(uid): line = getline(uid) return line['seqs'][line['unique_ids'].index(uid)] def lget(uid_list): return ':'.join(uid_list) # check for repeated uids (was only from seed uid, which shouldn't happen any more, but the code below throws an infinite loop if we do, so may as well be careful) for partition in partitions: if sum(len(c) for c in partition) > len(set(u for c in partition for u in c)): repeated_uids = [u for u, count in collections.Counter([u for c in partition for u in c]).items() if count > 1] raise Exception('found %d uid%s in more than one cluster (%s)' % (len(repeated_uids), utils.plural(len(repeated_uids)), ', '.join(repeated_uids))) default_edge_length = 999999 # it's nice to have the edges all set to something that's numeric (so the trees print), but also obvious wrong, if we forget to set somebody assert len(partitions[-1]) == 1 root_label = lget(partitions[-1][0]) # we want 
the order of the uids in the label to correspond to the order in self.partitions tns = dendropy.TaxonNamespace([root_label]) root_node = dendropy.Node(taxon=tns.get_taxon(root_label)) root_node.uids = uid_set # each node keeps track of the uids of its children dtree = dendropy.Tree(taxon_namespace=tns, seed_node=root_node) if debug: print ' starting tree with %d leaves' % len(uid_set) for ipart in reversed(range(len(partitions) - 1)): # dendropy seems to only have fcns to build a tree from the root downward, so we loop starting with the last partition (- 1 is because the last partition is guaranteed to be just one cluster) for lnode in dtree.leaf_node_iter(): # look for leaf nodes that contain uids from two clusters in this partition, and add those as children tclusts = [c for c in partitions[ipart] if len(set(c) & lnode.uids) > 0] if len(tclusts) < 2: continue for tclust in tclusts: ttaxon = dendropy.Taxon(lget(tclust)) tns.add_taxon(ttaxon) child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length) child.uids = set(tclust) if debug: print ' ipart %d' % ipart print ' split node: %d --> %s %s --> %s' % (len(lnode.uids), ' '.join([str(len(tc)) for tc in tclusts]), lnode.taxon.label, ' '.join([c.taxon.label for c in lnode.child_node_iter()])) # split existing leaves, which are probably not singletons (they're probably from the initial naive sequence collapse step) into subtrees such that each leaf is a singleton for lnode in dtree.leaf_node_iter(): if len(lnode.uids) == 1: continue if get_fasttrees and len(lnode.uids) > 2: seqfos = [{'name' : uid, 'seq' : getseq(uid)} for uid in lnode.taxon.label.split(':')] # may as well add them in the right order, although I don't think it matters subtree = treeutils.get_fasttree_tree(seqfos, getline(lnode.taxon.label, uid_set=lnode.uids)['naive_seq'], suppress_internal_node_taxa=True) # note that the fasttree distances get ignored below (no idea if they'd be better than what we set down there, but they probably 
wouldn't be consistent, so I'd rather ignore them) for tmpnode in subtree.postorder_node_iter(): if tmpnode.is_leaf(): tmpnode.uids = set([tmpnode.taxon.label]) else: tmpnode.uids = set([uid for c in tmpnode.child_node_iter() for uid in c.uids]) ttaxon = dendropy.Taxon(lget(tmpnode.uids)) subtree.taxon_namespace.add_taxon(ttaxon) tmpnode.taxon = ttaxon # ...and use the string of leaf nodes, even though they'll be in the wrong order (I think these get ignored when I call label_nodes() below, but it's still tidier to have them right in the meantime, and anyway since I'm suppressing internal taxa I think I need to set them to something) if debug: print ' adding subtree with %d leaves from fastree at leaf node %s' % (len(seqfos), lnode.taxon.label) print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=subtree)) dtree.taxon_namespace.add_taxa(subtree.taxon_namespace) lnode.add_child(subtree.seed_node) assert len(lnode.child_edges()) == 1 # we're iterating over leaves, so this should always be true lnode.child_edges()[0].collapse() else: # just add a star subtree for uid in lnode.taxon.label.split(':'): # may as well add them in the right order, although I don't think it matters ttaxon = dendropy.Taxon(uid) tns.add_taxon(ttaxon) child = lnode.new_child(taxon=ttaxon, edge_length=default_edge_length) child.uids = set([uid]) if debug: print ' added %d singleton children for %s' % (len(lnode.uids), lnode.taxon.label) # in order to set edge lengths, we need node sequences, so first set leaf node seqs for lnode in dtree.leaf_node_iter(): assert len(lnode.uids) == 1 lnode.seq = getseq(lnode.taxon.label) lnode.n_descendent_leaves = 1 # keep track of how many leaf nodes contributed to each node's consensus sequence (these are leaves, so it's trivally 1). 
This is less accurate than keeping track of all the sequences, but also faster # then set internal node seqs as the consensus of their children, and set the distance as hamming distance to child seqs if debug: print ' adding edge lengths either from fasttree %s or cons seq %s' % (utils.color('blue', 'x'), utils.color('red', 'x')) min_edge_length = None # setting this is nice for better debug viewing for node in dtree.postorder_internal_node_iter(): # includes root node child_cons_seq_counts = [c.n_descendent_leaves for c in node.child_node_iter()] total_descendent_leaves = sum(child_cons_seq_counts) if total_descendent_leaves > n_max_cons_seqs: # if there's tons of descendent leaves, we don't want to pass them all to the consensus fcn since it's slow, so we choose them in proportion to their actual proportions, but scaled down to <n_max_cons_seqs> child_cons_seq_counts = [int(n_max_cons_seqs * csc / float(total_descendent_leaves)) for csc in child_cons_seq_counts] child_cons_seq_counts = [max(1, csc) for csc in child_cons_seq_counts] # don't eliminate any sequences entirely (this makes the proportions less accurate (in some cases), but is the easy way to handle the case where there's a ton of singleton children if debug: print ' %s' % utils.color('green', node.taxon.label) csc_str = ' (reduced: %s)' % ' '.join([str(csc) for csc in child_cons_seq_counts]) if total_descendent_leaves > n_max_cons_seqs else '' print ' desc leaves per child: %s%s' % (' '.join(str(c.n_descendent_leaves) for c in node.child_node_iter()), csc_str) child_seqfos = [{'name' : cn.taxon.label + '-leaf-' + str(il), 'seq' : cn.seq} for cn, count in zip(node.child_node_iter(), child_cons_seq_counts) for il in range(count)] node.seq = utils.cons_seq(0.01, aligned_seqfos=child_seqfos, tie_resolver_seq=getline(root_label)['naive_seq']) #, debug=debug) # the consensus has an N at every position where the constituent sequences gave a tie. 
But Ns screw up the distances (especially because once we *get* an N, we can't get rid of it and it's propagated all the way up the tree), and in almost all cases the correct choice should be the naive base, so we use that node.n_descendent_leaves = total_descendent_leaves for edge in node.child_edge_iter(): from_fasttree = False if edge.length == default_edge_length: # otherwise it was set by fasttree, and it's probably better than what we'd get from this (it'd be nice to skip the cons seq stuff for the whole fasttree subtree, but then we don't have the cons seqs we need for later) edge.length = utils.hamming_distance(edge.head_node.seq, node.seq) / float(len(node.seq)) else: from_fasttree = True if min_edge_length is not None: edge.length = max(min_edge_length, edge.length) if debug: print ' %6.3f %s %s' % (edge.length, utils.color('blue' if from_fasttree else 'red', 'x'), edge.head_node.taxon.label) if debug: print ' naive seq %s' % getline(root_label)['naive_seq'] # NOTE might be worthwhile to add an edge connecting seed node and the actual naive sequence (i.e. for cases where our approximate naive is off) print ' root cons seq %s' % utils.color_mutants(getline(root_label)['naive_seq'], dtree.seed_node.seq) for node in dtree.preorder_node_iter(): del node.uids del node.seq del node.n_descendent_leaves treeutils.label_nodes(dtree, ignore_existing_internal_node_labels=True, ignore_existing_internal_taxon_labels=True, debug=debug) dtree.update_bipartitions() # probably don't really need this if debug: print treeutils.utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=dtree, width=250)) return dtree
def add_mutants(self, reco_event, irandom):
    """Simulate SHM on <reco_event>'s recombined sequence by running bppseqgen per region on a rescaled tree.

    Chooses a tree (plus per-region mutation ratios) from self.treeinfo, rescales it per region,
    runs bppseqgen on v/d/j separately, stitches the regional mutated sequences back together,
    reverts broken conserved codons, adds SHM indels, and sets reco_event's line.
    <irandom> is passed through as the bppseqgen seed and to reco_event.setline().
    """
    if self.args.mutation_multiplier is not None and self.args.mutation_multiplier == 0.:  # some of the stuff below fails if mut mult is actually 0.
        reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequnce in reco_event
        reco_event.indelfos = [indelutils.get_empty_indel() for _ in range(len(reco_event.final_seqs))]
        return

    # When generating trees, each tree's number of leaves and total depth are chosen from the specified distributions (a.t.m., by default n-leaves is from a geometric/zipf, and depth is from data)
    # This chosen depth corresponds to the sequence-wide mutation frequency.
    # In order to account for varying mutation rates in v, d, and j we simulate these regions separately, by appropriately rescaling the tree for each region.
    # i.e.: here we get the sequence-wide mute freq from the tree, and rescale it by the repertoire-wide ratios from data (which are stored in the tree file).
    # looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the newick trees has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
    # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the same for all the trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to here than at the end of each line in the file
    treefostr = self.treeinfo[random.randint(0, len(self.treeinfo)-1)]  # per-region mutation info is tacked on after the tree... sigh. kind of hackey but works ok.
    assert treefostr.count(';') == 1
    isplit = treefostr.find(';') + 1
    chosen_tree = treefostr[:isplit]  # includes semi-colon
    reco_event.set_tree(chosen_tree)  # leaf names are still just like t<n>
    mutefo = [rstr for rstr in treefostr[isplit:].split(',')]
    mean_total_height = treeutils.get_mean_leaf_height(treestr=chosen_tree)
    regional_heights = {}  # per-region height, including <self.args.mutation_multiplier>
    for tmpstr in mutefo:
        region, ratio = tmpstr.split(':')
        assert region in utils.regions
        ratio = float(ratio)
        if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
            ratio *= self.args.mutation_multiplier
        regional_heights[region] = mean_total_height * ratio

    scaled_trees = {r : treeutils.rescale_tree(regional_heights[r], treestr=chosen_tree) for r in utils.regions}

    if self.args.debug:
        print ' chose tree with total height %f' % treeutils.get_mean_leaf_height(treestr=chosen_tree)
        print ' regional trees rescaled to heights: %s' % (' '.join(['%s %.3f (expected %.3f)' % (region, treeutils.get_mean_leaf_height(treestr=scaled_trees[region]), regional_heights[region]) for region in utils.regions]))

    n_leaves = treeutils.get_n_leaves(treeutils.get_dendro_tree(treestr=chosen_tree, schema='newick'))
    cmdfos = []
    regional_naive_seqs = {}  # only used for tree checking
    for region in utils.regions:
        simstr = reco_event.eroded_seqs[region]
        if region == 'd':  # the d region carries the vd and dj insertions through bppseqgen
            simstr = reco_event.insertions['vd'] + simstr + reco_event.insertions['dj']
        cmdfos.append(self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaves, reco_event.genes[region], reco_event, seed=irandom))
        regional_naive_seqs[region] = simstr

    utils.run_cmds([cfo for cfo in cmdfos if cfo is not None], sleep=False)  # shenanigan is to handle zero-length regional seqs

    mseqs = {}
    for ireg in range(len(utils.regions)):  # NOTE kind of sketchy just using index in <utils.regions> (although it just depends on the loop immediately above a.t.m.)
        if cmdfos[ireg] is None:
            mseqs[utils.regions[ireg]] = ['' for _ in range(n_leaves)]  # return an empty string for each leaf node
        else:
            tmp_names, tmp_seqs = self.read_bppseqgen_output(cmdfos[ireg], n_leaves)
            if reco_event.leaf_names is None:
                reco_event.leaf_names = tmp_names
            assert reco_event.leaf_names == tmp_names  # enforce different regions having same name + ordering (although this is already enforced when reading bppseqgen output)
            mseqs[utils.regions[ireg]] = tmp_seqs

    assert len(reco_event.final_seqs) == 0
    for iseq in range(n_leaves):
        seq = mseqs['v'][iseq] + mseqs['d'][iseq] + mseqs['j'][iseq]
        seq = reco_event.revert_conserved_codons(seq, debug=self.args.debug)  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        reco_event.final_seqs.append(seq)  # set final sequnce in reco_event
        reco_event.final_codon_positions.append(copy.deepcopy(reco_event.post_erosion_codon_positions))  # separate codon positions for each sequence, because of shm indels
    self.add_shm_indels(reco_event)

    reco_event.setline(irandom)  # set the line here because we use it when checking tree simulation, and want to make sure the uids are always set at the same point in the workflow
    # self.check_tree_simulation(mean_total_height, regional_heights, chosen_tree, scaled_trees, regional_naive_seqs, mseqs, reco_event)
    # self.print_validation_values()

    if self.args.debug:
        print ' tree passed to bppseqgen:'
        print treeutils.get_ascii_tree(dendro_tree=reco_event.tree, extra_str=' ')
        utils.print_reco_event(reco_event.line, extra_str=' ')
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent): seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir)) # output mutated sequences from bcr-phylo assert len(naive_line['unique_ids']) == 1 # enforces that we ran naive-only, 1-leaf partis simulation above assert not indelutils.has_indels(naive_line['indelfos'][0]) # would have to handle this below if args.debug: utils.print_reco_event(naive_line) reco_info = collections.OrderedDict() for sfo in seqfos: mline = copy.deepcopy(naive_line) utils.remove_all_implicit_info(mline) del mline['tree'] mline['unique_ids'] = [sfo['name']] mline['seqs'] = [sfo['seq']] # it's really important to set both the seqs (since they're both already in there from the naive line) mline['input_seqs'] = [sfo['seq']] # it's really important to set both the seqs (since they're both already in there from the naive line) mline['duplicates'] = [[]] reco_info[sfo['name']] = mline try: utils.add_implicit_info(glfo, mline) except: # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file print 'implicit info adding failed for ievent %d in %s' % (ievent, outdir) lines = traceback.format_exception(*sys.exc_info()) print utils.pad_lines(''.join(lines)) # NOTE this will still crash on the next line if implicit info adding failed final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info) if args.debug: utils.print_reco_event(final_line) # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read) if args.stype == 'selection': kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4): # eh, don't really need to check for both kd an nwk file, 
chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname) utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1) nodefo = {} with open(kdfname) as kdfile: reader = csv.DictReader(kdfile) for line in reader: nodefo[line['uid']] = { 'kd' : float(line['kd']), 'relative_kd' : float(line['relative_kd']), 'lambda' : line.get('lambda', None), 'target_index' : int(line['target_index']), } if len(set(nodefo) - set(final_line['unique_ids'])) > 0: # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % (set(nodefo) - set(final_line['unique_ids'])) if len(set(final_line['unique_ids']) - set(nodefo)) > 0: print ' in final_line, but missing from kdvals: %s' % ' '.join(set(final_line['unique_ids']) - set(nodefo)) final_line['affinities'] = [1. / nodefo[u]['kd'] for u in final_line['unique_ids']] final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids']] final_line['lambdas'] = [nodefo[u]['lambda'] for u in final_line['unique_ids']] final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in final_line['unique_ids']] tree = treeutils.get_dendro_tree(treefname=nwkfname) tree.scale_edges(1. 
/ numpy.mean([len(s) for s in final_line['seqs']])) if args.debug: print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12) final_line['tree'] = tree.as_string(schema='newick') tmp_event = RecombinationEvent(glfo) # I don't want to move the function out of event.py right now tmp_event.set_reco_id(final_line, irandom=ievent) # not sure that setting <irandom> here actually does anything # get target sequences target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr)) final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos] return final_line
def get_mature_line(sfos, naive_line, glfo, nodefo, dtree, target_sfos, locus=None): assert len( naive_line['unique_ids'] ) == 1 # enforces that we ran naive-only, 1-leaf partis simulation above assert not indelutils.has_indels( naive_line['indelfos'][0]) # would have to handle this below if args.debug: utils.print_reco_event(naive_line) reco_info = collections.OrderedDict() for sfo in sfos: mline = utils.get_non_implicit_copy(naive_line) del mline['tree'] mline['unique_ids'] = [sfo['name']] mline['seqs'] = [sfo['seq']] mline['input_seqs'] = [ sfo['seq'] ] # it's really important to set both the seqs (since they're both already in there from the naive line) mline['duplicates'] = [[]] reco_info[sfo['name']] = mline try: utils.add_implicit_info(glfo, mline) except: # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file print 'implicit info adding failed for ievent %d in %s' % ( ievent, outdir) lines = traceback.format_exception(*sys.exc_info()) print utils.pad_lines( ''.join(lines) ) # NOTE this will still crash on the next line if implicit info adding failed final_line = utils.synthesize_multi_seq_line_from_reco_info( [sfo['name'] for sfo in sfos], reco_info) ftree = copy.deepcopy(dtree) if locus is not None: def ltr(u): return u + '-' + locus new_nodefo = {} for u_old in nodefo: new_nodefo[ltr(u_old)] = nodefo[u_old] nodefo = new_nodefo treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']]) final_line['unique_ids'] = [ ltr(u) for u in final_line['unique_ids'] ] assert len(sfos) == len(final_line['unique_ids']) for iseq, sfo in enumerate(sfos): naive_id = naive_line['unique_ids'][0] assert naive_id.count('-') == 1 bstr = naive_id.replace('-' + locus, '') pids = final_line['paired-uids'][iseq] assert len(pids) == 
1 and pids[0].find( bstr ) == 0 and pids[0].count('-') == 1 and pids[0].split( '-' )[1] in utils.loci # if uid is xxx-igh, paired id shoud be e.g. xxx-igk final_line['paired-uids'][iseq] = [ p.replace(bstr, sfo['name']) for p in pids ] if args.debug: utils.print_reco_event(final_line) # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read) if len( set(nodefo) - set(final_line['unique_ids']) ) > 0: # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes print ' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % ( set(nodefo) - set(final_line['unique_ids'])) if len(set(final_line['unique_ids']) - set(nodefo)) > 0: print ' in final_line, but missing from kdvals: %s' % ' '.join( set(final_line['unique_ids']) - set(nodefo)) final_line['affinities'] = [ 1. / nodefo[u]['kd'] for u in final_line['unique_ids'] ] final_line['relative_affinities'] = [ 1. / nodefo[u]['relative_kd'] for u in final_line['unique_ids'] ] final_line['lambdas'] = [ nodefo[u]['lambda'] for u in final_line['unique_ids'] ] final_line['nearest_target_indices'] = [ nodefo[u]['target_index'] for u in final_line['unique_ids'] ] ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']])) if args.debug: print utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12) final_line['tree'] = ftree.as_string(schema='newick') tmp_event = RecombinationEvent( glfo ) # I don't want to move the function out of event.py right now tmp_event.set_reco_id( final_line, irandom=ievent ) # not sure that setting <irandom> here actually does anything final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos] return final_line