def get_rescaled_trees(self, treestr, branch_length_ratios, debug=False):
    """Return a separately rescaled copy of <treestr> for each region.

    Trees are generated with the mean branch length observed in data over the
    whole sequence, because we want to use topologically the same tree for the
    whole sequence. But we observe different branch lengths for each region,
    so we need to rescale the tree for v, d, and j.

    Args:
        treestr: the tree, as passed on to treegenerator.rescale_tree() and
            treegenerator.get_leaf_node_depths().
        branch_length_ratios: dict with one entry per region in
            utils.regions, giving the factor handed to
            treegenerator.rescale_tree() for that region.
        debug: if True, print each rescaling as it is done (this looks up the
            depth of the leaf named 't1' in the rescaled tree).

    Returns:
        dict mapping each region in utils.regions to its rescaled tree.

    Raises:
        AssertionError: if, for any leaf, utils.is_normed() (with
            this_eps=1e-6) rejects the observed depth ratio divided by the
            requested ratio.
    """
    # the leaf depths of the unscaled tree are the same for every region, so work them out once, up front
    initial_depths = dict(treegenerator.get_leaf_node_depths(treestr).items())

    rescaled_trees = {}
    if debug:
        print(' rescaling tree:')
    for region in utils.regions:
        ratio = branch_length_ratios[region]
        # rescale the tree
        rescaled_trees[region] = treegenerator.rescale_tree(treestr, ratio)
        rescaled_depths = treegenerator.get_leaf_node_depths(rescaled_trees[region])
        if debug:
            print(' %s by %f (new depth %f): %s -> %s' % (
                region, ratio, rescaled_depths['t1'], treestr, rescaled_trees[region]))
        # and then check it NOTE can remove this eventually
        for node, depth in rescaled_depths.items():
            depth_ratio = depth / initial_depths[node]
            assert utils.is_normed(depth_ratio / ratio, this_eps=1e-6)
    return rescaled_trees
def add_mutants(self, reco_event, irandom):
    """Choose a random tree and fill <reco_event>.final_seqs with mutated sequences.

    A line is drawn at random from self.treeinfo and split into its newick
    tree and its per-region branch length ratios. The tree is rescaled for
    each region with self.get_rescaled_trees(), and self.run_bppseqgen() is
    run on each eroded region and on the vd and dj insertions (using the v
    and j trees, respectively). The results are concatenated in
    v, vd, d, dj, j order to build each final sequence.

    Args:
        reco_event: event supplying eroded_seqs, insertions and genes. Its
            final_seqs list must be empty on entry and is appended to in place.
        irandom: passed to self.run_bppseqgen() as the seed.

    Raises:
        AssertionError: if the tree info names a region that isn't in
            utils.regions, if reco_event.final_seqs isn't empty on entry, or
            if utils.are_conserved_codons_screwed_up() flags the event.
    """
    chosen_treeinfo = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]
    # each line looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87,
    # where the newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
    treeinfo_fields = chosen_treeinfo.split(';')
    chosen_tree = treeinfo_fields[0] + ';'
    # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the *same* for all the
    # trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to
    # here than at the end of each line in the file
    branch_length_ratios = {}
    for tmpstr in treeinfo_fields[1].split(','):
        ratio_fields = tmpstr.split(':')
        region = ratio_fields[0]
        assert region in utils.regions
        ratio = float(ratio_fields[1])
        if self.args.branch_length_multiplier is not None:  # multiply the branch lengths by some factor
            ratio *= self.args.branch_length_multiplier
        branch_length_ratios[region] = ratio

    if self.args.debug:  # NOTE should be the same for t[0-9]... but I guess I should check at some point
        # kind of hackey to just look at t1, but they're all the same anyway and it's just for printing purposes...
        print(' using tree with total depth %f' % treegenerator.get_leaf_node_depths(chosen_tree)['t1'])
        Phylo.draw_ascii(Phylo.read(StringIO(chosen_tree), 'newick'))
        ratio_str = ', '.join(['%s %f' % (region, branch_length_ratios[region]) for region in utils.regions])
        # the extra ' ' is the separator that a two-item python 2 print statement would have written
        print(' with branch length ratios ' + ' ' + ratio_str)

    scaled_trees = self.get_rescaled_trees(chosen_tree, branch_length_ratios)

    # NOTE would be nice to parallelize this
    mutes = {}
    for region in utils.regions:
        mutes[region] = self.run_bppseqgen(reco_event.eroded_seqs[region], scaled_trees[region],
                                           reco_event.genes[region], reco_event, seed=irandom, is_insertion=False)
    # NOTE would be nice to use a better mutation model for the insertions
    mutes['vd'] = self.run_bppseqgen(reco_event.insertions['vd'], scaled_trees['v'], 'vd_insert',
                                     reco_event, seed=irandom, is_insertion=True)
    mutes['dj'] = self.run_bppseqgen(reco_event.insertions['dj'], scaled_trees['j'], 'dj_insert',
                                     reco_event, seed=irandom, is_insertion=True)

    assert len(reco_event.final_seqs) == 0
    # indexed (rather than zipped) so that a length mismatch between the pieces still raises instead of truncating
    for iseq in range(len(mutes['v'])):
        seq = mutes['v'][iseq] + mutes['vd'][iseq] + mutes['d'][iseq] + mutes['dj'][iseq] + mutes['j'][iseq]  # build final sequence
        seq = reco_event.revert_conserved_codons(seq)  # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        reco_event.final_seqs.append(seq)  # set final sequence in reco_event

    # NOTE(review): placed after the loop; the original layout was lost when the whitespace was collapsed -- confirm
    assert not utils.are_conserved_codons_screwed_up(reco_event)
def get_rescaled_trees(self, treestr, branch_length_ratios):
    """Return a separately rescaled copy of <treestr> for each region.

    Trees are generated with the mean branch length observed in data over the
    whole sequence, because we want to use topologically the same tree for the
    whole sequence. But we observe different branch lengths for each region,
    so we need to rescale the tree for v, d, and j.

    Args:
        treestr: the tree, as passed on to treegenerator.rescale_tree() and
            treegenerator.get_leaf_node_depths().
        branch_length_ratios: dict with one entry per region in
            utils.regions, giving the factor handed to
            treegenerator.rescale_tree() for that region.

    Returns:
        dict mapping each region in utils.regions to its rescaled tree.

    Raises:
        AssertionError: if, for any leaf, utils.is_normed() (with
            this_eps=1e-6) rejects the observed depth ratio divided by the
            requested ratio.
    """
    # the leaf depths of the unscaled tree don't depend on the region, so only compute them once
    initial_depths = dict(treegenerator.get_leaf_node_depths(treestr).items())

    rescaled_trees = {}
    for region in utils.regions:
        ratio = branch_length_ratios[region]
        # rescale the tree
        rescaled_trees[region] = treegenerator.rescale_tree(treestr, ratio)
        # and then check it NOTE can remove this eventually
        for node, depth in treegenerator.get_leaf_node_depths(rescaled_trees[region]).items():
            depth_ratio = depth / initial_depths[node]
            assert utils.is_normed(depth_ratio / ratio, this_eps=1e-6)
    return rescaled_trees
def add_mutants(self, reco_event, irandom):
    """Choose a random tree and fill <reco_event>.final_seqs with mutated sequences.

    A line is drawn at random from self.treeinfo and split into its newick
    tree and per-region branch length ratios (each optionally multiplied by
    self.args.mutation_multiplier). One command is prepared per region with
    self.prepare_bppseqgen(), with the vd and dj insertions glued onto either
    side of the d sequence; the commands are run with utils.run_cmds() and
    read back with self.read_bppseqgen_output(). The v, d and j results are
    concatenated for each leaf, and self.add_shm_indels() is called at the end.

    Args:
        reco_event: event supplying eroded_seqs, insertions and genes. Its
            final_seqs list must be empty on entry and is appended to in place.
        irandom: passed to self.prepare_bppseqgen() as the seed.

    Raises:
        AssertionError: if the tree info names a region that isn't in
            utils.regions, or if reco_event.final_seqs isn't empty on entry.
    """
    treeinfo_line = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]
    chosen_tree = treeinfo_line.split(';')[0] + ';'

    # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the *same* for all the
    # trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to
    # here than at the end of each line in the file
    branch_length_ratios = {}
    # a line looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the
    # newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
    for region_ratio in treeinfo_line.split(';')[1].split(','):
        pieces = region_ratio.split(':')
        region = pieces[0]
        assert region in utils.regions
        scale = float(pieces[1])
        if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
            scale = scale * self.args.mutation_multiplier
        branch_length_ratios[region] = scale

    if self.args.debug:  # NOTE should be the same for t[0-9]... but I guess I should check at some point
        # kind of hackey to just look at t1, but they're all the same anyway and it's just for printing purposes...
        total_depth = treegenerator.get_leaf_node_depths(chosen_tree)['t1']
        print(' using tree with total depth %f' % total_depth)
        if len(re.findall('t', chosen_tree)) <= 1:
            print(' one leaf')
        else:  # more than one leaf
            Phylo.draw_ascii(Phylo.read(StringIO(chosen_tree), 'newick'))
        ratio_strs = ['%s %f' % (region, branch_length_ratios[region]) for region in utils.regions]
        # the extra ' ' is the separator that a two-item python 2 print statement writes
        print(' with branch length ratios ' + ' ' + ', '.join(ratio_strs))

    scaled_trees = self.get_rescaled_trees(chosen_tree, branch_length_ratios)
    n_leaf_nodes = len(re.findall('t[0-9][0-9]*', chosen_tree))

    cmdfos = []
    for region in utils.regions:
        simstr = reco_event.eroded_seqs[region]
        if region == 'd':  # the insertions get simulated along with d
            simstr = reco_event.insertions['vd'] + simstr + reco_event.insertions['dj']
        cmdfo = self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaf_nodes,
                                       reco_event.genes[region], reco_event, seed=irandom)
        cmdfos.append(cmdfo)

    # shenanigan is to handle zero-length regional seqs (for which there's no command to run)
    runnable_cmdfos = [cmdfo for cmdfo in cmdfos if cmdfo is not None]
    utils.run_cmds(runnable_cmdfos, sleep=False)

    mseqs = {}
    for region, cmdfo in zip(utils.regions, cmdfos):
        if cmdfo is not None:
            mseqs[region] = self.read_bppseqgen_output(cmdfo, n_leaf_nodes)
        else:
            mseqs[region] = [''] * n_leaf_nodes  # an empty string for each leaf node

    assert len(reco_event.final_seqs) == 0
    for ileaf in range(n_leaf_nodes):
        full_seq = mseqs['v'][ileaf] + mseqs['d'][ileaf] + mseqs['j'][ileaf]
        # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        reco_event.final_seqs.append(reco_event.revert_conserved_codons(full_seq))  # set final sequence in reco_event

    self.add_shm_indels(reco_event)
def add_mutants(self, reco_event, irandom):
    """Choose a random tree and fill <reco_event>.final_seqs with mutated sequences.

    A line is drawn at random from self.treeinfo and split into its newick
    tree and per-region branch length ratios (each optionally multiplied by
    self.args.mutation_multiplier). The tree is rescaled for each region
    with self.get_rescaled_trees(), and self.run_bppseqgen() is run on each
    eroded region and on the vd and dj insertions (using the v and j trees,
    respectively). The results are concatenated in v, vd, d, dj, j order,
    and self.add_shm_indels() is called at the end.

    Args:
        reco_event: event supplying eroded_seqs, insertions and genes. Its
            final_seqs list must be empty on entry and is appended to in place.
        irandom: passed to self.run_bppseqgen() as the seed.

    Raises:
        AssertionError: if the tree info names a region that isn't in
            utils.regions, if reco_event.final_seqs isn't empty on entry, or
            if utils.are_conserved_codons_screwed_up() flags the event.
    """
    treeinfo_line = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]
    chosen_tree = treeinfo_line.split(';')[0] + ';'

    # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the *same* for all the
    # trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to
    # here than at the end of each line in the file
    branch_length_ratios = {}
    # a line looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the
    # newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
    ratio_field = treeinfo_line.split(';')[1]
    for region_ratio in ratio_field.split(','):
        pieces = region_ratio.split(':')
        region = pieces[0]
        assert region in utils.regions
        scale = float(pieces[1])
        if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
            scale = scale * self.args.mutation_multiplier
        branch_length_ratios[region] = scale

    if self.args.debug:  # NOTE should be the same for t[0-9]... but I guess I should check at some point
        # kind of hackey to just look at t1, but they're all the same anyway and it's just for printing purposes...
        total_depth = treegenerator.get_leaf_node_depths(chosen_tree)['t1']
        print(' using tree with total depth %f' % total_depth)
        if len(re.findall('t', chosen_tree)) <= 1:
            print(' one leaf')
        else:  # more than one leaf
            Phylo.draw_ascii(Phylo.read(StringIO(chosen_tree), 'newick'))
        ratio_strs = ['%s %f' % (region, branch_length_ratios[region]) for region in utils.regions]
        # the extra ' ' is the separator that a two-item python 2 print statement writes
        print(' with branch length ratios ' + ' ' + ', '.join(ratio_strs))

    scaled_trees = self.get_rescaled_trees(chosen_tree, branch_length_ratios)

    # NOTE would be nice to parallelize this
    mutes = {
        region: self.run_bppseqgen(reco_event.eroded_seqs[region], scaled_trees[region],
                                   reco_event.genes[region], reco_event, seed=irandom, is_insertion=False)
        for region in utils.regions
    }
    # NOTE would be nice to use a better mutation model for the insertions
    mutes['vd'] = self.run_bppseqgen(reco_event.insertions['vd'], scaled_trees['v'], 'vd_insert',
                                     reco_event, seed=irandom, is_insertion=True)
    mutes['dj'] = self.run_bppseqgen(reco_event.insertions['dj'], scaled_trees['j'], 'dj_insert',
                                     reco_event, seed=irandom, is_insertion=True)

    assert len(reco_event.final_seqs) == 0
    for ileaf in range(len(mutes['v'])):
        # build final sequence
        v_part = mutes['v'][ileaf]
        vd_part = mutes['vd'][ileaf]
        d_part = mutes['d'][ileaf]
        dj_part = mutes['dj'][ileaf]
        j_part = mutes['j'][ileaf]
        full_seq = v_part + vd_part + d_part + dj_part + j_part
        # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        reco_event.final_seqs.append(reco_event.revert_conserved_codons(full_seq))  # set final sequence in reco_event

    # NOTE(review): placed after the loop; the original layout was lost when the whitespace was collapsed -- confirm
    assert not utils.are_conserved_codons_screwed_up(reco_event)

    self.add_shm_indels(reco_event)
def add_mutants(self, reco_event, irandom):
    """Choose a random tree and fill <reco_event>.final_seqs with mutated sequences.

    If self.args.mutation_multiplier is exactly zero, no simulation is run:
    reco_event.recombined_seq is appended to final_seqs as is, indelfos is
    set to one utils.get_empty_indel() per final sequence, and the method
    returns.

    Otherwise a line is drawn at random from self.treeinfo and split into
    its newick tree and per-region branch length ratios (each optionally
    multiplied by self.args.mutation_multiplier). One command is prepared
    per region with self.prepare_bppseqgen(), with the vd and dj insertions
    glued onto either side of the d sequence; the commands are run with
    utils.run_cmds() and read back with self.read_bppseqgen_output(). The
    v, d and j results are concatenated for each leaf, and
    self.add_shm_indels() is called at the end.

    Args:
        reco_event: event supplying recombined_seq, eroded_seqs, insertions
            and genes. Its final_seqs list is appended to in place (and must
            be empty on entry unless the multiplier is zero).
        irandom: passed to self.prepare_bppseqgen() as the seed.

    Raises:
        AssertionError: if the tree info names a region that isn't in
            utils.regions, or if reco_event.final_seqs isn't empty when the
            simulated sequences are about to be added.
    """
    # some of the stuff below fails if mut mult is actually 0.
    if self.args.mutation_multiplier is not None and self.args.mutation_multiplier == 0.:
        reco_event.final_seqs.append(reco_event.recombined_seq)  # set final sequence in reco_event
        n_final_seqs = len(reco_event.final_seqs)
        reco_event.indelfos = [utils.get_empty_indel() for _ in range(n_final_seqs)]
        return

    treeinfo_line = self.treeinfo[random.randint(0, len(self.treeinfo) - 1)]
    chosen_tree = treeinfo_line.split(';')[0] + ';'

    # NOTE a.t.m (and probably permanently) the mean branch lengths for each region are the *same* for all the
    # trees in the file, I just don't have a better place to put them while I'm passing from TreeGenerator to
    # here than at the end of each line in the file
    branch_length_ratios = {}
    # a line looks like e.g.: (t2:0.003751736951,t1:0.003751736951):0.001248262937;v:0.98,d:1.8,j:0.87, where the
    # newick tree has branch lengths corresponding to the whole sequence (i.e. the weighted mean of v, d, and j)
    for region_ratio in treeinfo_line.split(';')[1].split(','):
        pieces = region_ratio.split(':')
        region = pieces[0]
        assert region in utils.regions
        scale = float(pieces[1])
        if self.args.mutation_multiplier is not None:  # multiply the branch lengths by some factor
            scale = scale * self.args.mutation_multiplier
        branch_length_ratios[region] = scale

    if self.args.debug:  # NOTE should be the same for t[0-9]... but I guess I should check at some point
        # kind of hackey to just look at t1, but they're all the same anyway and it's just for printing purposes...
        total_depth = treegenerator.get_leaf_node_depths(chosen_tree)['t1']
        print(' using tree with total depth %f' % total_depth)
        if len(re.findall('t', chosen_tree)) <= 1:
            print(' one leaf')
        else:  # more than one leaf
            Phylo.draw_ascii(Phylo.read(StringIO(chosen_tree), 'newick'))
        ratio_strs = ['%s %f' % (region, branch_length_ratios[region]) for region in utils.regions]
        # the extra ' ' is the separator that a two-item python 2 print statement writes
        print(' with branch length ratios ' + ' ' + ', '.join(ratio_strs))

    scaled_trees = self.get_rescaled_trees(chosen_tree, branch_length_ratios)
    n_leaf_nodes = len(re.findall('t[0-9][0-9]*', chosen_tree))

    cmdfos = []
    for region in utils.regions:
        eroded_seq = reco_event.eroded_seqs[region]
        # the insertions get simulated along with d
        simstr = (reco_event.insertions['vd'] + eroded_seq + reco_event.insertions['dj']) if region == 'd' else eroded_seq
        cmdfos.append(self.prepare_bppseqgen(simstr, scaled_trees[region], n_leaf_nodes,
                                             reco_event.genes[region], reco_event, seed=irandom))

    # shenanigan is to handle zero-length regional seqs (for which there's no command to run)
    utils.run_cmds([cmdfo for cmdfo in cmdfos if cmdfo is not None], sleep=False)

    mseqs = {}
    for region, cmdfo in zip(utils.regions, cmdfos):
        # an empty string for each leaf node if there was nothing to simulate
        mseqs[region] = [''] * n_leaf_nodes if cmdfo is None else self.read_bppseqgen_output(cmdfo, n_leaf_nodes)

    assert len(reco_event.final_seqs) == 0
    for ileaf in range(n_leaf_nodes):
        full_seq = mseqs['v'][ileaf] + mseqs['d'][ileaf] + mseqs['j'][ileaf]
        # if mutation screwed up the conserved codons, just switch 'em back to what they were to start with
        full_seq = reco_event.revert_conserved_codons(full_seq)
        reco_event.final_seqs.append(full_seq)  # set final sequence in reco_event

    self.add_shm_indels(reco_event)