def make_gls_tree_plot(args, region, plotdir, plotname, glsfnames, glslabels, locus, ref_label=None, title=None, title_color=None, legends=None, legend_title=None, pie_chart_faces=False, param_dirs=None):
    """Build the ./bin/plot-gl-set-trees.py command line and pass it to utils.run_ete_script().

    NOTE currently disabled: the first statement raises, so nothing after it runs until that guard is removed.
    """
    raise Exception("needs to be tested for switch to utils.run_ete_script() (should work ok, I just don't want to run it now)")

    # options that are always present
    pieces = ['./bin/plot-gl-set-trees.py']
    pieces.append(' --plotdir ' + plotdir)
    pieces.append(' --plotname ' + plotname)
    pieces.append(' --glsfnames ' + ':'.join(glsfnames))
    pieces.append(' --glslabels ' + ':'.join(glslabels))
    pieces.append(' --region ' + region)

    # options that are only added if the caller set them
    if ref_label is not None:
        pieces.append(' --ref-label ' + ref_label)
    if title is not None:
        pieces.append(' --title="%s"' % title)
    if title_color is not None:
        pieces.append(' --title-color %s' % title_color)
    if legends is not None:
        quoted_legends = ['"%s"' % lgd for lgd in legends]
        pieces.append(' --legends=' + ':'.join(quoted_legends))
    if legend_title is not None:
        pieces.append(' --legend-title="%s"' % legend_title)
    if pie_chart_faces:
        pieces.append(' --pie-chart-faces')
    if param_dirs is not None:
        pieces.append(' --param-dirs %s' % ':'.join(param_dirs))

    pieces.append(' --locus ' + locus)

    # flags forwarded from the command line args
    if args.plotcache:
        pieces.append(' --use-cache')
    if args.only_print:
        pieces.append(' --only-print')

    utils.run_ete_script(''.join(pieces), args.ete_path, debug=args.dryrun, dryrun=args.dryrun, extra_str=' ')
def run_bcr_phylo(naive_line, outdir, ievent, n_total_events, uid_str_len=None):
    """Assemble the bcr-phylo simulator.py command for one event, then either run it or return it.

    Returns None if utils.output_exists() says the output fasta is already there, or if the command was
    run directly (args.n_procs == 1). Otherwise returns a dict with keys 'cmd_str', 'workdir' and
    'outfname', using the command string handed back by utils.run_ete_script(..., return_for_cmdfos=True).
    NOTE <n_total_events> is not used in this function.
    """
    if utils.output_exists(args, bcr_phylo_fasta_fname(outdir), outlabel='bcr-phylo', offset=4):
        return None

    # collect the command line fragments in order, and join them once at the end
    opts = ['%s/bin/simulator.py' % bcr_phylo_path]
    add = opts.append

    if args.run_help:
        add(' --help')
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        add(' --lambda %f --lambda0 %f' % (1.5, 0.365))
        add(' --n_final_seqs %d' % args.n_sim_seqs_per_generation)
    elif args.stype == 'selection':
        add(' --selection')
        add(' --lambda %f' % args.branching_parameter)
        add(' --lambda0 %f' % args.base_mutation_rate)
        add(' --selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength))
        obs_time_strs = ['%d' % get_vpar_val('obs-times', otime) for otime in args.obs_times]
        add(' --obs_times %s' % ' '.join(obs_time_strs))
        n_sample_strs = ['%d' % get_vpar_val('n-sim-seqs-per-generation', nseqs) for nseqs in args.n_sim_seqs_per_generation]
        add(' --n_to_sample %s' % ' '.join(n_sample_strs))
        add(' --metric_for_target_dist %s' % args.metric_for_target_distance)
        if args.paratope_positions is not None:
            add(' --paratope_positions %s' % args.paratope_positions)
        add(' --target_dist %d' % args.target_distance)
        add(' --target_count %d' % args.target_count)
        add(' --carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap))
        if not args.dont_observe_common_ancestors:
            add(' --observe_common_ancestors')
        if args.leaf_sampling_scheme is not None:
            add(' --leaf_sampling_scheme %s' % args.leaf_sampling_scheme)
        if args.n_target_clusters is not None:
            add(' --n_target_clusters %d' % args.n_target_clusters)
        # add(' --target_cluster_distance 1')
        if args.min_target_distance is not None:
            add(' --min_target_distance %d' % args.min_target_distance)
    else:
        assert False

    # options that don't depend on the simulation type
    add(' --debug %d' % args.debug)
    add(' --n_tries 1000')
    if args.context_depend == 0:
        add(' --no_context')
    add(' --no_plot')
    if args.only_csv_plots:
        add(' --dont_write_hists')
    add(' --outbase %s/%s' % (outdir, args.extrastr))
    add(' --random_seed %d' % (args.seed + ievent))  # each event gets its own seed
    if uid_str_len is not None:
        add(' --uid_str_len %d' % uid_str_len)
    add(' --naive_seq %s' % naive_line['naive_seq'])
    cmd = ''.join(opts)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    if args.n_procs == 1:
        utils.run_ete_script(cmd, ete_path)  # NOTE kind of hard to add a --dry-run option, since we have to loop over the events we made in rearrange()
        return None

    # with more than one process we don't run here, we just return the info needed to run the command
    cmd, _ = utils.run_ete_script(cmd, ete_path, return_for_cmdfos=True, tmpdir=outdir)
    return {'cmd_str' : cmd, 'workdir' : outdir, 'outfname' : bcr_phylo_fasta_fname(outdir)}
def run_bcr_phylo(naive_line, outdir, ievent, n_total_events):
    """Assemble the bcr-phylo simulator.py command for one event and run it with utils.run_ete_script().

    Does nothing if utils.output_exists() says the output fasta is already there. Always returns None.
    """
    if utils.output_exists(args, bcr_phylo_fasta_fname(outdir), outlabel='bcr-phylo', offset=4):
        return

    # collect the command line fragments in order, and join them once at the end
    fragments = ['%s/bin/simulator.py' % bcr_phylo_path]

    if args.run_help:
        fragments.append(' --help')
    elif args.stype == 'neutral':
        assert False  # needs updating (well, maybe not, but I'm not thinking about it when I move the selection parameters to command line args)
        fragments.append(' --lambda %f --lambda0 %f' % (1.5, 0.365))
        fragments.append(' --n_final_seqs %d' % args.n_sim_seqs_per_generation)
    elif args.stype == 'selection':
        fragments.append(' --selection')
        fragments.append(' --lambda %f' % args.branching_parameter)
        fragments.append(' --lambda0 %f' % args.base_mutation_rate)
        fragments.append(' --selection_strength %f' % get_vpar_val('selection-strength', args.selection_strength))
        obs_time_strs = ['%d' % get_vpar_val('obs-times', otime) for otime in args.obs_times]
        fragments.append(' --obs_times %s' % ' '.join(obs_time_strs))
        n_sample_strs = ['%d' % get_vpar_val('n-sim-seqs-per-generation', nseqs) for nseqs in args.n_sim_seqs_per_generation]
        fragments.append(' --n_to_sample %s' % ' '.join(n_sample_strs))
        fragments.append(' --metric_for_target_dist %s' % args.metric_for_target_distance)
        fragments.append(' --target_dist %d' % args.target_distance)
        fragments.append(' --target_count %d' % args.target_count)
        fragments.append(' --carry_cap %d' % get_vpar_val('carry-cap', args.carry_cap))
        if not args.dont_observe_common_ancestors:
            fragments.append(' --observe_common_ancestors')
        # fragments.append(' --n_target_clusters 1')
        # fragments.append(' --target_cluster_distance 1')
        # fragments.append(' --observe_based_on_affinity')  # implementation in bcr-phylo needs some work
    else:
        assert False

    # options that don't depend on the simulation type
    fragments.append(' --debug %d' % args.debug)
    fragments.append(' --n_tries 30')
    fragments.append(' --no_context')
    fragments.append(' --no_plot')
    fragments.append(' --outbase %s/%s' % (outdir, args.extrastr))
    fragments.append(' --random_seed %d' % (args.seed + ievent))  # each event gets its own seed
    if n_total_events > 1:  # if the final sample's going to contain many trees, it's worth making the uids longer so there's fewer collisions/duplicates
        fragments.append(' --uid_str_len 7')
    fragments.append(' --naive_seq %s' % naive_line['naive_seq'])
    cmd = ''.join(fragments)

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    utils.run_ete_script(cmd, ete_path)  # NOTE kind of hard to add a --dry-run option, since we have to loop over the events we made in rearrange()
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Convert the bcr-phylo output files in <outdir> into one multi-sequence annotation, and return it.

    Each sequence in the bcr-phylo fasta gets its own copy of <naive_line> (with the seqs swapped in), and
    these are then merged with utils.synthesize_multi_seq_line_from_reco_info(). For args.stype 'selection',
    affinity info from the kd file and the (rescaled) newick tree are also added to the returned line.
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    # make a single-sequence line for each mutated sequence
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        single_line = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(single_line)
        del single_line['tree']
        single_line['unique_ids'] = [sfo['name']]
        # it's really important to set both the seqs (since they're both already in there from the naive line)
        single_line['seqs'] = [sfo['seq']]
        single_line['input_seqs'] = [sfo['seq']]
        single_line['duplicates'] = [[]]
        reco_info[sfo['name']] = single_line
        utils.add_implicit_info(glfo, single_line)

    all_names = [sfo['name'] for sfo in seqfos]
    final_line = utils.synthesize_multi_seq_line_from_reco_info(all_names, reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s/kd-vals.csv --newick-tree-file %s/simu.nwk' % (outdir, args.extrastr, outdir, outdir)
        utils.run_ete_script(cmd, ete_path)

        with open('%s/kd-vals.csv' % outdir) as kdfile:
            nodefo = {
                row['uid'] : {
                    'kd' : float(row['kd']),
                    'relative_kd' : float(row['relative_kd']),
                    'lambda' : row.get('lambda', None),
                    'target_index' : int(row['target_index']),
                }
                for row in csv.DictReader(kdfile)
            }

        uids = final_line['unique_ids']
        only_in_kdfile = set(nodefo) - set(uids)
        if only_in_kdfile:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % only_in_kdfile)
        only_in_line = set(uids) - set(nodefo)
        if only_in_line:
            print(' in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        final_line['affinities'] = [1. / nodefo[uid]['kd'] for uid in uids]
        final_line['relative_affinities'] = [1. / nodefo[uid]['relative_kd'] for uid in uids]
        final_line['lambdas'] = [nodefo[uid]['lambda'] for uid in uids]
        final_line['nearest_target_indices'] = [nodefo[uid]['target_index'] for uid in uids]

        tree = treeutils.get_dendro_tree(treefname='%s/simu.nwk' % outdir)
        mean_seq_len = numpy.mean([len(seq) for seq in final_line['seqs']])
        tree.scale_edges(1. / mean_seq_len)  # divide each edge length by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
def parse_bcr_phylo_output(glfo, naive_line, outdir, ievent):
    """Convert the bcr-phylo output files in <outdir> into one multi-sequence annotation, and return it.

    Each sequence in the bcr-phylo fasta gets its own copy of <naive_line> (with the seqs swapped in), and
    these are then merged with utils.synthesize_multi_seq_line_from_reco_info(). For args.stype 'selection',
    the kd/newick files are made with ./bin/read-bcr-phylo-trees.py (unless utils.output_exists() says the
    kd file is already there), and affinity info and the (rescaled) tree are added to the returned line.
    <ievent> is used in the failure message and passed as <irandom> when setting the reco id.
    """
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo

    assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
    assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
    if args.debug:
        utils.print_reco_event(naive_line)

    # make a single-sequence line for each mutated sequence
    reco_info = collections.OrderedDict()
    for sfo in seqfos:
        mline = copy.deepcopy(naive_line)
        utils.remove_all_implicit_info(mline)
        del mline['tree']
        mline['unique_ids'] = [sfo['name']]
        # it's really important to set both the seqs (since they're both already in there from the naive line)
        mline['seqs'] = [sfo['seq']]
        mline['input_seqs'] = [sfo['seq']]
        mline['duplicates'] = [[]]
        reco_info[sfo['name']] = mline
        try:
            utils.add_implicit_info(glfo, mline)
        except Exception:  # deliberately not a bare except, which would also swallow KeyboardInterrupt and SystemExit
            # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
            print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
            lines = traceback.format_exception(*sys.exc_info())
            print(utils.pad_lines(''.join(lines)))  # NOTE this will still crash on the next line if implicit info adding failed

    final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in seqfos], reco_info)
    if args.debug:
        utils.print_reco_event(final_line)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    if args.stype == 'selection':
        kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
        if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd an nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
            cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
            utils.run_ete_script(cmd, ete_path, debug=args.n_procs==1)

        # read per-node info from the kd file, keyed by uid
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }

        uids = final_line['unique_ids']
        only_in_kdfile = set(nodefo) - set(uids)
        if len(only_in_kdfile) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % only_in_kdfile)
        only_in_line = set(uids) - set(nodefo)
        if len(only_in_line) > 0:  # NOTE we only warn here, the lookups in <nodefo> just below will then raise KeyError for these uids
            print(' in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in uids]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in uids]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in uids]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in uids]

        tree = treeutils.get_dendro_tree(treefname=nwkfname)
        tree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # divide each edge length by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=tree), padwidth=12))
        final_line['tree'] = tree.as_string(schema='newick')

    tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
    tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

    # get target sequences
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))
    final_line['target_seqs'] = [tfo['seq'] for tfo in target_seqfos]

    return final_line
def parse_bcr_phylo_output(glfos, naive_events, outdir, ievent):
    """Convert the bcr-phylo output files in <outdir> into annotation(s) for event <ievent>, and return them.

    The kd/newick files are made with ./bin/read-bcr-phylo-trees.py unless utils.output_exists() says the
    kd file is already there. If args.paired_loci is set, naive_events[ievent] is treated as a (heavy, light)
    pair of naive lines, each simulated and target sequence is split into its heavy and light parts, and a
    list with one annotation per locus is returned. Otherwise a single annotation is returned.
    Only args.stype 'selection' is handled (this is asserted).
    """
    # ----------------------------------------------------------------------------------------
    def split_seqfos(seqfos):
        """Split each joint sequence in <seqfos> into heavy and light parts, returning (heavy seqfos, light seqfos)."""
        hline, lline = naive_events[ievent]
        padseq = utils.pad_nuc_seq(hline['naive_seq'])  # same for every sequence, so only work it out once
        hseqfos, lseqfos = [], []
        for sfo in seqfos:
            assert len(sfo['seq']) == len(padseq) + len(lline['naive_seq'])
            # heavy part is cut at the unpadded heavy naive length, light part starts after the padded heavy length
            hseqfos.append({'name' : sfo['name'], 'seq' : sfo['seq'][:len(hline['naive_seq'])]})
            lseqfos.append({'name' : sfo['name'], 'seq' : sfo['seq'][len(padseq):]})
        return hseqfos, lseqfos

    # ----------------------------------------------------------------------------------------
    def read_kdvals(kdfname):
        """Read the kd csv file <kdfname>, returning a dict of per-node info keyed by uid."""
        nodefo = {}
        with open(kdfname) as kdfile:
            reader = csv.DictReader(kdfile)
            for line in reader:
                nodefo[line['uid']] = {
                    'kd' : float(line['kd']),
                    'relative_kd' : float(line['relative_kd']),
                    'lambda' : line.get('lambda', None),
                    'target_index' : int(line['target_index']),
                }
        return nodefo

    # ----------------------------------------------------------------------------------------
    def get_mature_line(sfos, naive_line, glfo, nodefo, dtree, target_sfos, locus=None):
        """Return a multi-sequence annotation for the sequences in <sfos>, built from copies of <naive_line>.

        If <locus> is set, '-<locus>' is appended to each uid (in the line, in a copy of <nodefo>, and in
        the copy of <dtree>) and the paired uids are updated to match. <dtree> itself is not modified.
        """
        assert len(naive_line['unique_ids']) == 1  # enforces that we ran naive-only, 1-leaf partis simulation above
        assert not indelutils.has_indels(naive_line['indelfos'][0])  # would have to handle this below
        if args.debug:
            utils.print_reco_event(naive_line)

        # make a single-sequence line for each mutated sequence
        reco_info = collections.OrderedDict()
        for sfo in sfos:
            mline = utils.get_non_implicit_copy(naive_line)
            del mline['tree']
            mline['unique_ids'] = [sfo['name']]
            # it's really important to set both the seqs (since they're both already in there from the naive line)
            mline['seqs'] = [sfo['seq']]
            mline['input_seqs'] = [sfo['seq']]
            mline['duplicates'] = [[]]
            reco_info[sfo['name']] = mline
            try:
                utils.add_implicit_info(glfo, mline)
            except Exception:  # deliberately not a bare except, which would also swallow KeyboardInterrupt and SystemExit
                # TODO not sure if I really want to leave this in long term, but it shouldn't hurt anything (it's crashing on unequal naive/mature sequence lengths, and I need this to track down which event it is) UPDATE: yeah it was just because something crashed in the middle of writing a .fa file
                print('implicit info adding failed for ievent %d in %s' % (ievent, outdir))
                lines = traceback.format_exception(*sys.exc_info())
                print(utils.pad_lines(''.join(lines)))  # NOTE this will still crash on the next line if implicit info adding failed

        final_line = utils.synthesize_multi_seq_line_from_reco_info([sfo['name'] for sfo in sfos], reco_info)

        ftree = copy.deepcopy(dtree)  # the same <dtree> is passed in for each locus, so only ever modify a copy
        if locus is not None:
            def ltr(u):
                return u + '-' + locus
            new_nodefo = {}
            for u_old in nodefo:
                new_nodefo[ltr(u_old)] = nodefo[u_old]
            nodefo = new_nodefo  # only rebinds the local name, the caller's dict is untouched
            treeutils.translate_labels(ftree, [(u, ltr(u)) for u in final_line['unique_ids']])
            final_line['unique_ids'] = [ltr(u) for u in final_line['unique_ids']]
            assert len(sfos) == len(final_line['unique_ids'])
            for iseq, sfo in enumerate(sfos):
                naive_id = naive_line['unique_ids'][0]
                assert naive_id.count('-') == 1
                bstr = naive_id.replace('-' + locus, '')
                pids = final_line['paired-uids'][iseq]
                assert len(pids) == 1 and pids[0].find(bstr) == 0 and pids[0].count('-') == 1 and pids[0].split('-')[1] in utils.loci  # if uid is xxx-igh, paired id shoud be e.g. xxx-igk
                final_line['paired-uids'][iseq] = [p.replace(bstr, sfo['name']) for p in pids]

        if args.debug:
            utils.print_reco_event(final_line)

        # add the kd info that was read from the kd file
        uids = final_line['unique_ids']
        only_in_kdfile = set(nodefo) - set(uids)
        if len(only_in_kdfile) > 0:  # uids in the kd file but not the <line> (i.e. not in the newick/fasta files) are probably just bcr-phylo discarding internal nodes
            print(' in kd file, but missing from final_line (probably just internal nodes that bcr-phylo wrote to the tree without names): %s' % only_in_kdfile)
        only_in_line = set(uids) - set(nodefo)
        if len(only_in_line) > 0:  # NOTE we only warn here, the lookups in <nodefo> just below will then raise KeyError for these uids
            print(' in final_line, but missing from kdvals: %s' % ' '.join(only_in_line))

        final_line['affinities'] = [1. / nodefo[u]['kd'] for u in uids]
        final_line['relative_affinities'] = [1. / nodefo[u]['relative_kd'] for u in uids]
        final_line['lambdas'] = [nodefo[u]['lambda'] for u in uids]
        final_line['nearest_target_indices'] = [nodefo[u]['target_index'] for u in uids]

        ftree.scale_edges(1. / numpy.mean([len(s) for s in final_line['seqs']]))  # divide each edge length by the mean sequence length
        if args.debug:
            print(utils.pad_lines(treeutils.get_ascii_tree(dendro_tree=ftree), padwidth=12))
        final_line['tree'] = ftree.as_string(schema='newick')

        tmp_event = RecombinationEvent(glfo)  # I don't want to move the function out of event.py right now
        tmp_event.set_reco_id(final_line, irandom=ievent)  # not sure that setting <irandom> here actually does anything

        final_line['target_seqs'] = [tfo['seq'] for tfo in target_sfos]

        return final_line

    # ----------------------------------------------------------------------------------------
    assert args.stype == 'selection'  # i don't know that non-'selection' is possible or has any point at this point (can just set selection strength to zero)

    # extract kd values from pickle file (use a separate script since it requires ete/anaconda to read)
    kdfname, nwkfname = '%s/kd-vals.csv' % outdir, '%s/simu.nwk' % outdir
    if not utils.output_exists(args, kdfname, outlabel='kd/nwk conversion', offset=4):  # eh, don't really need to check for both kd and nwk file, chances of only one being missing are really small, and it'll just crash when it looks for it a couple lines later
        cmd = './bin/read-bcr-phylo-trees.py --pickle-tree-file %s/%s_lineage_tree.p --kdfile %s --newick-tree-file %s' % (outdir, args.extrastr, kdfname, nwkfname)
        utils.run_ete_script(cmd, ete_path, debug=args.n_procs == 1)
    nodefo = read_kdvals(kdfname)
    dtree = treeutils.get_dendro_tree(treefname=nwkfname)
    seqfos = utils.read_fastx(bcr_phylo_fasta_fname(outdir))  # output mutated sequences from bcr-phylo
    target_seqfos = utils.read_fastx('%s/%s_targets.fa' % (outdir, args.extrastr))

    if args.paired_loci:
        mevents = []
        for tline, sfos, tsfos in zip(naive_events[ievent], split_seqfos(seqfos), split_seqfos(target_seqfos)):
            # pass this locus's part of the target seqs (<tsfos>), so 'target_seqs' covers the same region as this line's 'seqs'
            mevents.append(get_mature_line(sfos, tline, glfos[tline['loci'][0]], nodefo, dtree, tsfos, locus=tline['loci'][0]))
        return mevents
    else:
        return get_mature_line(seqfos, naive_events[ievent], glfos[0], nodefo, dtree, target_seqfos)