def parse_ramesh_seqs(glseqs, outdir, debug=False):
    """Turn the raw sequences in <glseqs> into cleaned-up germline info in <outdir>.

    <glseqs> is indexed as glseqs[locus][region][gene] = sequence. For each locus this:
      - removes any existing glfo files for that locus from <outdir>
      - writes the raw sequences as one fasta file per region
      - reads them back in with glutils.read_glfo(), using the macaque germline set as a template
      - passes every v gene through trim_and_remove_genes()
      - removes every gene whose sequence has a nonzero ambiguous-base fraction
      - writes the resulting glfo back to <outdir>
    If <debug> is set, progress is printed along the way.
    """
    for locus in glseqs:
        glutils.remove_glfo_files(outdir, locus)

        # write to a glfo dir without extra info
        for region in glseqs[locus]:
            fasta_fname = glutils.get_fname(outdir, locus, region)
            fasta_dir = os.path.dirname(fasta_fname)
            if not os.path.exists(fasta_dir):
                os.makedirs(fasta_dir)
            with open(fasta_fname, 'w') as fasta_file:
                for gene, seq in glseqs[locus][region].items():
                    fasta_file.write('>%s\n%s\n' % (gene, seq))

        # figure out extra info
        template_glfo = glutils.read_glfo('data/germlines/macaque', locus)
        glfo = glutils.read_glfo(outdir, locus, template_glfo=template_glfo, remove_bad_genes=True, debug=True)

        # trim non-coding stuff upstream of v (and remove non-full-length ones)
        # NOTE the groups are built *before* any trimming/removal, so they hold the sequences as originally read
        gene_groups = {}
        for region in ['v']:
            region_seqs = glfo['seqs'][region]
            families = sorted(set(utils.gene_family(g) for g in region_seqs))
            grouped = []
            for family in families:
                members = {g: region_seqs[g] for g in region_seqs if utils.gene_family(g) == family}
                grouped.append((family, members))
            gene_groups[region] = grouped

        for region in utils.regions:
            if region not in gene_groups:
                continue
            if debug:
                print('%s' % utils.color('reverse_video', utils.color('green', region)))
            for family, members in gene_groups[region]:  # ok, this isn't really doing anything any more
                if debug:
                    print(' %s' % utils.color('blue', family))
                for gene, seq in members.items():
                    trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=debug)

        # remove any seqs with ambiguous bases
        for region in utils.regions:
            if region not in glfo['seqs']:
                continue
            # iterate over a copy, since genes get removed from glfo inside the loop
            for gene, seq in list(glfo['seqs'][region].items()):
                ambig_frac = utils.ambig_frac(seq)
                if ambig_frac > 0.:
                    if debug:
                        print(' %d ambiguous bases: %s' % (len(seq) * ambig_frac, utils.color_gene(gene)))
                    glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)
def run_partis_parameter_cache(args, method):
    """Build and run the partis 'cache-parameters' command for <method>.

    <method> must be either 'partis' or 'full'; anything else raises AssertionError.
    Returns immediately, without running anything, if utils.output_exists() says the
    output for <method> is already there. Otherwise any old sw-cache-*.csv files (and
    their matching '-glfo' dirs) in the parameter dir are removed first, and the
    assembled command is handed to utils.simplerun() (honoring args.dryrun).
    """
    if utils.output_exists(args, get_outfname(args, method)):
        return

    paramdir = args.outdir + '/' + method
    plotdir = paramdir + '/plots'

    # remove any old sw cache files
    for cachefname in glob.glob(paramdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, args.locus)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = args.partis_path + ' cache-parameters --infname ' + args.simfname + ' --only-smith-waterman'

    # Pass --initial-germline-dir exactly once: the default (full) germline dir if
    # args.gls_gen is set, otherwise the inference germline dir. (Giving the option
    # twice and relying on the last one winning made the command ambiguous to read.)
    if args.gls_gen:
        cmd_str += ' --initial-germline-dir %s' % args.default_germline_dir
    else:
        cmd_str += ' --initial-germline-dir ' + args.inf_glfo_dir  # --dont-remove-unlikely-alleles

    if method == 'partis':
        cmd_str += ' --debug-allele-finding'  # --always-find-new-alleles'
        # alleleclusterer is the only one that really uses this, but for now I want its dbg output to have the sim info
        cmd_str += ' --is-simu --simulation-germline-dir ' + args.outdir + '/germlines/simulation'
        if args.allele_cluster:
            cmd_str += ' --allele-cluster'
            # NOTE(review): assumed to apply only together with --allele-cluster -- confirm the intended nesting
            if args.kmeans_allele_cluster:
                cmd_str += ' --kmeans-allele-cluster'
    elif method == 'full':
        cmd_str += ' --leave-default-germline'
    else:
        # explicit raise (rather than a bare assert) so this check survives python -O
        raise AssertionError('unexpected method \'%s\' (expected \'partis\' or \'full\')' % method)

    if args.species != 'human':
        cmd_str += ' --species %s' % args.species

    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
        cmd_str += ' --n-max-queries ' + str(args.n_max_queries)
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    cmd_str += ' --parameter-dir ' + paramdir
    cmd_str += ' --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    if args.plot_and_fit_absolutely_everything is not None:
        cmd_str += ' --plot-and-fit-absolutely-everything ' + str(args.plot_and_fit_absolutely_everything)

    utils.simplerun(cmd_str, dryrun=args.dryrun)
def write( self, base_outdir ): # NOTE most of the time in here is taken up by mutefrequer.finalize() (if it plot() wasn't called first, that is) print ' writing parameters to %s' % base_outdir, sys.stdout.flush() start = time.time() if os.path.exists(base_outdir + '/' + glutils.glfo_dir): for tmploc in [ l for l in utils.loci if os.path.exists(base_outdir + '/' + glutils.glfo_dir + '/' + l) ]: glutils.remove_glfo_files(base_outdir + '/' + glutils.glfo_dir, tmploc, print_warning=False) utils.prep_dir( base_outdir, subdirs=('hmms', 'mute-freqs', glutils.glfo_dir), wildlings=('*.csv', '*.yaml', '*.fasta') ) # it's kind of hackey to specify the /hmms dir here, but as soon as we write the parameters below, the previous yamels are out of date, so it's pretty much necessary self.mfreqer.write( base_outdir + '/mute-freqs', mean_freq_outfname=base_outdir + '/REGION-mean-mute-freqs.csv' ) # REGION is replace by each region in the three output files) genes_with_counts = [ g[0] for r in utils.regions for g in self.counts[r + '_gene'].keys() ] glutils.write_glfo(base_outdir + '/' + glutils.glfo_dir, self.glfo, only_genes=genes_with_counts, debug=False) for column in self.counts: index = None outfname = None if column in self.no_write_columns: continue elif column == 'all': index = tuple(list(utils.index_columns) + [ 'cdr3_length', ]) outfname = base_outdir + '/' + utils.get_parameter_fname( column='all') elif '_content' in column or column == 'cluster_size': index = [ column, ] outfname = base_outdir + '/' + column + '.csv' else: index = [ column, ] + utils.column_dependencies[column] outfname = base_outdir + '/' + utils.get_parameter_fname( column_and_deps=index) if os.path.isfile(outfname): os.remove(outfname) elif not os.path.exists(base_outdir): os.makedirs(base_outdir) with open(outfname, 'w') as outfile: out_fieldnames = list(index) out_fieldnames.append('count') out_data = csv.DictWriter(outfile, out_fieldnames) out_data.writeheader() # NOTE this will in general not be 
sorted for key, count in self.counts[column].iteritems(): line = {} for ic in range(len(key)): line[index[ic]] = key[ic] line['count'] = count out_data.writerow(line) print '(%.1f sec)' % (time.time() - start)
def run_test(args):
    """Simulate a sample (unless args.nosim is set), then run partis cache-parameters with allele finding on it.

    Simulation either lets partis generate a germline set (args.gen_gset), or uses
    the genes in args.sim_v_genes + args.dj_genes, optionally with snps added
    (args.snp_positions) and with per-allele prevalence frequencies
    (args.allele_prevalence_freqs). Inference starts from the default germline dir
    if args.gen_gset is set, otherwise from args.inf_v_genes + args.dj_genes.

    Raises Exception if args.allele_prevalence_freqs doesn't have one entry per v
    gene in the simulation germline set.
    """
    if args.seed is not None:  # the seed is optional everywhere below, so don't crash on None here ('%d' % None raises TypeError)
        print('seed %d' % args.seed)

    label = 'test'  # get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves)
        cmd_str += ' --constant-number-of-leaves --rearrange-from-scratch --outfname ' + simfname
        if args.mut_mult is not None:  # don't pass the literal string 'None' on to partis
            cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            sim_gldir = args.outdir + '/germlines/simulation'
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                # one entry per simulation v gene, paired by position with --snp-positions
                snps_to_add = [{'gene': gene, 'positions': args.snp_positions[ig]} for ig, gene in enumerate(args.sim_v_genes)]
                added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):
                    raise Exception('--allele-prevalence-freqs not the right length')
                if added_snp_names is None:
                    gene_list = sorted(sglfo['seqs']['v'])
                else:
                    gene_list = list(set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {'v': dict(zip(gene_list, args.allele_prevalence_freqs)), 'd': {}, 'j': {}}
                prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'
                glutils.write_allele_prevalence_freqs(prevalence_freqs, prevalence_fname)
                cmd_str += ' --allele-prevalence-fname ' + prevalence_fname

            glutils.write_glfo(sim_gldir, sglfo)
            cmd_str += ' --initial-germline-dir ' + sim_gldir

        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    for cachefname in glob.glob(outpdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, chain)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2'  # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.slurm:
        cmd_str += ' --batch-system slurm'

    if args.gen_gset:
        cmd_str += ' --find-new-alleles'
    else:
        inf_gldir = args.outdir + '/germlines/inference'
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human', chain=chain, only_genes=inference_genes.split(':'), debug=True)
        glutils.write_glfo(inf_gldir, iglfo)
        cmd_str += ' --initial-germline-dir ' + inf_gldir
        cmd_str += ' --find-new-alleles --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)
def run_test(args):
    """Simulate a sample (unless args.nosim is set), then run partis cache-parameters with allele finding on it.

    Simulation either lets partis generate a germline set (args.gen_gset), or uses
    the genes in args.sim_v_genes + args.dj_genes, optionally with snps added
    (args.snp_positions) and with per-allele prevalence frequencies
    (args.allele_prevalence_freqs). Inference starts from the default (full)
    germline dir if args.gen_gset is set, otherwise from args.inf_v_genes + args.dj_genes.

    Raises Exception if args.allele_prevalence_freqs doesn't have one entry per v
    gene in the simulation germline set.
    """
    if args.seed is not None:  # the seed is optional everywhere below, so don't crash on None here ('%d' % None raises TypeError)
        print('seed %d' % args.seed)

    label = 'test'  # get_label(existing_genes, new_allele)
    simfname = args.outdir + '/simu-' + label + '.csv'
    outpdir = args.outdir + '/simu-' + label
    plotdir = args.outdir + '/simu-' + label + '-plots'

    # simulate
    if not args.nosim:
        cmd_str = base_cmd + ' simulate --n-sim-events ' + str(args.n_sim_events) + ' --n-leaves ' + str(args.n_leaves)
        cmd_str += ' --rearrange-from-scratch --outfname ' + simfname
        if args.n_leaf_distribution is None:
            cmd_str += ' --constant-number-of-leaves'
        else:
            cmd_str += ' --n-leaf-distribution ' + args.n_leaf_distribution
        if args.mut_mult is not None:
            cmd_str += ' --mutation-multiplier ' + str(args.mut_mult)
        cmd_str += ' --n-procs ' + str(args.n_procs)
        if args.slurm:
            cmd_str += ' --batch-system slurm --subsimproc'

        # figure what genes we're using
        if args.gen_gset:
            cmd_str += ' --generate-germline-set'
            cmd_str += ' --n-genes-per-region 1:5:3'
            cmd_str += ' --n-alleles-per-gene 2,3:1,2:1,2'
        else:
            sim_gldir = args.outdir + '/germlines/simulation'
            simulation_genes = ':'.join(args.sim_v_genes + args.dj_genes)
            sglfo = glutils.read_glfo('data/germlines/human', locus=locus, only_genes=simulation_genes.split(':'))

            added_snp_names = None
            if args.snp_positions is not None:
                # one entry per simulation v gene, paired by position with --snp-positions
                snps_to_add = [{'gene': gene, 'positions': args.snp_positions[ig]} for ig, gene in enumerate(args.sim_v_genes)]
                added_snp_names = glutils.add_some_snps(snps_to_add, sglfo, debug=True, remove_template_genes=args.remove_template_genes)

            if args.allele_prevalence_freqs is not None:
                if len(args.allele_prevalence_freqs) != len(sglfo['seqs']['v']):  # already checked when parsing args, but, you know...
                    raise Exception('--allele-prevalence-freqs not the right length')
                if added_snp_names is None:
                    gene_list = sorted(sglfo['seqs']['v'])
                else:
                    gene_list = list(set(args.sim_v_genes)) + added_snp_names
                prevalence_freqs = {'v': dict(zip(gene_list, args.allele_prevalence_freqs)), 'd': {}, 'j': {}}
                prevalence_fname = args.workdir + '/allele-prevalence-freqs.csv'
                glutils.write_allele_prevalence_freqs(prevalence_freqs, prevalence_fname)
                cmd_str += ' --allele-prevalence-fname ' + prevalence_fname

            sim_v_genes = sglfo['seqs']['v']
            print(' simulating with %d v: %s' % (len(sim_v_genes), ' '.join([utils.color_gene(g) for g in sim_v_genes])))
            glutils.write_glfo(sim_gldir, sglfo)
            cmd_str += ' --initial-germline-dir ' + sim_gldir

        # run simulation
        if args.seed is not None:
            cmd_str += ' --seed ' + str(args.seed)
        run(cmd_str)

    # remove any old sw cache files
    for cachefname in glob.glob(outpdir + '/sw-cache-*.csv'):
        check_call(['rm', '-v', cachefname])
        sw_cache_gldir = cachefname.replace('.csv', '-glfo')
        if os.path.exists(sw_cache_gldir):  # if stuff fails halfway through, you can get one but not the other
            glutils.remove_glfo_files(sw_cache_gldir, locus)
            # os.rmdir(sw_cache_gldir)

    # generate germline set and cache parameters
    cmd_str = base_cmd + ' cache-parameters --infname ' + simfname + ' --only-smith-waterman --debug-allele-finding --always-find-new-alleles --n-max-allele-finding-iterations 2'  # --dont-collapse-clones'
    # cmd_str = 'python -m cProfile -s tottime -o prof.out ' + cmd_str
    cmd_str += ' --n-procs ' + str(args.n_procs)
    if args.n_max_queries is not None:
        # NOTE do *not* use --n-random-queries, since it'll change the cluster size distribution
        cmd_str += ' --n-max-queries ' + str(args.n_max_queries)
    if args.slurm:
        cmd_str += ' --batch-system slurm'
    cmd_str += ' --find-new-alleles'

    if not args.gen_gset:  # with args.gen_gset, inference just uses the default (full) germline dir
        inf_gldir = args.outdir + '/germlines/inference'
        cmd_str += ' --dont-remove-unlikely-alleles'  # --new-allele-fname ' + args.outdir + '/new-alleles.fa'
        inference_genes = ':'.join(args.inf_v_genes + args.dj_genes)
        iglfo = glutils.read_glfo('data/germlines/human', locus=locus, only_genes=inference_genes.split(':'))
        inf_v_genes = iglfo['seqs']['v']
        print(' starting inference with %d v: %s' % (len(inf_v_genes), ' '.join([utils.color_gene(g) for g in inf_v_genes])))
        glutils.write_glfo(inf_gldir, iglfo)
        cmd_str += ' --initial-germline-dir ' + inf_gldir
        # cmd_str += ' --n-max-snps 12'

    cmd_str += ' --parameter-dir ' + outpdir
    cmd_str += ' --only-overall-plots --plotdir ' + plotdir
    if args.seed is not None:
        cmd_str += ' --seed ' + str(args.seed)
    run(cmd_str)