def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the resulting germline set so that <outfname> exists.

    Reads run configuration from the module-level <args>. Returns immediately if
    utils.output_exists() says <outfname> is already there.

    infname:  input fasta of reads (optionally subsampled, see --n-random-queries handling below)
    outfname: final germline fasta; must equal glutils.get_fname(<parent gl dir>, locus, region)
    outdir:   igdiscover working dir (db/, the .yaml config, run.sh and work/ all go in here)
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        # run on a random subset of the input, cached in <outdir> so it can be reused on a rerun
        suffix = utils.getsuffix(infname)
        sub_infname = outdir + '/' + os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        if os.path.exists(sub_infname):
            print(' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print(' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    # build the shell script that actually runs igdiscover
    cmds = getpathcmd()
    cmds += ['conda activate %s' % args.env_label]
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    # convert igdiscover's output fasta to a glfo, using --glfo-dir as the template (it used to fall back to a human default when --glfo-dir was unset)
    template_gldir = args.glfo_dir
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Strip the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() would be wrong here: it removes any trailing run of the *characters* in its argument, not the suffix, so it could eat into the parent dir name.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
def get_gls_fname(outdir, method, locus, sim_truth=False, data=False, annotation_performance_plots=False):  # NOTE duplicates/depends on code in test-germline-inference.py
    """Return the germline set file name (or the annotation performance plot dir) for <method> under <outdir>.

    If <annotation_performance_plots> is set, the plot dir is returned and all other flags are ignored.
    Otherwise the germline set dir is chosen from <data>, <sim_truth> and <method> (in that order of
    precedence) and passed to glutils.get_fname(). An unrecognized <method> trips an assertion.
    """
    # NOTE(review): <region> is neither a parameter nor a local of this function, so it has to resolve from an enclosing/module scope at call time -- confirm that it's defined there
    is_partis = method in ('partis', 'full')

    if annotation_performance_plots:
        return '/'.join([outdir, method, 'annotation-performance-plots/sw/mutation'])

    if data:
        # NOTE the partis dir is inside the datascripts output dir, and doesn't use <method> (since we only have partis for a method a.t.m., although could use --label or --extra-str to differentiate)
        gls_dir = outdir + ('/hmm/germline-sets' if is_partis else '/' + method)
    elif sim_truth:
        gls_dir = outdir + '/germlines/simulation'
    elif is_partis:
        gls_dir = outdir + '/' + method + '/sw/germline-sets'
    elif 'tigger' in method or method == 'igdiscover':
        gls_dir = outdir + '/' + method
    else:
        assert False

    return glutils.get_fname(gls_dir, locus, region)
def parse_ramesh_seqs(glseqs, outdir, debug=False):
    """Write the sequences in <glseqs> to <outdir> as cleaned-up germline info, one locus at a time.

    glseqs: nested mapping, indexed as glseqs[locus][region][gene] = seq
    outdir: germline dir that gets (re)written
    debug:  print which regions/groups/genes are being handled

    For each locus: existing glfo files are removed, plain fastas are written, the result is read
    back with 'data/germlines/macaque' as the template, v genes are passed through
    trim_and_remove_genes(), genes with any ambiguous bases are removed, and the glfo is written.
    """
    for locus in glseqs:
        glutils.remove_glfo_files(outdir, locus)

        # write to a glfo dir without extra info
        for region in glseqs[locus]:
            fasta_path = glutils.get_fname(outdir, locus, region)
            fasta_dir = os.path.dirname(fasta_path)
            if not os.path.exists(fasta_dir):
                os.makedirs(fasta_dir)
            with open(fasta_path, 'w') as ofile:
                for gene, seq in glseqs[locus][region].items():
                    ofile.write('>%s\n%s\n' % (gene, seq))

        # figure out extra info
        template_glfo = glutils.read_glfo('data/germlines/macaque', locus)
        glfo = glutils.read_glfo(outdir, locus, template_glfo=template_glfo, remove_bad_genes=True, debug=True)

        # trim non-coding stuff upstream of v (and remove non-full-length ones)
        gene_groups = {}
        for region in ['v']:
            region_seqs = glfo['seqs'][region]
            families = sorted(set(utils.gene_family(g) for g in region_seqs))
            grouped = []
            for family in families:
                members = {g: region_seqs[g] for g in region_seqs if utils.gene_family(g) == family}
                grouped.append((family, members))
            gene_groups[region] = grouped

        grouped_regions = [r for r in utils.regions if r in gene_groups]
        for region in grouped_regions:
            if debug:
                print('%s' % utils.color('reverse_video', utils.color('green', region)))
            for group_label, group_seqs in gene_groups[region]:  # ok, this isn't really doing anything any more
                if debug:
                    print(' %s' % utils.color('blue', group_label))
                for gene, seq in group_seqs.items():
                    trim_and_remove_genes(region, gene, seq, glfo, template_glfo, debug=debug)

        # remove any seqs with ambiguous bases
        remaining_regions = [r for r in utils.regions if r in glfo['seqs']]
        for region in remaining_regions:
            for gene, seq in glfo['seqs'][region].items():
                if not utils.ambig_frac(seq) > 0.:
                    continue
                if debug:
                    print(' %d ambiguous bases: %s' % (len(seq) * utils.ambig_frac(seq), utils.color_gene(gene)))
                glutils.remove_gene(glfo, gene)

        # glutils.print_glfo(glfo)

        # write final result
        glutils.write_glfo(outdir, glfo, debug=True)
def get_outfname(args, method, annotation_performance_plots=False, return_parent_gl_dir=False):
    """Return the output path for <method> under args.outdir.

    annotation_performance_plots: return the plot dir made by running partis annotation with
        --plot-annotation-performance (takes precedence over <return_parent_gl_dir>)
    return_parent_gl_dir: return the germline set dir rather than the v fasta inside it
    """
    method_dir = args.outdir + '/' + method

    if annotation_performance_plots:  # product of running partis annotation with --plot-annotation-performance
        return method_dir + '/annotation-performance-plots'

    # default: output is igh/ighv.fasta
    if method in ('partis', 'full'):  # parameter directory, not regular file (although, could change it to the gls .fa in sw/)
        method_dir += '/sw/germline-sets'
    if return_parent_gl_dir:
        return method_dir
    return glutils.get_fname(method_dir, args.locus, 'v')
def run_igdiscover(infname, outfname, outdir):
    """Run igdiscover on <infname> and write the resulting germline set so that <outfname> exists.

    Reads run configuration from the module-level <args>. Returns immediately if
    utils.output_exists() says <outfname> is already there.

    infname:  input fasta of reads (optionally subsampled, see --n-random-queries handling below)
    outfname: final germline fasta; must equal glutils.get_fname(<parent gl dir>, locus, region)
    outdir:   igdiscover working dir (db/, the .yaml config, run.sh and work/ all go in here)
    """
    if utils.output_exists(args, outfname):
        return

    prepare_igdiscover_outdir(outdir)

    if args.n_random_queries is not None:
        # run on a random subset of the input, cached in <outdir> so it can be reused on a rerun
        suffix = utils.getsuffix(infname)
        sub_infname = outdir + '/' + os.path.basename(infname.replace(suffix, '-n-random-queries-%d%s' % (args.n_random_queries, suffix)))
        if os.path.exists(sub_infname):
            print(' --n-random-queries: leaving existing fasta for igdiscover (hopefully it has %d queries)' % args.n_random_queries)
        else:
            print(' --n-random-queries: writing new fasta for igdiscover (%d queries)' % args.n_random_queries)
            seqfos = utils.read_fastx(infname, n_random_queries=args.n_random_queries)
            with open(sub_infname, 'w') as sub_infile:
                for seqfo in seqfos:
                    sub_infile.write('>%s\n%s\n' % (seqfo['name'], seqfo['seq']))
        infname = sub_infname

    igdiscover_outfname = outdir + '/work/final/database/%s.fasta' % args.region.upper()

    # build the shell script that actually runs igdiscover
    cmds = ['#!/bin/bash']
    cmds += ['export PATH=%s:$PATH' % args.condapath]
    cmds += ['export PYTHONNOUSERSITE=True']  # otherwise it finds the pip-installed packages in .local and breaks (see https://github.com/conda/conda/issues/448)
    cmds += ['cd %s' % outdir]
    cmds += ['igdiscover init --db db --single-reads %s work' % infname]  # prepares to run, putting files into <outdir>
    cmds += ['cp %s work/' % os.path.basename(args.yamlfname)]
    cmds += ['cd work']
    cmds += ['igdiscover run']
    utils.simplerun('\n'.join(cmds) + '\n', cmdfname=outdir + '/run.sh', print_time='igdiscover', debug=True)

    # convert igdiscover's output fasta to a glfo, using --glfo-dir (or the human default) as the template
    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(igdiscover_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Strip the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() would be wrong here: it removes any trailing run of the *characters* in its argument, not the suffix, so it could eat into the parent dir name.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo, debug=True)
def run_tigger(infname, outfname, outdir):
    """Run tigger (via an R script) on the annotations in <infname> and write the resulting germline set so that <outfname> exists.

    Reads run configuration from the module-level <args>. Returns immediately if
    utils.output_exists() says <outfname> is already there.

    infname:  tab-separated annotation file, read in R with read.csv
    outfname: final germline fasta; must equal glutils.get_fname(<parent gl dir>, locus, region)
    outdir:   dir that tigger's own output fasta (tigger.fasta) is written to
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    db_name = 'annotations'
    gls_name = 'gls'
    tigger_outfname = outdir + '/tigger.fasta'

    # build the R script
    rcmds = ['library(tigger)', 'library(dplyr)']
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]
    rcmds += ['novel_df = findNovelAlleles(%s, %s, germline_min=2, nproc=%d)' % (db_name, gls_name, args.n_procs)]
    # genotypeFasta() below takes <geno>, so the genotype inference has to be part of the script (otherwise R stops on an undefined object)
    rcmds += ['geno = inferGenotype(%s, find_unmutated = FALSE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]

    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')
    cmdstr = 'R --slave -f ' + cmdfname
    utils.simplerun(cmdstr, shell=True, print_time='tigger')

    # post-process tigger .fa
    gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.read_glfo(gldir, args.locus)
    tigger_alleles = set()
    for seqfo in utils.read_fastx(tigger_outfname):
        seq = seqfo['seq'].replace(utils.gap_chars[0], '')  # it should be just dots...
        tigger_alleles.add(seqfo['name'])
        if seqfo['name'] not in glfo['seqs'][args.region]:
            newfo = {'gene': seqfo['name'], 'seq': seq}
            use_template_for_codon_info = False
            if '+' in newfo['gene']:  # names with a '+' carry the template gene before the '+'
                newfo['template-gene'] = newfo['gene'].split('+')[0]
                use_template_for_codon_info = True
            glutils.add_new_allele(glfo, newfo, use_template_for_codon_info=use_template_for_codon_info, debug=True)
        elif glfo['seqs'][args.region][seqfo['name']] != seq:
            print('%s different sequences in glfo and tigger output for %s:\n %s\n %s' % (utils.color('red', 'error'), seqfo['name'], glfo['seqs'][args.region][seqfo['name']], seqfo['seq']))

    # remove them afterwards so we can use existing ones to get codon info
    # (loop over a snapshot of the gene names, so that removing genes can't invalidate the iteration)
    for gene in list(glfo['seqs'][args.region]):
        if gene not in tigger_alleles:
            glutils.remove_gene(glfo, gene)

    # Strip the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() would be wrong here: it removes any trailing run of the *characters* in its argument, not the suffix, so it could eat into the parent dir name.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
def prepare_igdiscover_outdir(outdir):
    """Set up <outdir> for an igdiscover run: germline db files, the .yaml config, and no leftover work/ dir.

    Reads run configuration from the module-level <args>.
    Raises an Exception if a required germline file is missing, and fails an assertion for a
    species other than human or macaque.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # start from a clean db/
    db_dir = outdir + '/db'
    if os.path.exists(db_dir):
        for fn in [get_igd_glsfname(outdir, r) for r in utils.regions]:
            # lexists() rather than exists(): the latter is False for a dangling symlink, which would leave a stale link in place (and the islink() check below would then skip re-linking it)
            if os.path.lexists(fn):
                os.remove(fn)
    else:
        os.makedirs(db_dir)

    for region in utils.regions:
        targetname = glutils.get_fname(args.glfo_dir, args.locus, region)
        linkname = get_igd_glsfname(outdir, region)
        if region in utils.getregions(args.locus):
            if not os.path.exists(targetname):
                raise Exception('gl file %s d.n.e.' % targetname)
            if not os.path.islink(linkname):
                subprocess.check_call(['ln', '-s', targetname, linkname])
        else:  # this locus doesn't have this region, so write a dummy file with one short sequence
            with open(linkname, 'w') as dummy_d_file:
                dummy_d_file.write('>%sDx-x*x\n%s\n' % (args.locus.upper(), 'aa'))

    # this is the .yaml in igdiscover/ (but *not* in igdiscover/work/): it has to be written in the parent workdir, then gets cp'd to work/
    cfgfname = outdir + '/' + os.path.basename(args.yamlfname)
    if os.path.exists(cfgfname):
        os.remove(cfgfname)
    with open(args.yamlfname) as cfgfile:  # whereas this is the template .yaml in partis/test/
        # NOTE(review): yaml.load() without an explicit Loader can construct arbitrary python objects, and newer PyYAML versions warn about or reject this call form. Fine for a trusted template; consider yaml.safe_load() if the config could come from elsewhere.
        cfgdata = yaml.load(cfgfile)
    # (this used to be conditional on: not args.gls_gen)
    for filtername in ['pre_germline_filter', 'germline_filter']:
        for cfgvar in ['unique_js', 'unique_cdr3s']:
            cfgdata[filtername][cfgvar] = 0
    if args.species != 'human':
        if args.species == 'macaque':
            cfgdata['species'] = 'rhesus_monkey'
        else:
            assert False, 'unhandled species \'%s\'' % args.species
    with open(cfgfname, 'w') as cfgfile:
        yaml.dump(cfgdata, cfgfile, width=200)

    if os.path.exists(outdir + '/work'):  # sigh, it spams out too much different output, can't get away without a '-r'
        subprocess.check_call(['rm', '-r', outdir + '/work'])
def get_gls_fname(region, outdir, method, locus, sim_truth=False, data=False, annotation_performance_plots=False):  # NOTE duplicates/depends on code in test-germline-inference.py
    """Return the germline set file name for <region> (or the annotation performance plot dir) for <method> under <outdir>.

    If <annotation_performance_plots> is set, the plot dir is returned without consulting
    get_gls_dir(). Otherwise the dir from get_gls_dir() is passed to glutils.get_fname().
    """
    if annotation_performance_plots:
        return '/'.join((outdir, method, 'annotation-performance-plots', 'sw', 'mutation'))

    dir_kwargs = {
        'sim_truth': sim_truth,
        'data': data,
        'annotation_performance_plots': annotation_performance_plots,
    }
    return glutils.get_fname(get_gls_dir(outdir, method, **dir_kwargs), locus, region)
def get_gls_fname(outdir, method, locus, sim_truth=False, data=False):  # NOTE duplicates/depends on code in test-allele-finding.py
    """Return the germline set file name for <method> under <outdir>.

    The germline set dir is chosen from <data>, <sim_truth> and <method> (in that order of
    precedence) and passed to glutils.get_fname(). An unrecognized <method> trips an assertion.
    """
    # NOTE(review): <region> is neither a parameter nor a local of this function, so it has to resolve from an enclosing/module scope at call time -- confirm that it's defined there
    is_partis = method in ('partis', 'full')

    if data:
        # NOTE the partis dir is inside the datascripts output dir, and doesn't use <method> (since we only have partis for a method a.t.m., although could use --label or --extra-str to differentiate)
        gls_dir = outdir + ('/hmm/germline-sets' if is_partis else '/' + method)
    elif sim_truth:
        gls_dir = outdir + '/germlines/simulation'
    elif is_partis:
        gls_dir = outdir + '/' + method + '/sw/germline-sets'
    elif method == 'tigger':
        gls_dir = outdir + '/' + method
    else:
        assert False

    return glutils.get_fname(gls_dir, locus, region)
def prepare_igdiscover_outdir(outdir):
    """Set up <outdir> for an igdiscover run: germline db links, the .yaml config, and no leftover work/ dir.

    Reads run configuration from the module-level <args>.
    """
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # start from a clean db/
    db_dir = outdir + '/db'
    if os.path.exists(db_dir):
        for fn in [get_igd_glsfname(outdir, r) for r in utils.regions]:
            # lexists() rather than exists(): the latter is False for a dangling symlink, which would then make the 'ln -s' below fail because the link name is already taken
            if os.path.lexists(fn):
                os.remove(fn)
    else:
        os.makedirs(db_dir)

    # link each region's germline file into db/
    for region in utils.regions:
        targetname = glutils.get_fname(args.glfo_dir, args.locus, region)
        linkname = get_igd_glsfname(outdir, region)
        subprocess.check_call(['ln', '-s', targetname, linkname])

    # this is the .yaml in igdiscover/ (but *not* in igdiscover/work/): it has to be written in the parent workdir, then gets cp'd to work/
    cfgfname = outdir + '/' + os.path.basename(args.yamlfname)
    if os.path.exists(cfgfname):
        os.remove(cfgfname)
    with open(args.yamlfname) as cfgfile:  # whereas this is the template .yaml in partis/test/
        # NOTE(review): yaml.load() without an explicit Loader can construct arbitrary python objects, and newer PyYAML versions warn about or reject this call form. Fine for a trusted template; consider yaml.safe_load() if the config could come from elsewhere.
        cfgdata = yaml.load(cfgfile)
    # (this used to be conditional on: not args.gls_gen)
    for filtername in ['pre_germline_filter', 'germline_filter']:
        for cfgvar in ['unique_js', 'unique_cdr3s']:
            cfgdata[filtername][cfgvar] = 0
    with open(cfgfname, 'w') as cfgfile:
        yaml.dump(cfgdata, cfgfile, width=200)

    if os.path.exists(outdir + '/work'):  # sigh, it spams out too much different output, can't get away without a -r
        subprocess.check_call(['rm', '-r', outdir + '/work'])
def run_tigger(infname, outfname, outdir):
    """Run tigger (via an R script) on the annotations in <infname> and write the resulting germline set so that <outfname> exists.

    Reads run configuration from the module-level <args>. Returns immediately if
    utils.output_exists() says <outfname> is already there. If R fails for any reason other than
    tigger's "not enough sequences" complaint, its output is dumped and the process exits with
    R's return code.

    infname:  tab-separated annotation file, read in R with read.csv
    outfname: final germline fasta; must equal glutils.get_fname(<parent gl dir>, locus, region)
    outdir:   dir that tigger's own output fasta (tigger.fasta) is written to
    """
    if utils.output_exists(args, outfname, offset=8):
        return

    db_name = 'annotations'
    gls_name = 'gls'
    tigger_outfname = outdir + '/tigger.fasta'

    # build the R script
    rcmds = ['library(ggplot2)', 'library(tigger, warn.conflicts=FALSE)', 'library(dplyr, warn.conflicts=FALSE)']
    rcmds += ['%s = read.csv("%s", sep="\t")' % (db_name, infname)]
    rcmds += ['%s = readIgFasta("%s")' % (gls_name, get_glfname('v', aligned=True))]

    find_novel_argstr = '%s, %s, nproc=%d' % (db_name, gls_name, utils.auto_n_procs())
    if args.tuned_tigger_params:
        germline_min = 5  # only analyze genes which correspond to at least this many V calls (default 200)
        min_seqs = 5  # minimum number of total sequences
        j_max = 0.95  # of sequences which align perfectly (i.e. zero mutation?) to a new allele, no more than this fraction can correspond to each junction length + j gene combination (default 0.15)
        find_novel_argstr += ', germline_min=%d, min_seqs=%d, j_max=%f' % (germline_min, min_seqs, j_max)
    rcmds += ['novel_df = findNovelAlleles(%s)' % find_novel_argstr]
    rcmds += ['print(novel_df)']
    rcmds += ['geno = inferGenotype(%s, find_unmutated = TRUE, germline_db = %s, novel_df = novel_df)' % (db_name, gls_name)]
    rcmds += ['genotype_seqs = genotypeFasta(geno, %s, novel_df)' % (gls_name)]
    rcmds += ['writeFasta(genotype_seqs, "%s")' % tigger_outfname]

    cmdfname = args.workdir + '/tigger-in.cmd'
    with open(cmdfname, 'w') as cmdfile:
        cmdfile.write('\n'.join(rcmds) + '\n')

    # run it, then read stdout/stderr back from <workdir>/out and <workdir>/err
    cmdstr = 'R --slave -f ' + cmdfname
    cmdfo = {'cmd_str': cmdstr, 'logdir': args.workdir, 'env': os.environ}
    proc = utils.run_cmd(cmdfo)
    while proc.poll() is None:
        time.sleep(0.01)
    if proc.returncode != 0:  # damn thing crashes if it thinks the sample size is small
        with open(args.workdir + '/err') as ferr:
            errstr = ''.join(ferr.readlines())
        if 'Not enough sample sequences were assigned to any germline' in errstr:
            # treat this as "nothing inferred": write an empty fasta and carry on
            with open(tigger_outfname, 'w') as dummy_outfasta:
                dummy_outfasta.write('')
        else:
            subprocess.check_call(['cat', args.workdir + '/out'])
            subprocess.check_call(['cat', args.workdir + '/err'])
            sys.exit(proc.returncode)

    for oe in ['err', 'out']:
        with open(args.workdir + '/' + oe) as oefile:
            print(''.join(oefile.readlines()))
        os.remove(args.workdir + '/' + oe)

    # post-process tigger .fa
    template_gldir = args.glfo_dir if args.glfo_dir is not None else 'data/germlines/human'
    glfo = glutils.create_glfo_from_fasta(tigger_outfname, args.locus, args.region, template_gldir, simulation_germline_dir=args.simulation_germline_dir)

    # Strip the trailing '/<locus>' path component to get the parent germline dir.
    # NOTE str.rstrip() would be wrong here: it removes any trailing run of the *characters* in its argument, not the suffix, so it could eat into the parent dir name.
    out_gldir = os.path.dirname(outfname)
    locus_suffix = '/' + args.locus
    if out_gldir.endswith(locus_suffix):
        out_gldir = out_gldir[:-len(locus_suffix)]
    assert glutils.get_fname(out_gldir, args.locus, args.region) == outfname
    glutils.write_glfo(out_gldir, glfo)

    os.remove(cmdfname)
def get_outfname(args, method):
    """Return the v germline set file name for <method> under args.outdir, as built by glutils.get_fname()."""
    gl_dir = args.outdir + '/' + method
    if method in ('partis', 'full'):  # parameter directory, not regular file (although, could change it to the gls .fa in sw/)
        gl_dir = gl_dir + '/sw/germline-sets'
    v_fname = glutils.get_fname(gl_dir, args.locus, 'v')
    return v_fname