def task_phasing(self):
    ref_fasta = fn(self.ref_fasta)
    aln_bam = fn(self.aln_bam)
    job_done = fn(self.job_done)

    job_uid = self.parameters['job_uid']
    wd = self.parameters['wd']
    ctg_id = self.parameters['ctg_id']

    config = self.parameters['config']
    smrt_bin = config['smrt_bin']
    samtools = os.path.join(smrt_bin, 'samtools')

    script_fn = os.path.join(wd, 'p_%s.sh' % (ctg_id))

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
cd {wd}
hostname
date
cd {wd}
fc_phasing.py --bam {aln_bam} --fasta {ref_fasta} --ctg_id {ctg_id} --base_dir .. --samtools {samtools}
fc_phasing_readmap.py --ctg_id {ctg_id} --read_map_dir ../../../2-asm-falcon/read_maps --phased_reads phased_reads
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters):
    tasks = list()
    next_inputs = dict()
    topdir = os.path.join(os.path.dirname(fn(split_subreadsets_fofn_pfn)), 'run-fastas2fofn')
    # Create the fastas in parallel.
    for i, chunk_fn in enumerate(open(fn(split_subreadsets_fofn_pfn)).read().splitlines()):
        wdir = os.path.join(topdir, 'fasta_job_{:03d}'.format(i))  # TODO: 02
        chunk_pfn = makePypeLocalFile(os.path.join(wdir, chunk_fn))
        fasta_done_fn = os.path.join(wdir, 'chunk_{:03d}_done'.format(i))  # TODO: 02
        # By depending on a sentinel, we are allowed to delete fastas later.
        # Note: i might not match num in chunk_fn, but that is ok.
        fasta_done_pfn = makePypeLocalFile(fasta_done_fn)
        make_task = PypeTask(
            inputs={"dataset": chunk_pfn, },
            outputs={"fasta_done": fasta_done_pfn, },
            parameters=parameters,
        )
        task = make_task(start_task.task_bam2fasta_dexta)
        tasks.append(task)
        next_inputs['fasta_{}_done'.format(i)] = fasta_done_pfn
        #fasta_fn = base_from_done(fasta_done_fn) + '.fasta' # By convention.
    # Create the FOFN of fastas.
    fasta_fofn_fn = os.path.join(topdir, 'fasta.fofn')
    fasta_fofn_pfn = makePypeLocalFile(fasta_fofn_fn)
    make_task = PypeTask(
        inputs=next_inputs,
        outputs={"fofn": fasta_fofn_pfn, },
        parameters=parameters,
    )
    task = make_task(start_task.task_fastas2fofn)
    tasks.append(task)
    return tasks, fasta_fofn_pfn
def task_scatter_quiver(self):
    p_ctg_fn = fn(self.p_ctg_fa)
    h_ctg_fn = fn(self.h_ctg_fa)
    out_json = fn(self.scattered_quiver_json)
    track_reads_h_done_fn = fn(self.track_reads_h_done)
    bam_dir = os.path.dirname(track_reads_h_done_fn)
    config = self.parameters['config']

    ref_seq_data = {}

    # I think this will crash if the file is empty. Maybe that is ok.
    p_ctg_fa = FastaReader(p_ctg_fn)
    ctg_types = {}
    for r in p_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = 'p'

    # I think this will crash if the file is empty. Maybe that is ok.
    h_ctg_fa = FastaReader(h_ctg_fn)
    for r in h_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = 'h'

    ctg_ids = sorted(ref_seq_data.keys())
    #p_ctg_out=[]
    #h_ctg_out=[]
    #job_done_plfs = {}
    jobs = []
    for ctg_id in ctg_ids:
        sequence = ref_seq_data[ctg_id]
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), m_ctg_id)
        ref_fasta = os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_bam = os.path.join(bam_dir, '{ctg_id}.bam'.format(ctg_id=ctg_id))
        #cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id=ctg_id)))
        #cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id=ctg_id)))
        #job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id=ctg_id)))

        # The *.bam files are created by fc_select_reads_from_bam.py in task_track_reads.
        # Network latency should not matter because we have already waited for the 'done' file.
        if os.path.exists(read_bam):
            mkdir(wd)
            if not os.path.exists(ref_fasta):
                # TODO(CD): Up to 50MB of seq data. Should do this on remote host.
                # See https://github.com/PacificBiosciences/FALCON_unzip/issues/59
                with open(ref_fasta, 'w') as f:
                    print >>f, '>' + ctg_id
                    print >>f, sequence
            new_job = {}
            new_job['ctg_id'] = ctg_id
            new_job['ctg_types'] = ctg_types
            new_job['smrt_bin'] = config['smrt_bin']
            new_job['sge_option'] = config['sge_quiver']
            new_job['ref_fasta'] = ref_fasta
            new_job['read_bam'] = read_bam
            jobs.append(new_job)
    open(out_json, 'w').write(json.dumps(jobs))
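# Illustrative sketch (not part of the pipeline): the shape of one record in
# the scattered-quiver JSON written by task_scatter_quiver above, with made-up
# values. create_quiver_jobs() (later in this file) consumes records like this.
example_job = {
    'ctg_id': '000000F-001',  # haplotig id; '000000F' would be its primary contig
    'ctg_types': {'000000F': 'p', '000000F-001': 'h'},
    'smrt_bin': '/path/to/smrtcmds/bin',       # hypothetical path
    'sge_option': '-pe smp 24',                # hypothetical SGE option
    'ref_fasta': '/wd/000000F/000000F-001_ref.fa',
    'read_bam': '/bam_dir/000000F-001.bam',
}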
def task_track_reads(self):
    input_bam_fofn = fn(self.input_bam_fofn)
    job_done = fn(self.job_done)
    work_dir = os.getcwd()
    basedir = '../..'  # assuming we are in ./4-quiver/reads/
    script_fn = 'track_reads_h.sh'

    # For now, in/outputs are in various directories, by convention.
    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {basedir}
fc_get_read_hctg_map.py
fc_rr_hctg_track.py
fc_select_reads_from_bam.py {input_bam_fofn}
date
cd {work_dir}
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def task_hasm(self):
    rid_to_phase_all = fn(self.rid_to_phase_all)
    job_done = fn(self.job_done)
    config = self.parameters['config']
    sge_hasm = config['sge_hasm']

    wd = self.parameters['wd']

    job_type = config['job_type']

    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, 'hasm.sh')

    #las_fofn = '../../2-asm-falcon/las.fofn'  # superseded by the merge-gather FOFN below
    las_fofn = '../../1-preads_ovl/merge-gather/las.fofn'
    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {wd}
fc_ovlp_filter_with_phase.py --fofn {las_fofn} --max_diff 120 --max_cov 120 --min_cov 1 --n_core 12 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map {rid_to_phase_all} > preads.p_ovl
fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
if [ -e ../../1-preads_ovl/preads4falcon.fasta ];
then
  ln -sf ../../1-preads_ovl/preads4falcon.fasta .
else
  ln -sf ../../1-preads_ovl/db2falcon/preads4falcon.fasta .
fi
fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map {rid_to_phase_all} --fasta preads4falcon.fasta

# more script -- a little bit hacky here, we should improve
WD=$PWD
for f in `cat ../reads/ctg_list `; do mkdir -p $WD/$f; cd $WD/$f; fc_dedup_h_tigs.py $f; done

## prepare for quivering the haplotigs
cd $WD/..

if [ -e "all_phased_reads" ]; then rm all_phased_reads; fi
if [ -e "all_h_ctg_ids" ]; then rm all_h_ctg_ids; fi
if [ -e "all_p_ctg_edges" ]; then rm all_p_ctg_edges; fi
if [ -e "all_h_ctg_edges" ]; then rm all_h_ctg_edges; fi
if [ -e "all_p_ctg.fa" ]; then rm all_p_ctg.fa; fi
if [ -e "all_h_ctg.fa" ]; then rm all_h_ctg.fa; fi

find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa

cd ../
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def get_phased_reads(self):
    q_id_map_fn = fn(self.q_id_map_file)
    vmap_fn = fn(self.vmap_file)
    p_variant_fn = fn(self.phased_variant_file)

    parameters = self.parameters
    ctg_id = parameters["ctg_id"]

    phased_read_fn = fn(self.phased_read_file)

    rid_map = {}
    with open(q_id_map_fn) as f:
        for l in f:
            l = l.strip().split()
            rid_map[int(l[0])] = l[1]

    read_to_variants = {}
    variant_to_reads = {}
    with open(vmap_fn) as f:
        for l in f:
            l = l.strip().split()
            variant = "_".join(l[:3])
            read_id = int(l[3])
            read_to_variants.setdefault(read_id, set())
            read_to_variants[read_id].add(variant)
            variant_to_reads.setdefault(variant, set())
            variant_to_reads[variant].add(read_id)

    variant_to_phase = {}
    with open(p_variant_fn) as f:
        for l in f:
            """line format example: V 1 6854 6854_A_A 6854_A_G 6854 22781"""
            l = l.strip().split()
            if l[0] != "V":
                continue
            pb_id = int(l[1])
            variant_to_phase[l[3]] = (pb_id, 0)
            variant_to_phase[l[4]] = (pb_id, 1)

    with open(phased_read_fn, "w") as out_f:
        for r in read_to_variants:
            vl = {}
            pl = set()
            for v in list(read_to_variants[r]):
                if v in variant_to_phase:
                    p = variant_to_phase[v]
                    vl[p] = vl.get(p, 0) + 1
                    pl.add(p[0])
            pl = list(pl)
            pl.sort()
            for p in pl:
                if vl.get((p, 0), 0) - vl.get((p, 1), 0) > 1:
                    print >> out_f, r, ctg_id, p, 0, vl.get((p, 0), 0), vl.get((p, 1), 0), rid_map[r]
                elif vl.get((p, 1), 0) - vl.get((p, 0), 0) > 1:
                    print >> out_f, r, ctg_id, p, 1, vl.get((p, 0), 0), vl.get((p, 1), 0), rid_map[r]
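# Illustrative sketch (not part of the pipeline): the vote-margin rule used at
# the end of get_phased_reads, extracted for clarity. A read is assigned to
# phase 0 or 1 of a phasing block only when one phase out-votes the other by
# more than 1; otherwise it is left unphased.
def _assign_phase(votes_phase0, votes_phase1):
    if votes_phase0 - votes_phase1 > 1:
        return 0
    if votes_phase1 - votes_phase0 > 1:
        return 1
    return None  # ambiguous: read left unphased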
def task_hasm(self):
    rid_to_phase_all = fn(self.rid_to_phase_all)
    job_done = fn(self.job_done)
    #config = self.parameters['config']
    wd = self.parameters['wd']
    script_fn = os.path.join(wd, 'hasm.sh')

    #las_fofn = '../../2-asm-falcon/las.fofn'  # superseded by the merge-gather FOFN below
    las_fofn = '../../1-preads_ovl/merge-gather/las.fofn'
    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {wd}
fc_ovlp_filter_with_phase.py --fofn {las_fofn} --max_diff 120 --max_cov 120 --min_cov 1 --n_core 48 --min_len 2500 --db ../../1-preads_ovl/preads.db --rid_phase_map {rid_to_phase_all} > preads.p_ovl
fc_phased_ovlp_to_graph.py preads.p_ovl --min_len 2500 > fc.log
if [ -e ../../1-preads_ovl/preads4falcon.fasta ];
then
  ln -sf ../../1-preads_ovl/preads4falcon.fasta .
else
  ln -sf ../../1-preads_ovl/db2falcon/preads4falcon.fasta .
fi
fc_graphs_to_h_tigs.py --fc_asm_path ../../2-asm-falcon/ --fc_hasm_path ./ --ctg_id all --rid_phase_map {rid_to_phase_all} --fasta preads4falcon.fasta

# more script -- a little bit hacky here, we should improve
WD=$PWD
for f in `cat ../reads/ctg_list `; do mkdir -p $WD/$f; cd $WD/$f; fc_dedup_h_tigs.py $f; done

## prepare for quivering the haplotigs
cd $WD/..

if [ -e "all_phased_reads" ]; then rm all_phased_reads; fi
if [ -e "all_h_ctg_ids" ]; then rm all_h_ctg_ids; fi
if [ -e "all_p_ctg_edges" ]; then rm all_p_ctg_edges; fi
if [ -e "all_h_ctg_edges" ]; then rm all_h_ctg_edges; fi
if [ -e "all_p_ctg.fa" ]; then rm all_p_ctg.fa; fi
if [ -e "all_h_ctg.fa" ]; then rm all_h_ctg.fa; fi

find 0-phasing -name "phased_reads" | sort | xargs cat >> all_phased_reads
find 1-hasm -name "h_ctg_ids.*" | sort | xargs cat >> all_h_ctg_ids
find 1-hasm -name "p_ctg_edges.*" | sort | xargs cat >> all_p_ctg_edges
find 1-hasm -name "h_ctg_edges.*" | sort | xargs cat >> all_h_ctg_edges
find 1-hasm -name "p_ctg.*.fa" | sort | xargs cat >> all_p_ctg.fa
find 1-hasm -name "h_ctg.*.fa" | sort | xargs cat >> all_h_ctg.fa

cd ../
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def taskA(self):
    i1 = fn(self.i1)
    o1 = fn(self.o1)
    script = """
set -vex
cat {i1} > {o1}
""".format(**locals())
    script_fn = 'script.sh'
    with open(script_fn, 'w') as ofs:
        ofs.write(script)
    self.generated_script_fn = script_fn
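# Minimal wiring sketch (an assumption, following the PypeTask/makePypeLocalFile
# pattern used throughout this module): how a task function like taskA is
# typically attached to a workflow. 'wf' is a PypeWorkflow created elsewhere;
# the filenames are placeholders.
i1_plf = makePypeLocalFile('input.txt')
o1_plf = makePypeLocalFile('output.txt')
make_task = PypeTask(
    inputs={'i1': i1_plf},
    outputs={'o1': o1_plf},
    parameters={},
)
wf.addTask(make_task(taskA))
wf.refreshTargets()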
def create_tasks_pbalign(chunk_json_pfn, referenceset_pfn, parameters):
    """Create a pbalign task for each chunk, plus a gathering task.
    """
    tasks = list()
    gathering = dict()
    chunk_dir = os.path.dirname(fn(chunk_json_pfn))
    for i, subreadset_fn in enumerate(sorted(yield_pipeline_chunk_names_from_json(
            open(fn(chunk_json_pfn)), '$chunk.subreadset_id'))):
        wdir = 'run-pbalign-{:02d}'.format(i)
        subreadset_fn = os.path.join(chunk_dir, os.path.basename(subreadset_fn))
        subreadset_pfn = makePypeLocalFile(subreadset_fn)
        unmapped_pfn = makePypeLocalFile('{wdir}/unmapped.txt'.format(**locals()))
        alignmentset_pfn = makePypeLocalFile(
            '{wdir}/align.subreads.{i:02d}.alignmentset.xml'.format(**locals()))
        gathering['unmapped_{:02d}'.format(i)] = unmapped_pfn
        gathering['alignmentsets_{:02d}'.format(i)] = alignmentset_pfn
        """Also produces:
        aligned.subreads.i.alignmentset.bam
        aligned.subreads.i.alignmentset.bam.bai
        aligned.subreads.i.alignmentset.bam.pbi
        """
        make_task = PypeTask(
            inputs={
                "chunk_json": chunk_json_pfn,
                "dataset": subreadset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "alignmentset": alignmentset_pfn,
                "unmapped": unmapped_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_pbalign)
        tasks.append(task)
    o_alignmentset_pfn = makePypeLocalFile('run-pbalign_gather/aligned.subreads.alignmentset.xml')
    o_unmapped_pfn = makePypeLocalFile('run-pbalign_gather/unmapped.txt')
    make_task = PypeTask(
        inputs=gathering,
        outputs={
            "o_ds": o_alignmentset_pfn,
            "o_unmapped": o_unmapped_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_gather)
    tasks.append(task)
    # Note: this returns the last per-chunk alignmentset; the gathered set is o_alignmentset_pfn.
    return tasks, alignmentset_pfn
def get_rid_to_phase_all(self):
    # Tasks must be at module scope now.
    rid_to_phase_all_fn = fn(self.rid_to_phase_all)
    inputs_fn = [fn(f) for f in self.inputs.values()]
    inputs_fn.sort()
    output = []
    for fname in inputs_fn:
        # extend() appends single characters from read(); the join() below restores them.
        output.extend(open(fname).read())

    out = open(rid_to_phase_all_fn, 'w')
    out.write(''.join(output))
    out.close()
def say_hey1(self):
    o1 = fn(self.o1)
    i0 = fn(self.i0)
    script = """\
#!/bin/bash
echo hey1
touch %(o1)s
""" % locals()
    script_fn = 'run-hey.sh'
    with open(script_fn, 'w') as ofs:
        ofs.write(script)
    self.generated_script_fn = script_fn
def task_run_quiver(self):
    ref_fasta = fn(self.ref_fasta)
    read_sam = fn(self.read_sam)

    cns_fasta = fn(self.cns_fasta)
    cns_fastq = fn(self.cns_fastq)
    job_done = fn(self.job_done)

    job_uid = self.parameters['job_uid']
    wd = self.parameters['wd']
    config = self.parameters['config']
    ctg_id = self.parameters['ctg_id']

    smrt_bin = config['smrt_bin']
    sge_quiver = config['sge_quiver']
    job_type = config['job_type']
    samtools = os.path.join(smrt_bin, 'samtools')
    pbalign = os.path.join(smrt_bin, 'pbalign')
    makePbi = os.path.join(smrt_bin, 'makePbi')
    variantCaller = os.path.join(smrt_bin, 'variantCaller')

    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, 'cns_%s.sh' % (ctg_id))

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {wd}

{samtools} faidx {ref_fasta}
{samtools} view -b -S {read_sam} > {ctg_id}.bam
{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
          --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
          --algorithmOptions=-useQuality --maxHits=1 --hitPolicy=random --seed=1\
          {ctg_id}.bam {ref_fasta} aln-{ctg_id}.bam
#{makePbi} --referenceFasta {ref_fasta} aln-{ctg_id}.bam
({variantCaller} -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
            -o {cns_fasta} -o {cns_fastq}) || echo quiver failed
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def task_track_reads(self):
    job_done = fn(self.job_done)
    wd = self.parameters['wd']
    #config = self.parameters['config']
    script_fn = os.path.join(wd, 'track_reads.sh')
    topdir = '../..'

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date
cd {topdir}
python -m falcon_kit.mains.get_read_ctg_map
python -m falcon_kit.mains.rr_ctg_track
python -m falcon_kit.mains.pr_ctg_track
#mkdir -p 3-unzip/reads/
python -m falcon_kit.mains.fetch_reads
cd {wd}
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def task_track_reads(self):
    job_done = fn(self.job_done)
    wd = self.parameters['wd']
    config = self.parameters['config']
    input_bam_fofn = config['input_bam_fofn']
    sge_track_reads = config['sge_track_reads']
    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, 'track_reads_h.sh')

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
cd {wd}
hostname
date
fc_get_read_hctg_map.py --basedir ../..
fc_rr_hctg_track.py --base_dir ../..
mkdir -p 4-quiver/reads/
fc_select_reads_from_bam.py --basedir ../.. {input_bam_fofn}
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def touchit(self):
    out = fn(self.out)
    s = 1
    LOG.info('sleep {}'.format(s))
    time.sleep(s)
    cmd = 'touch {}'.format(out)
    system(cmd)
def task_run_blasr(self):
    job_done = fn(self.job_done)
    ref_fasta = fn(self.ref_fasta)
    read_fasta = fn(self.read_fasta)

    job_uid = self.parameters['job_uid']
    wd = self.parameters['wd']
    ctg_id = self.parameters['ctg_id']

    config = self.parameters['config']
    smrt_bin = config['smrt_bin']
    sge_blasr_aln = config['sge_blasr_aln']
    job_type = config['job_type']
    blasr = os.path.join(smrt_bin, 'blasr')
    samtools = os.path.join(smrt_bin, 'samtools')

    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, 'aln_{ctg_id}.sh'.format(ctg_id=ctg_id))

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
cd {wd}
hostname
date
cd {wd}
time {blasr} {read_fasta} {ref_fasta} -noSplitSubreads -clipping subread\
 -hitPolicy randombest -randomSeed 42 -bestn 1 -minPctIdentity 70.0\
 -minMatch 12 -nproc 24 -sam -out tmp_aln.sam
{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted
{samtools} index {ctg_id}_sorted.bam
rm tmp_aln.sam
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def generate_read_to_ctg_map(self):
    rawread_id_file = fn(self.rawread_id_file)
    pread_id_file = fn(self.pread_id_file)
    read_to_contig_map = fn(self.read_to_contig_map)

    pread_did_to_rid = open(pread_id_file).read().split('\n')
    rid_to_oid = open(rawread_id_file).read().split('\n')

    asm_G = AsmGraph(fn(self.sg_edges_list),
                     fn(self.utg_data),
                     fn(self.ctg_paths))

    pread_to_contigs = {}

    with open(read_to_contig_map, 'w') as f:
        for ctg in asm_G.ctg_data:
            if ctg[-1] == 'R':
                continue
            ctg_g = asm_G.get_sg_for_ctg(ctg)
            for n in ctg_g.nodes():
                pid = int(n.split(':')[0])
                rid = pread_did_to_rid[pid].split('/')[1]
                rid = int(int(rid) / 10)
                oid = rid_to_oid[rid]
                k = (pid, rid, oid)
                pread_to_contigs.setdefault(k, set())
                pread_to_contigs[k].add(ctg)

        for k in pread_to_contigs:
            pid, rid, oid = k
            for ctg in list(pread_to_contigs[k]):
                print >> f, '%09d %09d %s %s' % (pid, rid, oid, ctg)
def task_run_quiver(self):
    ref_fasta = fn(self.ref_fasta)
    read_bam = fn(self.read_bam)

    cns_fasta = fn(self.cns_fasta)
    cns_fastq = fn(self.cns_fastq)
    job_done = fn(self.job_done)

    job_uid = self.parameters['job_uid']
    ctg_id = self.parameters['ctg_id']

    smrt_bin = self.parameters['smrt_bin']
    samtools = os.path.join(smrt_bin, 'samtools')
    pbalign = os.path.join(smrt_bin, 'pbalign')
    makePbi = os.path.join(smrt_bin, 'makePbi')
    variantCaller = os.path.join(smrt_bin, 'variantCaller')

    script_fn = 'cns_%s.sh' % (ctg_id)

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
hostname
date

{samtools} faidx {ref_fasta}
{pbalign} --tmpDir=/localdisk/scratch/ --nproc=24 --minAccuracy=0.75 --minLength=50\
          --minAnchorSize=12 --maxDivergence=30 --concordant --algorithm=blasr\
          --algorithmOptions=--useQuality --maxHits=1 --hitPolicy=random --seed=1\
          {read_bam} {ref_fasta} aln-{ctg_id}.bam
({variantCaller} --algorithm=arrow -x 5 -X 120 -q 20 -j 24 -r {ref_fasta} aln-{ctg_id}.bam\
            -o {cns_fasta} -o {cns_fastq}) || echo quiver failed
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def say_hey0(self):
    o0 = fn(self.o0)
    print 'hey', o0
    script = """\
#!/bin/bash
echo hey0
touch %(o0)s
""" % locals()
    script_fn = 'run-hey.sh'
    with open(script_fn, 'w') as ofs:
        ofs.write(script)
    self.generated_script_fn = script_fn
def task_run_blasr(self):
    job_done = fn(self.job_done)
    ref_fasta = fn(self.ref_fasta)
    read_fasta = fn(self.read_fasta)

    job_uid = self.parameters['job_uid']
    wd = self.parameters['wd']
    ctg_id = self.parameters['ctg_id']

    config = self.parameters['config']
    smrt_bin = config['smrt_bin']
    blasr = os.path.join(smrt_bin, 'blasr')
    samtools = os.path.join(smrt_bin, 'samtools')

    script_dir = os.path.join(wd)
    script_fn = os.path.join(script_dir, 'aln_{ctg_id}.sh'.format(ctg_id=ctg_id))

    script = """\
set -vex
trap 'touch {job_done}.exit' EXIT
cd {wd}
hostname
date
cd {wd}
time {blasr} {read_fasta} {ref_fasta} --noSplitSubreads --clipping subread\
 --hitPolicy randombest --randomSeed 42 --bestn 1 --minPctIdentity 70.0\
 --minMatch 12 --nproc 24 --bam --out tmp_aln.bam
#{samtools} view -bS tmp_aln.sam | {samtools} sort - {ctg_id}_sorted
{samtools} sort tmp_aln.bam -o {ctg_id}_sorted.bam
{samtools} index {ctg_id}_sorted.bam
rm tmp_aln.bam
date
touch {job_done}
""".format(**locals())

    with open(script_fn, 'w') as script_file:
        script_file.write(script)
    self.generated_script_fn = script_fn
def create_quiver_jobs(scattered_quiver_plf):
    scattered_quiver_fn = fn(scattered_quiver_plf)
    jobs = json.loads(open(scattered_quiver_fn).read())
    #ctg_ids = sorted(jobs['ref_seq_data'])
    p_ctg_out = []
    h_ctg_out = []
    job_done_plfs = {}
    # NOTE: ctg_types, config, and wf are free variables here; they must come
    # from the enclosing scope. Compare the later variant, which takes wf as a
    # parameter and reads ctg_types etc. from each scattered-JSON record.
    for job in jobs:
        ctg_id = job['ctg_id']
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(os.getcwd(), './4-quiver/reads/'
                         '{ctg_id}.sam'.format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id=ctg_id)))

        if os.path.exists(fn(read_sam)):  # TODO(CD): Ask Jason what we should do if missing SAM.
            if ctg_types[ctg_id] == 'p':
                p_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            elif ctg_types[ctg_id] == 'h':
                h_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            else:
                LOG.warning('Type is {!r}, not "p" or "h". Why are we running Quiver?'.format(
                    ctg_types[ctg_id]))
            parameters = {
                'job_uid': 'q-' + ctg_id,
                'wd': wd,
                'config': config,
                'ctg_id': ctg_id,
            }
            make_quiver_task = PypeTask(
                inputs={
                    'ref_fasta': ref_fasta,
                    'read_sam': read_sam,
                    'scattered_quiver': scattered_quiver_plf,
                },
                outputs={
                    'cns_fasta': cns_fasta,
                    'cns_fastq': cns_fastq,
                    'job_done': job_done,
                },
                parameters=parameters,
            )
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)
            job_done_plfs['{}'.format(ctg_id)] = job_done
    #sge_quiver = config['sge_quiver']
    return p_ctg_out, h_ctg_out, job_done_plfs
def taskrun1(self):
    template = """
sleep_s=%(sleep_s)s
ifile=%(ifile)s
ofile=%(ofile)s
set -vex
echo start1
sleep ${sleep_s}
cp -f ${ifile} ${ofile}
echo end1
"""
    bash = template % dict(
        ifile=fn(self.f0),
        ofile=fn(self.f1),
        sleep_s=self.parameters['sleep_s'],
    )
    log.debug('taskrun1 bash:\n' + bash)
    script = 'taskrun1.sh'
    with open(script, 'w') as ofs:
        ofs.write(bash)
    #system("bash {}".format(script), check=True)
    self.generated_script_fn = script
    return script
def task_cns_zcat(self):
    gathered_p_ctg_fn = fn(self.gathered_p_ctg)
    gathered_h_ctg_fn = fn(self.gathered_h_ctg)

    cns_p_ctg_fasta_fn = fn(self.cns_p_ctg_fasta)
    cns_p_ctg_fastq_fn = fn(self.cns_p_ctg_fastq)
    cns_h_ctg_fasta_fn = fn(self.cns_h_ctg_fasta)
    cns_h_ctg_fastq_fn = fn(self.cns_h_ctg_fastq)
    job_done_fn = fn(self.job_done)

    rm(cns_p_ctg_fasta_fn)
    touch(cns_p_ctg_fasta_fn)
    rm(cns_p_ctg_fastq_fn)
    touch(cns_p_ctg_fastq_fn)
    with open(gathered_p_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            system('zcat {cns_fasta_fn} >> {cns_p_ctg_fasta_fn}'.format(**locals()))
            system('zcat {cns_fastq_fn} >> {cns_p_ctg_fastq_fn}'.format(**locals()))
    # comment out this for now for recovering purpose
    #with open(gathered_p_ctg_fn) as ifs:
    #    for line in ifs:
    #        cns_fasta_fn, cns_fastq_fn = line.split()
    #        rm(cns_fasta_fn)
    #        rm(cns_fastq_fn)

    rm(cns_h_ctg_fasta_fn)
    touch(cns_h_ctg_fasta_fn)
    rm(cns_h_ctg_fastq_fn)
    touch(cns_h_ctg_fastq_fn)
    with open(gathered_h_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            system('zcat {cns_fasta_fn} >> {cns_h_ctg_fasta_fn}'.format(**locals()))
            system('zcat {cns_fastq_fn} >> {cns_h_ctg_fastq_fn}'.format(**locals()))
    # comment out this for now for recovering purpose
    #with open(gathered_h_ctg_fn) as ifs:
    #    for line in ifs:
    #        cns_fasta_fn, cns_fastq_fn = line.split()
    #        rm(cns_fasta_fn)
    #        rm(cns_fastq_fn)

    touch(job_done_fn)
def task_cns_zcat(self):
    gathered_p_ctg_fn = fn(self.gathered_p_ctg)
    gathered_h_ctg_fn = fn(self.gathered_h_ctg)

    cns_p_ctg_fasta_fn = fn(self.cns_p_ctg_fasta)
    cns_p_ctg_fastq_fn = fn(self.cns_p_ctg_fastq)
    cns_h_ctg_fasta_fn = fn(self.cns_h_ctg_fasta)
    cns_h_ctg_fastq_fn = fn(self.cns_h_ctg_fastq)
    job_done_fn = fn(self.job_done)

    rm(cns_p_ctg_fasta_fn)
    touch(cns_p_ctg_fasta_fn)
    rm(cns_p_ctg_fastq_fn)
    touch(cns_p_ctg_fastq_fn)
    with open(gathered_p_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            system('zcat {cns_fasta_fn} >> {cns_p_ctg_fasta_fn}'.format(**locals()))
            system('zcat {cns_fastq_fn} >> {cns_p_ctg_fastq_fn}'.format(**locals()))
    # Remove the per-contig chunks now that they are concatenated.
    with open(gathered_p_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            rm(cns_fasta_fn)
            rm(cns_fastq_fn)

    rm(cns_h_ctg_fasta_fn)
    touch(cns_h_ctg_fasta_fn)
    rm(cns_h_ctg_fastq_fn)
    touch(cns_h_ctg_fastq_fn)
    with open(gathered_h_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            system('zcat {cns_fasta_fn} >> {cns_h_ctg_fasta_fn}'.format(**locals()))
            system('zcat {cns_fastq_fn} >> {cns_h_ctg_fastq_fn}'.format(**locals()))
    # Remove the per-contig chunks now that they are concatenated.
    with open(gathered_h_ctg_fn) as ifs:
        for line in ifs:
            cns_fasta_fn, cns_fastq_fn = line.split()
            rm(cns_fasta_fn)
            rm(cns_fastq_fn)

    touch(job_done_fn)
def generate_read_to_hctg_map(self):
    rawread_id_file = fn(self.rawread_id_file)
    pread_id_file = fn(self.pread_id_file)
    read_to_contig_map = fn(self.read_to_contig_map)

    pread_did_to_rid = open(pread_id_file).read().split('\n')
    rid_to_oid = open(rawread_id_file).read().split('\n')

    h_ctg_edges = fn(self.h_ctg_edges)
    p_ctg_edges = fn(self.p_ctg_edges)

    h_ctg_ids = set()
    with open(fn(self.h_ctg_ids)) as f:
        for row in f:
            row = row.strip()
            h_ctg_ids.add(row)

    pread_to_contigs = {}

    for fname in (p_ctg_edges, h_ctg_edges):
        with open(fname) as f:
            for row in f:
                row = row.strip().split()
                ctg = row[0]
                if len(ctg.split('_')) > 1 and ctg not in h_ctg_ids:
                    continue
                n1 = row[1]
                n2 = row[2]
                pid1 = int(n1.split(':')[0])
                pid2 = int(n2.split(':')[0])
                rid1 = pread_did_to_rid[pid1].split('/')[1]
                rid2 = pread_did_to_rid[pid2].split('/')[1]
                rid1 = int(int(rid1) / 10)
                rid2 = int(int(rid2) / 10)
                oid1 = rid_to_oid[rid1]
                oid2 = rid_to_oid[rid2]
                k1 = (pid1, rid1, oid1)
                pread_to_contigs.setdefault(k1, set())
                pread_to_contigs[k1].add(ctg)
                k2 = (pid2, rid2, oid2)
                pread_to_contigs.setdefault(k2, set())
                pread_to_contigs[k2].add(ctg)

    with open(read_to_contig_map, 'w') as f:
        for k in pread_to_contigs:
            pid, rid, oid = k
            for ctg in list(pread_to_contigs[k]):
                print >> f, '%09d %09d %s %s' % (pid, rid, oid, ctg)
def create_quiver_jobs(wf, scattered_quiver_plf):
    scattered_quiver_fn = fn(scattered_quiver_plf)
    jobs = json.loads(open(scattered_quiver_fn).read())
    #ctg_ids = sorted(jobs['ref_seq_data'])
    p_ctg_out = []
    h_ctg_out = []
    job_done_plfs = {}
    for job in jobs:
        ctg_id = job['ctg_id']
        ctg_types = job['ctg_types']
        smrt_bin = job['smrt_bin']
        sge_option = job['sge_option']
        ref_fasta = makePypeLocalFile(job['ref_fasta'])
        read_bam = makePypeLocalFile(job['read_bam'])
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        #ref_fasta = makePypeLocalFile(os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id)))
        #read_bam = makePypeLocalFile(os.path.join(os.getcwd(), './4-quiver/reads/' '{ctg_id}.sam'.format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id=ctg_id)))

        if os.path.exists(fn(read_bam)):  # TODO(CD): Ask Jason what we should do if missing SAM.
            if ctg_types[ctg_id] == 'p':
                p_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            elif ctg_types[ctg_id] == 'h':
                h_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            else:
                LOG.warning('Type is {!r}, not "p" or "h". Why are we running Quiver?'.format(
                    ctg_types[ctg_id]))
            parameters = {
                'job_uid': 'q-' + ctg_id,
                'ctg_id': ctg_id,
                'smrt_bin': smrt_bin,
                'sge_option': sge_option,
            }
            make_quiver_task = PypeTask(
                inputs={
                    'ref_fasta': ref_fasta,
                    'read_bam': read_bam,
                    'scattered_quiver': scattered_quiver_plf,
                },
                outputs={
                    'cns_fasta': cns_fasta,
                    'cns_fastq': cns_fastq,
                    'job_done': job_done,
                },
                parameters=parameters,
            )
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)
            job_done_plfs['{}'.format(ctg_id)] = job_done
    return p_ctg_out, h_ctg_out, job_done_plfs
def taskrun0(self):
    template = """
sleep_s=%(sleep_s)s
ofile=%(ofile)s
set -vex
echo start0
sleep ${sleep_s}
touch ${ofile}
echo end0
"""
    bash = template % dict(
        #ifile=fn(self.i0),
        ofile=fn(self.f0),
        sleep_s=self.parameters['sleep_s'],
    )
    log.debug('taskrun0 bash:\n' + bash)
    script = 'taskrun0.sh'
    with open(script, 'w') as ofs:
        ofs.write(bash)
    #system("bash {}".format(script), check=True)
    #spawn(['/bin/bash', script], check=True) # Beware! Hard to kill procs.
    self.generated_script_fn = script
    return script
def create_tasks_gc(fofn_pfn, referenceset_pfn, parameters):
    """Create a gc task for each chunk, plus a gathering task.
    Here is the convoluted workflow:
    1. For each gc instance "chunk":
       A. variantCaller writes .fasta
       B. We create a contigset for the .fasta
    2. We keep the contigset output filenames in a FOFN (from run_gc_scatter)
       and pass that to run_gc_gather().
    3. We read each contigset and add them to a gathered ContigSet.
    4. We "consolidate" their underlying .fasta "resources", assuming their
       filenames match except extension.
    5. Finally, we write the gathered contigset.
    Whew!
    We also gather fastq here, for convenience.
    """
    tasks = list()
    contigsets = dict()
    fastqs = dict()
    # Assume fofn of gc chunks are all relative to the dir of the fofn.
    for i, alignmentset_bn in enumerate(open(fn(fofn_pfn)).read().split()):
        alignmentset_fn = os.path.join(os.path.dirname(fn(fofn_pfn)), alignmentset_bn)
        wdir = 'run-gc-{:02}'.format(i)
        mkdirs(wdir)  # Assume CWD is correct.
        alignmentset_pfn = makePypeLocalFile(alignmentset_fn)  # New pfn cuz it was not pfn before.
        polished_fastq_pfn = makePypeLocalFile(os.path.join(wdir, 'consensus.fastq'))
        variants_gff_pfn = makePypeLocalFile(os.path.join(wdir, 'variants.gff'))
        consensus_contigset_pfn = makePypeLocalFile(os.path.join(wdir, 'consensus.contigset.xml'))
        """Also produces:
        consensus.fasta
        consensus.fasta.fai
        And note that these file names are important, as pbcoretools
        gathering expects a particular pattern.
        """
        contigsets['contigset_{:02d}'.format(i)] = consensus_contigset_pfn
        fastqs['fastq_{:02d}'.format(i)] = polished_fastq_pfn
        make_task = PypeTask(
            inputs={
                "alignmentset": alignmentset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "polished_fastq": polished_fastq_pfn,
                "variants_gff": variants_gff_pfn,
                "consensus_contigset": consensus_contigset_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_genomic_consensus)
        tasks.append(task)
    contigset_pfn = makePypeLocalFile('run-gc-gather/contigset.xml')
    gathered_fastq_pfn = makePypeLocalFile('run-gc-gather/gathered.fastq')
    inputs = dict(contigsets)
    inputs.update(fastqs)
    log.debug('inputs to gc_gather:{}'.format(pprint.pformat(contigsets)))
    make_task = PypeTask(
        inputs=inputs,
        outputs={
            "ds_out": contigset_pfn,
            "fastq_out": gathered_fastq_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_gather)
    tasks.append(task)
    return tasks, contigset_pfn, gathered_fastq_pfn
def run(wf, config,
        input_config_fn,
        input_fofn_plf,
        ):
    """
    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath('./0-rawreads')
    pread_dir = os.path.abspath('./1-preads_ovl')
    falcon_asm_dir = os.path.abspath('./2-asm-falcon')
    script_dir = os.path.abspath('./scripts')
    sge_log_dir = os.path.abspath('./sge_log')

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config['stop_all_jobs_on_failure']  # only matters for parallel jobs
    wf.max_jobs = config['default_concurrent_jobs']

    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, 'raw-fofn-abs', os.path.basename(config['input_fofn'])))
    make_fofn_abs_task = PypeTask(
        inputs={'i_fofn': input_fofn_plf},
        outputs={'o_fofn': rawread_fofn_plf},
        parameters={},
    )
    fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config['input_type'] == 'raw':
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, 'sleep_done'))
        rdb_build_done = makePypeLocalFile(os.path.join(rawread_dir, 'rdb_build_done'))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, 'run_jobs.sh'))
        parameters = {
            'work_dir': rawread_dir,
            'sge_option': config['sge_option_da'],
            'config_fn': input_config_fn,
            'config': config,
        }

        length_cutoff_plf = makePypeLocalFile(os.path.join(rawread_dir, 'length_cutoff'))
        raw_reads_db_plf = makePypeLocalFile(os.path.join(rawread_dir, '%s.db' % 'raw_reads'))
        make_build_rdb_task = PypeTask(
            inputs={'input_fofn': rawread_fofn_plf},
            outputs={
                'rdb_build_done': rdb_build_done,
                'raw_reads_db': raw_reads_db_plf,
                'length_cutoff': length_cutoff_plf,
                'run_jobs': run_jobs,
            },
            parameters=parameters,
        )
        build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        wf.max_jobs = config['da_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'daligner-scatter', 'scattered.json')
        make_daligner_scatter = PypeTask(
            inputs={
                'run_jobs_fn': run_jobs,
                'db_build_done': rdb_build_done,
            },
            outputs={
                'scatter_fn': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'nblock': raw_reads_nblock,
                'pread_aln': False,
                'config': config,
            },
        )
        task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        daligner_tasks, daligner_out = create_daligner_tasks(rawread_dir, scattered_plf)
        wf.addTasks(daligner_tasks)

        r_gathered_las_plf = makePypeLocalFile(
            os.path.join(rawread_dir, 'raw-gather', 'gathered_las.txt'))
        parameters = {
            'nblock': raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
            inputs=daligner_out,
            outputs={'gathered': r_gathered_las_plf},
            parameters=parameters,
        )
        check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        # Merge .las files.
        wf.max_jobs = config['la_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'merge-scatter', 'scattered.json')
        make_task = PypeTask(
            inputs={
                'run_jobs': run_jobs,
                'gathered_las': r_gathered_las_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_merge_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, p_ids_merged_las = create_merge_tasks(rawread_dir, scattered_plf)
        wf.addTasks(merge_tasks)
        task, _, las_fopfn_plf = create_merge_gather_task(
            os.path.join(rawread_dir, 'merge-gather'), p_ids_merged_las)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config['target'] == 'overlapping':
            sys.exit(0)

        # Produce new FOFN of preads fasta, based on consensus of overlaps.
        wf.max_jobs = config['cns_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'cns-scatter', 'scattered.json')
        make_task = PypeTask(
            inputs={
                'gathered': las_fopfn_plf,
                'db': raw_reads_db_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_consensus_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        tasks, consensus_out = create_consensus_tasks(rawread_dir, scattered_plf)
        wf.addTasks(tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        task, preads_fofn_plf = create_consensus_gather_task(
            os.path.join(rawread_dir, 'preads'), consensus_out)
        wf.addTask(task)

        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_plf = makePypeLocalFile(
            os.path.join(rdir, 'pre_assembly_stats.json'))
        parameters = dict(config)
        parameters['cwd'] = rdir
        make_task = PypeTask(
            inputs={
                'length_cutoff_fn': length_cutoff_plf,
                'raw_reads_db': raw_reads_db_plf,
                'preads_fofn': preads_fofn_plf,
            },
            outputs={
                'pre_assembly_report': pre_assembly_report_plf,
            },
            parameters=parameters,
        )
        task = make_task(pype_tasks.task_report_pre_assembly)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config['target'] == 'pre-assembly':
            log.info('Quitting after stage-0 for "pre-assembly" target.')
            sys.exit(0)

    # build pread database
    if config['input_type'] == 'preads':
        preads_fofn_plf = makePypeLocalFile(
            os.path.join(pread_dir, 'preads-fofn-abs', os.path.basename(config['input_fofn'])))
        make_fofn_abs_task = PypeTask(
            inputs={'i_fofn': rawread_fofn_plf},
            outputs={'o_fofn': preads_fofn_plf},
            parameters={},
        )
        fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(os.path.join(pread_dir, 'pdb_build_done'))
    parameters = {
        'work_dir': pread_dir,
        'sge_option': config['sge_option_pda'],
        'config_fn': input_config_fn,
        'config': config,
    }

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(
        inputs={'preads_fofn': preads_fofn_plf},
        outputs={
            'pdb_build_done': pdb_build_done,
            'preads_db': preads_db,
            'run_jobs': run_jobs,
        },
        parameters=parameters,
    )
    build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    wf.max_jobs = config['pda_concurrent_jobs']
    config['sge_option_da'] = config['sge_option_pda']
    scattered_plf = os.path.join(pread_dir, 'daligner-scatter', 'scattered.json')
    make_daligner_scatter = PypeTask(
        inputs={
            'run_jobs_fn': run_jobs,
            'db_build_done': pdb_build_done,
        },
        outputs={
            'scatter_fn': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'nblock': preads_nblock,
            'pread_aln': True,
            'config': config,
        },
    )
    task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    daligner_tasks, daligner_out = create_daligner_tasks(pread_dir, scattered_plf)
    wf.addTasks(daligner_tasks)

    p_gathered_las_plf = makePypeLocalFile(
        os.path.join(pread_dir, 'gathered-las', 'gathered-las.txt'))
    parameters = {
        'nblock': preads_nblock,
    }
    make_daligner_gather = PypeTask(
        inputs=daligner_out,
        outputs={'gathered': p_gathered_las_plf},
        parameters=parameters,
    )
    check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Merge .las files.
    wf.max_jobs = config['pla_concurrent_jobs']
    config['sge_option_la'] = config['sge_option_pla']
    scattered_plf = os.path.join(pread_dir, 'merge-scatter', 'scattered.json')
    make_task = PypeTask(
        inputs={
            'run_jobs': run_jobs,
            'gathered_las': p_gathered_las_plf,
        },
        outputs={
            'scattered': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'config': config,
        },
    )
    task = make_task(pype_tasks.task_merge_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, p_ids_merged_las = create_merge_tasks(pread_dir, scattered_plf)
    wf.addTasks(merge_tasks)
    task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(
        os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Draft assembly (called 'fc_' for now)
    wf.max_jobs = config['fc_concurrent_jobs']
    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done = makePypeLocalFile(os.path.join(db2falcon_dir, 'db2falcon_done'))
    preads4falcon_plf = makePypeLocalFile(os.path.join(db2falcon_dir, 'preads4falcon.fasta'))
    make_run_db2falcon = PypeTask(
        inputs={
            'las_fofn_plf': las_fofn_plf,
            'preads_db': preads_db,
        },
        outputs={
            'db2falcon_done': db2falcon_done,
            'preads4falcon': preads4falcon_plf,
        },
        parameters={
            'wd': db2falcon_dir,
            'config': config,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile(os.path.join(falcon_asm_dir, 'falcon_asm_done'))
    make_run_falcon_asm = PypeTask(
        inputs={
            'db2falcon_done': db2falcon_done,
            'db_file': preads_db,
            'preads4falcon': preads4falcon_plf,
            'las_fofn': las_fofn_plf,
        },
        outputs={'falcon_asm_done': falcon_asm_done},
        parameters={
            'wd': falcon_asm_dir,
            'config': config,
            'pread_dir': pread_dir,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
def dump_pread_ids(self):
    pread_db = fn(self.pread_db)
    pread_id_file = fn(self.pread_id_file)
    os.system("DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" %
              (pread_db, pread_id_file))
def flow(config):
    #import pdb; pdb.set_trace()
    parameters = config
    #exitOnFailure=config['stop_all_jobs_on_failure'] # only matters for parallel jobs
    #wf.refreshTargets(exitOnFailure=exitOnFailure)
    #wf = PypeThreadWorkflow()
    #wf = PypeWorkflow()
    #wf = PypeWorkflow(job_type='local')
    log.debug('config=\n{}'.format(pprint.pformat(config)))

    # Set some defaults on the Workflow.
    concurrent_jobs = 24  # TODO: Configure this.
    wf = PypeWorkflow(
        job_type=config['hgap'].get('job_type'),
        job_queue=config['hgap'].get('job_queue'),
        watcher_type=config['hgap'].get('pwatcher_type', 'blocking'),
        #watcher_directory=config['pwatcher_directory'],
        max_jobs=config['hgap'].get('max_jobs', concurrent_jobs),
    )

    use_tmpdir = config['hgap'].get('use_tmpdir')
    if use_tmpdir:
        log.info('hgap.use_tmpdir={!r}'.format(use_tmpdir))
        if use_tmpdir is not True and '/' in use_tmpdir:
            tempfile.tempdir = use_tmpdir
            log.info('Using tempfile.tempdir={}'.format(tempfile.tempdir))
        else:
            log.info('Keeping tempfile.tempdir={}'.format(tempfile.tempdir))

    dataset_pfn = makePypeLocalFile(config['pbsmrtpipe']['input_files'][0])
    filtered_pfn = makePypeLocalFile('run-filterbam/filtered.subreadset.xml')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
        },
        outputs={
            "filtered": filtered_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_filterbam)
    wf.addTask(task)

    split_subreadsets_fofn_pfn = makePypeLocalFile('run-bam_scatter/chunked_subreadsets.fofn')
    make_task = PypeTask(
        inputs={
            "dataset": filtered_pfn,
        },
        outputs={
            "split_subreadsets_fofn": split_subreadsets_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_bam_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, input_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    fc_cfg_pfn = makePypeLocalFile('run-falcon/fc.cfg')
    fc_json_config_pfn = makePypeLocalFile('run-falcon/fc.json')
    make_task = PypeTask(
        inputs={
            "input_fofn": input_fofn_pfn,
        },
        outputs={
            "fc_cfg": fc_cfg_pfn,
            "fc_json_config": fc_json_config_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_prepare_falcon)
    wf.addTask(task)
    wf.refreshTargets()

    input_config_fn = fn(fc_cfg_pfn)
    with sys.cd('run-falcon'):
        falcon_kit.mains.run1.fc_run_logger = falcon_kit.run_support.logger = logging.getLogger('falcon')
        fc_cfg = falcon_kit.run_support.get_dict_from_old_falcon_cfg(
            falcon_kit.run_support.parse_config(input_config_fn))
        # FALCON takes over the workflow for a while.
        # (For debugging, it is still possible to restart just fc_run, if desired.)
        falcon_asm_done_pfn = falcon_kit.mains.run1.run(
            wf, fc_cfg,
            input_config_fn,
            input_fofn_plf=input_fofn_pfn,  # _pfn should be _plf, but oh well
        )
    wf.max_jobs = concurrent_jobs  # in case Falcon changed this

    # Here is a hard-linking task to help us attach falcon into the dependency graph.
    falcon_link_done_pfn = makePypeLocalFile('run-falcon_link/falcon_link_done')
    make_task = PypeTask(
        inputs={
            "falcon_asm_done": falcon_asm_done_pfn,
        },
        outputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_falcon_link)
    wf.addTask(task)

    # The rest of the workflow will operate on datasets, not fasta directly.
    referenceset_pfn = makePypeLocalFile('run-fasta2referenceset/asm.referenceset.xml')
    make_task = PypeTask(
        inputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        outputs={
            "referenceset": referenceset_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_fasta2referenceset)
    wf.addTask(task)
    wf.refreshTargets()

    # scatter the subreads for pbalign
    """Produces:
    pbalign_chunk.json
    chunk_subreadset_*.subreadset.xml
    """
    pbalign_chunk_json_pfn = makePypeLocalFile('run-pbalign-scatter/pbalign_chunk.json')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_json": pbalign_chunk_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    # After scattering, we can specify the pbalign jobs.
    tasks, alignmentset_pfn = create_tasks_pbalign(
        pbalign_chunk_json_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # scatter the alignmentset for genomic_consensus (variantCaller)
    """Produces:
    gc.chunks.fofn
    ???*.contigset.xml ???
    """
    gc_chunks_fofn_pfn = makePypeLocalFile('run-gc_scatter/gc.chunks.fofn')
    make_task = PypeTask(
        inputs={
            "alignmentset": alignmentset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_fofn": gc_chunks_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, contigset_pfn, gathered_fastq_pfn = create_tasks_gc(
        gc_chunks_fofn_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # Final report
    polished_assembly_report_json_pfn = makePypeLocalFile(
        'run-polished-assembly-report/polished_assembly_report.json')
    make_task = PypeTask(
        inputs={
            "referenceset": referenceset_pfn,
            "gathered_alignmentset": alignmentset_pfn,
            "polished_fastq": gathered_fastq_pfn,
        },
        outputs={
            "report_json": polished_assembly_report_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_polished_assembly_report)
    wf.addTask(task)
    wf.refreshTargets()

    par_dir = os.path.dirname(fn(polished_assembly_report_json_pfn))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality.png'))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality_thumb.png'))
    #return
    ##############

    if not os.path.exists('foo.bar1'):
        sys.system('touch foo.bar1')
    foo_fn1 = makePypeLocalFile('foo.bar1')
    foo_fn2 = makePypeLocalFile('foo.bar2')
    make_task = PypeTask(
        inputs={
            "foo1": foo_fn1,
        },
        outputs={
            "foo2": foo_fn2,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_foo)
    wf.addTask(task)
    wf.refreshTargets()
def task_gather_quiver(self):
    """We wrote the "gathered" files during task construction.
    """
    job_done_fn = fn(self.job_done)
    touch(job_done_fn)
def get_phased_blocks(self): vmap_fn = fn(self.vmap_file) atable_fn = fn(self.atable_file) p_variant_fn = fn(self.phased_variant_file) left_connect = {} right_connect = {} c_score = {} states = {} positions = set() ref_base = {} with open(vmap_fn) as f: for l in f: l = l.strip().split() pos = int(l[0]) ref_b = l[1] v_b = l[2] q_id = int(l[3]) ref_base[pos] = ref_b with open(atable_fn) as f: for l in f: l = l.strip().split() pos1, b11, b12, pos2, b21, b22, s11, s12, s21, s22 = l s11, s12, s21, s22 = int(s11), int(s12), int(s21), int(s22) if abs(s11+s22-s12-s21) < 6: continue pos1 = int(pos1) pos2 = int(pos2) positions.add(pos1) positions.add(pos2) right_connect.setdefault(pos1, []) right_connect[pos1].append(pos2) left_connect.setdefault(pos2, []) left_connect[pos2].append(pos1) c_score[ (pos1, pos2) ] = { (b11+b21, b12+b22): s11 + s22, (b12+b22, b11+b21): s11 + s22, (b12+b21, b11+b22): s12 + s21, (b11+b22, b12+b21): s12 + s21 } if pos1 not in states: st1 = (b11, b12) st2 = (b12, b11) score1 = 0 score2 = 0 for pp in left_connect.get(pos1,[]): if pp in states: st0 = states[pp] else: continue score1 += get_score( c_score, pp, pos1, st0, st1 ) score2 += get_score( c_score, pp, pos1, st0, st2 ) for pp in right_connect.get(pos1,[]): if pp in states: st0 = states[pp] else: continue score1 += get_score( c_score, pos1, pp, st1, st0 ) score2 += get_score( c_score, pos1, pp, st2, st0 ) if score1 >= score2: states[pos1] = st1 else: states[pos1] = st2 if pos2 not in states: st1 = (b21, b22) st2 = (b22, b21) score1 = 0 score2 = 0 for pp in left_connect.get(pos2,[]): if pp in states: st0 = states[pp] else: continue score1 += get_score( c_score, pp, pos2, st0, st1 ) score2 += get_score( c_score, pp, pos2, st0, st2 ) for pp in right_connect.get(pos2,[]): if pp in states: st0 = states[pp] else: continue score1 += get_score( c_score, pos2, pp, st1, st0 ) score2 += get_score( c_score, pos2, pp, st2, st0 ) if score1 >= score2: states[pos2] = st1 else: states[pos2] = st2 positions = list(positions) positions.sort() iter_count = 0 while 1: iter_count += 1 if iter_count > 10: break update_count = 0 for p in positions: b1, b2 = states[p] st1 = (b1, b2) st2 = (b2, b1) score1 = 0 score2 = 0 for pp in left_connect.get(p,[]): st0 = states[pp] score1 += get_score( c_score, pp, p, st0 ,st1) score2 += get_score( c_score, pp, p, st0, st2) #for pp in right_connect.get(p,[]): # st0 = states[pp] # score1 += get_score( c_score, p, pp, st1 ,st0) # score2 += get_score( c_score, p, pp, st2, st0) if score1 >= score2: states[p] = st1 else: states[p] = st2 update_count += 1 if update_count == 0: break right_extent = {} right_score = {} left_extent = {} left_score = {} for p in positions: left_extent[p] = p left_score[p] = 0 if p in left_connect: left = p st0 = states[p] st0_ = st0[1], st0[0] for pp in left_connect[p]: st1 = states[pp] s = get_score( c_score, pp, p, st1, st0) s_ = get_score( c_score, pp, p, st1, st0_) left_score[p] += s - s_ if s - s_ > 0 and pp < left: left = pp left_extent[p] = left right_extent[p] = p right_score[p] = 0 if p in right_connect: right = p st0 = states[p] st0_ = st0[1], st0[0] for pp in right_connect[p]: st1 = states[pp] s = get_score( c_score, p, pp, st0, st1) s_ = get_score( c_score, p, pp, st0_, st1) right_score[p] += s - s_ if s - s_ > 0 and pp > right: right = pp right_extent[p] = right phase_block_id = 1 phase_blocks = {} pb = [] max_right_ext = 0 for p in positions: if right_score[p] < 10 or left_score[p] < 10: continue b1, b2 = states[p] if max_right_ext < left_extent[p]: if len(pb) > 3: 
phase_blocks[phase_block_id] = pb phase_block_id += 1 pb = [] pb.append( (p, b1, b2) ) if right_extent[p] > max_right_ext: max_right_ext = right_extent[p] if len(pb) > 3: phase_blocks[phase_block_id] = pb else: phase_block_id -= 1 with open(p_variant_fn, "w") as out_f: for pid in xrange(1, phase_block_id+1): if len(phase_blocks[pid]) == 0: continue min_ = min( [x[0] for x in phase_blocks[pid]] ) max_ = max( [x[0] for x in phase_blocks[pid]] ) print >>out_f, "P", pid, min_, max_, max_ - min_, len(phase_blocks[pid]), 1.0 * (max_-min_)/len(phase_blocks[pid]) for p, b1, b2 in phase_blocks[pid]: rb = ref_base[p] print >>out_f, "V", pid, p, "%d_%s_%s" % (p,rb,b1), "%d_%s_%s" % (p,rb,b2), left_extent[p], right_extent[p], left_score[p], right_score[p]
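# get_phased_blocks() and its refinement loop lean on a get_score() helper
# defined elsewhere in this module. A minimal sketch consistent with how
# c_score is keyed above (an inferred reconstruction, not the verbatim
# original): each state is a (base_a, base_b) tuple, and the score of a state
# pair is the read support for that joint assignment.
def get_score(c_score, pos1, pos2, s1, s2):
    if pos1 > pos2:  # c_score is keyed on the ordered position pair
        pos1, pos2 = pos2, pos1
        s1, s2 = s2, s1
    b11, b12 = s1  # state at the left position
    b21, b22 = s2  # state at the right position
    return c_score[(pos1, pos2)][(b11 + b21, b12 + b22)]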
def generate_association_table(self): vmap_fn = fn(self.vmap_file) atable_fn = fn(self.atable_file) ctg_id = self.parameters["ctg_id"] base_dir = self.parameters["base_dir"] vmap = {} v_positions = [] with open(vmap_fn) as f: for l in f: l = l.strip().split() pos = int(l[0]) ref_b = l[1] v_b = l[2] q_id = int(l[3]) if (pos, ref_b) not in vmap: v_positions.append( (pos, ref_b) ) vmap.setdefault( (pos, ref_b), {} ) vmap[ (pos, ref_b) ].setdefault(v_b, []) vmap[ (pos, ref_b) ][v_b].append( q_id ) #xary = [] #yary = [] with open(atable_fn, "w") as out_f: for i1 in xrange(len(v_positions)): link_count = 0 for i2 in xrange(i1+1, len(v_positions)): pos1, rb1 = v_positions[i1] pos2, rb2 = v_positions[i2] if pos2 - pos1 > (1 << 16): continue ct = {} p1table = [] p2table = [] s1 = 0 list1 = vmap[ (pos1, rb1) ].items() for b1, qids1 in list1: p1table.append( (b1, len(qids1) ) ) s1 += len(qids1) s2 = 0 list2 = vmap[ (pos2, rb2) ].items() for b2, qids2 in list2: p2table.append( (b2, len(qids2) ) ) s2 += len(qids2) total_s = 0 for b1, qids1 in list1: for b2, qids2 in list2: s = len(set(qids1) & set(qids2)) ct[(b1,b2)] = s total_s += s if total_s < 6: continue b11 = p1table[0][0] b12 = p1table[1][0] b21 = p2table[0][0] b22 = p2table[1][0] print >> out_f, pos1, b11, b12, pos2, b21, b22, ct[(b11,b21)], ct[(b11,b22)], ct[(b12,b21)], ct[(b12,b22)] #xary.append(pos1) #yary.append(pos2) link_count += 1 if link_count > 500: break
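# Each association-table row written above carries ten whitespace-separated
# fields, exactly what get_phased_blocks() parses back in: pos1, its two
# candidate bases, pos2, its two candidate bases, and the four co-occurrence
# counts. A purely illustrative row:
example_atable_row = "1001 A G 1450 C T 42 1 2 39"
# Here s11+s22 = 81 reads support the in-phase pairing (A..C / G..T) while
# s12+s21 = 3 support the alternative, so the pair passes the
# abs(s11+s22-s12-s21) >= 6 filter and strongly favors the first phasing.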
def make_het_call(self): bam_fn = fn(self.bam_file) ctg_id = self.parameters["ctg_id"] ref_seq = self.parameters["ref_seq"] base_dir = self.parameters["base_dir"] samtools = self.parameters["samtools"] vmap_fn = fn(self.vmap_file) vpos_fn = fn(self.vpos_file) q_id_map_fn = fn(self.q_id_map_file) # maybe we should check if the samtools path is valid p = subprocess.Popen(shlex.split("%s view %s %s" % (samtools, bam_fn, ctg_id) ), stdout=subprocess.PIPE) pileup = {} q_id_map = {} q_max_id = 0 q_id = 0 q_name_to_id = {} try: os.makedirs("%s/%s" % (base_dir, ctg_id)) except OSError: pass vmap = open(vmap_fn, "w") vpos = open(vpos_fn, "w") for l in p.stdout: l = l.strip().split() if l[0][0] == "@": continue QNAME = l[0] if QNAME not in q_name_to_id: q_id = q_max_id q_name_to_id[QNAME] = q_id q_max_id += 1 q_id = q_name_to_id[QNAME] q_id_map[q_id] = QNAME FLAG = int(l[1]) RNAME = l[2] POS = int(l[3]) - 1 # convert to zero base CIGAR = l[5] SEQ = l[9] rp = POS qp = 0 skip_base = 0 total_aln_pos = 0 for m in re.finditer(cigar_re, CIGAR): adv = int(m.group(1)) total_aln_pos += adv if m.group(2) == "S": skip_base += adv if 1.0 - 1.0 * skip_base / total_aln_pos < 0.1: continue if total_aln_pos < 2000: continue for m in re.finditer(cigar_re, CIGAR): adv = int(m.group(1)) if m.group(2) == "S": qp += adv if m.group(2) in ("M", "=", "X"): matches = [] for i in range(adv): matches.append( (rp, SEQ[qp]) ) rp += 1 qp += 1 for pos, b in matches: pileup.setdefault(pos, {}) pileup[pos].setdefault(b, []) pileup[pos][b].append(q_id) elif m.group(2) == "I": for i in range(adv): qp += 1 elif m.group(2) == "D": for i in range(adv): rp += 1 pos_k = pileup.keys() pos_k.sort() th = 0.25 for pos in pos_k: if pos < POS: if len(pileup[pos]) < 2: del pileup[pos] continue base_count = [] total_count = 0 for b in ["A", "C", "G", "T"]: count = len(pileup[pos].get(b,[])) base_count.append( (count, b) ) total_count += count if total_count < 10: del pileup[pos] continue base_count.sort() base_count.reverse() p0 = 1.0 * base_count[0][0] / total_count p1 = 1.0 * base_count[1][0] / total_count if p0 < 1.0 - th and p1 > th: b0 = base_count[0][1] b1 = base_count[1][1] ref_base = ref_seq[pos] print >> vpos, pos+1, ref_base, total_count, " ".join(["%s %d" % (x[1], x[0]) for x in base_count]) for q_id_ in pileup[pos][b0]: print >> vmap, pos+1, ref_base, b0, q_id_ for q_id_ in pileup[pos][b1]: print >> vmap, pos+1, ref_base, b1, q_id_ del pileup[pos] q_id_map_f = open(q_id_map_fn, "w") for q_id, q_name in q_id_map.items(): print >> q_id_map_f, q_id, q_name
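# make_het_call() relies on a module-level cigar_re. The standard CIGAR
# pattern below matches the usage above (a length, then one op code); the
# original is defined elsewhere in this file, so treat this as an assumption:
import re
cigar_re = r"(\d+)([MIDNSHP=X])"
# e.g. [(m.group(1), m.group(2)) for m in re.finditer(cigar_re, '10S90M2D')]
# yields [('10', 'S'), ('90', 'M'), ('2', 'D')].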
def dump_pread_to_ctg(self): pread_db = fn( self.pread_db ) rawread_id_file = fn( self.rawread_id_file ) pread_id_file = fn( self.pread_id_file ) phased_read_file = fn( self.phased_reads) read_to_contig_map = fn( self.read_to_contig_map ) las_file = fn( self.las_file ) pread_to_contig_file = fn( self.pread_to_contig_file ) read_to_contig_map = fn( self.read_to_contig_map ) pid_to_rid = open(pread_id_file).read().split("\n") rid_to_oid = open(rawread_id_file).read().split("\n") ovlp_data = [] ovlp_count = 0 longest_ovlp = 0 a_id = None pid_to_contigs = {} with open(read_to_contig_map) as f: for row in f: row = row.strip().split() pid, rid, oid, ctg = row pid = int(pid) pid_to_contigs.setdefault( pid, (oid, set() ) ) pid_to_contigs[ pid ][1].add( ctg ) oid_to_phase = {} with open(phased_read_file) as f: for row in f: row = row.strip().split() ctg_id, block, phase = row[1:4] oid = row[6] block = int(block) phase = int(phase) oid_to_phase[ oid ] = (ctg_id, block, phase) with open(pread_to_contig_file, "w") as f: ovlp_data = {} cur_read_id = None skip_rest = 0 for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file)) ).splitlines(): row = row.strip().split() t_id = int(row[1]) q_id = int(row[0]) if q_id != cur_read_id: if cur_read_id == None: cur_read_id = q_id else: if len(ovlp_data) == 0: rid = pid_to_rid[cur_read_id].split("/")[1] rid = int(int(rid)/10) o_id = rid_to_oid[ rid ] print >>f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0) else: ovlp_v = ovlp_data.values() ovlp_v.sort() rank = 0 for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v: print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg) rank += 1 ovlp_data = {} cur_read_id = q_id skip_rest = 0 if q_id in pid_to_contigs and len(ovlp_data) == 0: #if the query is in some contig.... t_o_id, ctgs = pid_to_contigs[ q_id ] rid = pid_to_rid[q_id].split("/")[1] rid = int(int(rid)/10) o_id = rid_to_oid[ rid ] for ctg in list(ctgs): ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1]) ovlp_data[ctg][0] = -int(row[7]) ovlp_data[ctg][1] += 1 skip_rest = 1 if skip_rest == 1: continue if t_id not in pid_to_contigs: continue q_rid = int( int(pid_to_rid[q_id].split("/")[1])/10 ) q_phase = oid_to_phase.get( rid_to_oid[ q_rid ], None ) if q_phase != None: ctg_id, block, phase = q_phase if block != -1: t_rid = int( int(pid_to_rid[t_id].split("/")[1])/10 ) t_phase = oid_to_phase.get( rid_to_oid[ t_rid ], None ) if t_phase != None: if t_phase[0] == ctg_id and t_phase[1] == block and t_phase[2] != phase: continue t_o_id, ctgs = pid_to_contigs[ t_id ] rid = pid_to_rid[q_id].split("/")[1] rid = int(int(rid)/10) o_id = rid_to_oid[ rid ] for ctg in list(ctgs): ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0]) ovlp_data[ctg][0] += int(row[2]) ovlp_data[ctg][1] += 1 if len(ovlp_data) != 0: ovlp_v = ovlp_data.values() ovlp_v.sort() rank = 0 for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v: print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg) rank += 1
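# The repeated "split('/')[1], then integer-divide by 10" above decodes this
# pipeline's pread naming: the middle field of a pread header holds the
# raw-read row id scaled by 10. A toy example (the 'prolog' header is
# hypothetical):
pread_name = "prolog/001230/0_9876"
rid = int(int(pread_name.split("/")[1]) / 10)
assert rid == 123  # rid_to_oid[123] would then give the original read name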
def flow(config): #import pdb; pdb.set_trace() parameters = config #exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs #wf.refreshTargets(exitOnFailure=exitOnFailure) #wf = PypeThreadWorkflow() #wf = PypeWorkflow() #wf = PypeWorkflow(job_type='local') log.debug('config=\n{}'.format(pprint.pformat(config))) # Set some defaults on the Workflow. concurrent_jobs = 24 # TODO: Configure this. wf = PypeWorkflow( job_type=config['hgap'].get('job_type'), job_queue=config['hgap'].get('job_queue'), watcher_type=config['hgap'].get('pwatcher_type', 'blocking'), #watcher_directory=config['pwatcher_directory'], max_jobs=config['hgap'].get('max_jobs', concurrent_jobs), ) use_tmpdir = config['hgap'].get('use_tmpdir') if use_tmpdir: log.info('hgap.use_tmpdir={!r}'.format(use_tmpdir)) if use_tmpdir is not True and '/' in use_tmpdir: tempfile.tempdir = use_tmpdir log.info('Using tempfile.tempdir={}'.format(tempfile.tempdir)) else: log.info('Keeping tempfile.tempdir={}'.format(tempfile.tempdir)) dataset_pfn = makePypeLocalFile(config['pbsmrtpipe']['input_files'][0]) filtered_pfn = makePypeLocalFile('run-filterbam/filtered.subreadset.xml') make_task = PypeTask( inputs = {"dataset": dataset_pfn, }, outputs = {"filtered": filtered_pfn, }, parameters = parameters, ) task = make_task(start_task.task_filterbam) wf.addTask(task) split_subreadsets_fofn_pfn = makePypeLocalFile('run-bam_scatter/chunked_subreadsets.fofn') make_task = PypeTask( inputs = {"dataset": filtered_pfn, }, outputs = {"split_subreadsets_fofn": split_subreadsets_fofn_pfn, }, parameters = parameters, ) task = make_task(start_task.task_bam_scatter) wf.addTask(task) wf.refreshTargets() tasks, input_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters) wf.addTasks(tasks) wf.refreshTargets() fc_cfg_pfn = makePypeLocalFile('run-falcon/fc.cfg') fc_json_config_pfn = makePypeLocalFile('run-falcon/fc.json') make_task = PypeTask( inputs = { "input_fofn": input_fofn_pfn, }, outputs = {"fc_cfg": fc_cfg_pfn, "fc_json_config": fc_json_config_pfn, }, parameters = parameters, ) task = make_task(start_task.task_prepare_falcon) wf.addTask(task) wf.refreshTargets() input_config_fn = fn(fc_cfg_pfn) with sys.cd('run-falcon'): falcon_kit.mains.run1.fc_run_logger = falcon_kit.run_support.logger = logging.getLogger('falcon') fc_cfg = falcon_kit.run_support.get_dict_from_old_falcon_cfg( falcon_kit.run_support.parse_config(input_config_fn)) # FALCON takes over the workflow for a while. # (For debugging, it is still possible to restart just fc_run, if desired.) falcon_asm_done_pfn = falcon_kit.mains.run1.run(wf, fc_cfg, input_config_fn, input_fofn_plf=input_fofn_pfn, # _pfn should be _plf, but oh well ) wf.max_jobs = concurrent_jobs # in case Falcon changed this # Here is a hard-linking task to help us attach falcon into the dependency graph. falcon_link_done_pfn = makePypeLocalFile('run-falcon_link/falcon_link_done') make_task = PypeTask( inputs = {"falcon_asm_done": falcon_asm_done_pfn,}, outputs = { "falcon_link_done": falcon_link_done_pfn, }, parameters = parameters, ) task = make_task(start_task.task_falcon_link) wf.addTask(task) # The rest of the workflow will operate on datasets, not fasta directly. 
referenceset_pfn = makePypeLocalFile('run-fasta2referenceset/asm.referenceset.xml') make_task = PypeTask( inputs = {"falcon_link_done": falcon_link_done_pfn,}, outputs = {"referenceset": referenceset_pfn,}, parameters = parameters, ) task = make_task(start_task.task_fasta2referenceset) wf.addTask(task) wf.refreshTargets() # scatter the subreads for pbalign """Produces: pbalign_chunk.json chunk_subreadset_*.subreadset.xml """ pbalign_chunk_json_pfn = makePypeLocalFile('run-pbalign-scatter/pbalign_chunk.json') make_task = PypeTask( inputs = {"dataset": dataset_pfn, "referenceset": referenceset_pfn,}, outputs = {"out_json": pbalign_chunk_json_pfn,}, parameters = parameters, ) task = make_task(start_task.task_pbalign_scatter) wf.addTask(task) wf.refreshTargets() # After scattering, we can specify the pbalign jobs. tasks, alignmentset_pfn = create_tasks_pbalign(pbalign_chunk_json_pfn, referenceset_pfn, parameters) wf.addTasks(tasks) wf.refreshTargets() # scatter the alignmentset for genomic_consensus (variantCaller) """Produces: gc.chunks.fofn ??? *.contigset.xml ??? """ gc_chunks_fofn_pfn = makePypeLocalFile('run-gc_scatter/gc.chunks.fofn') make_task = PypeTask( inputs = {"alignmentset": alignmentset_pfn, "referenceset": referenceset_pfn,}, outputs = {"out_fofn": gc_chunks_fofn_pfn,}, parameters = parameters, ) task = make_task(start_task.task_gc_scatter) wf.addTask(task) wf.refreshTargets() tasks, contigset_pfn, gathered_fastq_pfn = create_tasks_gc(gc_chunks_fofn_pfn, referenceset_pfn, parameters) wf.addTasks(tasks) wf.refreshTargets() # Final report polished_assembly_report_json_pfn = makePypeLocalFile('run-polished-assembly-report/polished_assembly_report.json') make_task = PypeTask( inputs = {"referenceset": referenceset_pfn, "gathered_alignmentset": alignmentset_pfn, "polished_fastq": gathered_fastq_pfn,}, outputs = {"report_json": polished_assembly_report_json_pfn,}, parameters = parameters, ) task = make_task(start_task.task_polished_assembly_report) wf.addTask(task) wf.refreshTargets() par_dir = os.path.dirname(fn(polished_assembly_report_json_pfn)) sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality.png')) sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality_thumb.png')) #return ############## if not os.path.exists('foo.bar1'): sys.system('touch foo.bar1') foo_fn1 = makePypeLocalFile('foo.bar1') foo_fn2 = makePypeLocalFile('foo.bar2') make_task = PypeTask( inputs = {"foo1": foo_fn1,}, outputs = {"foo2": foo_fn2,}, parameters = parameters, ) task = make_task(start_task.task_foo) wf.addTask(task) wf.refreshTargets()
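# flow() reads only a handful of configuration keys. A minimal, hypothetical
# config that would satisfy the lookups above (key names are from the code;
# values are invented):
example_flow_config = {
    'hgap': {
        'job_type': 'local',
        'job_queue': None,
        'pwatcher_type': 'blocking',
        'max_jobs': 24,
        'use_tmpdir': False,  # may also be True, or a '/path' assigned to tempfile.tempdir
    },
    'pbsmrtpipe': {
        'input_files': ['mydata.subreadset.xml'],
    },
}
# flow(example_flow_config) would then build the task graph sketched above.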
def run(wf, config, rule_writer, config_fn, input_fofn_plf, ): """ Preconditions (for now): * LOG * run_support.logger """ parsed_config = io.deserialize(config_fn) if parsed_config != config: msg = 'Config from {!r} != passed config'.format(config_fn) raise Exception(msg) general_config = config['General'] general_config_fn = os.path.join(os.path.dirname(config_fn), 'General_config.json') io.serialize(general_config_fn, general_config) # Some tasks use this. rawread_dir = '0-rawreads' pread_dir = '1-preads_ovl' falcon_asm_dir = '2-asm-falcon' for d in (rawread_dir, pread_dir, falcon_asm_dir): support.make_dirs(d) # only matter for parallel jobs job_defaults = config['job.defaults'] exitOnFailure = bool(job_defaults.get('stop_all_jobs_on_failure', False)) default_njobs = int(job_defaults.get('njobs', 7)) wf.max_jobs = default_njobs assert general_config['input_type'] in ( 'raw', 'preads'), 'Invalid input_type=={!r}'.format(general_config['input_type']) # Store config as JSON, available to many tasks. if general_config['input_type'] == 'raw': parameters = {} # import sequences into daligner DB # calculate length_cutoff (if specified as -1) # split DB # run DBdust r_db_dust_fn = os.path.join(rawread_dir, 'build', 'raw_reads.db') length_cutoff_fn = os.path.join(rawread_dir, 'build', 'length_cutoff') wf.addTask(gen_task( script=pype_tasks.TASK_DB_BUILD_SCRIPT, inputs={ 'config': general_config_fn, 'input_fofn': fn(input_fofn_plf), }, outputs={ 'length_cutoff': length_cutoff_fn, 'db': r_db_dust_fn, # Also .raw_reads.*, of course. And dust track. }, parameters=dict( ), rule_writer=rule_writer, dist=Dist(NPROC=1), )) # run TANmask tan_uows_fn = os.path.join( rawread_dir, 'tan-split', 'tan-uows.json') tan_bash_template_fn = os.path.join( rawread_dir, 'tan-split', 'bash_template.sh') wf.addTask(gen_task( script=pype_tasks.TASK_DB_TAN_SPLIT_SCRIPT, inputs={ 'config': general_config_fn, 'db': r_db_dust_fn, }, outputs={ 'split': tan_uows_fn, 'bash_template': tan_bash_template_fn, }, parameters={}, rule_writer=rule_writer, dist=Dist(NPROC=1), )) gathered_fn = os.path.join(rawread_dir, 'tan-gathered', 'gathered-done-files.json') gen_parallel_tasks( wf, rule_writer, tan_uows_fn, gathered_fn, run_dict=dict( bash_template_fn=tan_bash_template_fn, script='fubar-TODO', #pype_tasks.TASK_DB_TAN_APPLY_SCRIPT, # for snakemake stuff inputs={ 'units_of_work': '0-rawreads/tan-chunks/{tan0_id}/some-units-of-work.json', }, outputs={ #'job_done': '0-rawreads/{dal0_id}/daligner.done', 'results': '0-rawreads/tan-runs/{tan0_id}/some-done-files.json', }, parameters={}, ), dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']), ) r_db_tan_fn = os.path.join(rawread_dir, 'tan-combine', 'raw_reads.db') wf.addTask(gen_task( script=pype_tasks.TASK_DB_TAN_COMBINE_SCRIPT, inputs={ 'config': general_config_fn, 'db': r_db_dust_fn, 'gathered': gathered_fn, }, outputs={ 'new_db': r_db_tan_fn, }, parameters={}, rule_writer=rule_writer, dist=Dist(local=True), )) # run daligner wf.max_jobs = config['job.step.da'].get('njobs', default_njobs) #rawreads_db_fn = os.path.join(rawread_dir, 'raw_reads.db') daligner_all_units_fn = os.path.join( rawread_dir, 'daligner-split', 'all-units-of-work.json') daligner_bash_template_fn = os.path.join( rawread_dir, 'daligner-split', 'daligner_bash_template.sh') params = dict(parameters) #params['db_prefix'] = 'raw_reads' #params['pread_aln'] = 0 params['skip_checks'] = int(general_config.get('skip_checks', 0)) params['wildcards'] = 'dal0_id' wf.addTask(gen_task( 
script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT, inputs={ 'config': general_config_fn, 'db': r_db_tan_fn, 'length_cutoff': length_cutoff_fn, }, outputs={ 'split': daligner_all_units_fn, 'bash_template': daligner_bash_template_fn }, parameters=params, rule_writer=rule_writer, dist=Dist(local=True, NPROC=4), # really, NPROC=1, but we need to know the max )) gathered_fn = os.path.join(rawread_dir, 'daligner-gathered', 'gathered-done-files.json') gen_parallel_tasks( wf, rule_writer, daligner_all_units_fn, gathered_fn, run_dict=dict( bash_template_fn=daligner_bash_template_fn, script=pype_tasks.TASK_DB_DALIGNER_APPLY_SCRIPT, # for snakemake stuff inputs={ 'units_of_work': os.path.join(rawread_dir, 'daligner-chunks/{dal0_id}/some-units-of-work.json'), }, outputs={ 'results': os.path.join(rawread_dir, 'daligner-runs/{dal0_id}/some-done-files.json'), }, parameters={}, ), dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']), ) r_gathered_las_fn = os.path.join(rawread_dir, 'daligner-combine', 'gathered-las.json') wf.addTask(gen_task( script=pype_tasks.TASK_DB_DALIGNER_COMBINE_SCRIPT, inputs={ 'config': general_config_fn, 'db': r_db_tan_fn, 'gathered': gathered_fn, }, outputs={ 'las_paths': r_gathered_las_fn, }, parameters={}, rule_writer=rule_writer, #dist=Dist(NPROC=1, MB=4000, job_dict=config['job.step.da']) dist=Dist(local=True), )) # Merge .las files. wf.max_jobs = config['job.step.la'].get('njobs', default_njobs) las_merge_all_units_fn = os.path.join(rawread_dir, 'las-merge-split', 'all-units-of-work.json') bash_template_fn = os.path.join(rawread_dir, 'las-merge-split', 'las-merge-bash-template.sh') params = dict(parameters) params['db_prefix'] = 'raw_reads' params['wildcards'] = 'mer0_id' wf.addTask(gen_task( script=pype_tasks.TASK_DB_LAMERGE_SPLIT_SCRIPT, inputs={ 'config': general_config_fn, 'las_paths': r_gathered_las_fn, }, outputs={ 'split': las_merge_all_units_fn, 'bash_template': bash_template_fn, }, parameters=params, rule_writer=rule_writer, dist=Dist(local=True), )) gathered_fn = os.path.join(rawread_dir, 'las-merge-gathered', 'gathered.json') gen_parallel_tasks( wf, rule_writer, las_merge_all_units_fn, gathered_fn, run_dict=dict( bash_template_fn=bash_template_fn, script=pype_tasks.TASK_DB_LAMERGE_APPLY_SCRIPT, # for snakemake inputs={ #'las_paths': './0-rawreads/merge-scripts/{mer0_id}/las_paths.json', #'merge_script': './0-rawreads/merge-scripts/{mer0_id}/merge-script.sh', #'merged_las_json': './0-rawreads/merge-scripts/{mer0_id}/merged_las.json', 'units_of_work': '0-rawreads/las-merge-chunks/{mer0_id}/some-units-of-work.json', }, outputs={ #'merged_las': './0-rawreads/{mer0_id}/merged.las', #'job_done': './0-rawreads/{mer0_id}/merge.done', 'results': '0-rawreads/las-merge-runs/{mer0_id}/some-las-paths.json', }, parameters={}, ), dist=Dist(NPROC=1, job_dict=config['job.step.la']), ) p_id2las_fn = os.path.join(rawread_dir, 'las-merge-combine', 'p_id2las.json') las_fofn_fn = os.path.join(rawread_dir, 'las-merge-combine', 'las_fofn.json') wf.addTask(gen_task( script=pype_tasks.TASK_DB_LAMERGE_COMBINE_SCRIPT, inputs={ 'config': general_config_fn, 'gathered': gathered_fn, }, outputs={ 'block2las': p_id2las_fn, 'las_paths': las_fofn_fn, }, parameters={}, rule_writer=rule_writer, dist=Dist(local=True), )) if general_config['target'] == 'overlapping': sys.exit(0) # Produce new FOFN of preads fasta, based on consensus of overlaps. 
wf.max_jobs = config['job.step.cns'].get('njobs', default_njobs) split_fn = os.path.join( rawread_dir, 'cns-split', 'split.json') bash_template_fn = os.path.join( rawread_dir, 'cns-split', 'consensus-bash-template.sh') params = dict(parameters) params['wildcards'] = 'cns0_id,cns0_id2' wf.addTask(gen_task( script=pype_tasks.TASK_CONSENSUS_SPLIT_SCRIPT, inputs={ 'p_id2las': p_id2las_fn, 'raw_reads_db': r_db_tan_fn, 'length_cutoff': length_cutoff_fn, 'config': general_config_fn, }, outputs={ 'split': split_fn, 'bash_template': bash_template_fn, }, parameters=params, rule_writer=rule_writer, dist=Dist(local=True), )) gathered_fn = os.path.join(rawread_dir, 'cns-gather', 'gathered.json') gen_parallel_tasks( wf, rule_writer, split_fn, gathered_fn, run_dict=dict( bash_template_fn=bash_template_fn, script=pype_tasks.TASK_CONSENSUS_TASK_SCRIPT, # for snakemake only inputs = { #'las': '0-rawreads/cns-split/{cns0_id}/merged.{cns0_id2}.las', #'db': r_db_tan_fn, #'length_cutoff': length_cutoff_fn, #'config': general_config_fn, 'units_of_work': '0-rawreads/cns-chunks/{cns0_id}/some-units-of-work.json', }, outputs = { #'fasta': '0-rawreads/consensus/{cns0_id}/consensus.{cns0_id2}.fasta', 'results': '0-rawreads/cns-runs/{cns0_id}/some-done-files.json', }, parameters={}, ), dist=Dist(NPROC=6, job_dict=config['job.step.cns']), ) preads_fofn_fn = os.path.join(rawread_dir, 'preads', 'input_preads.fofn') wf.addTask(gen_task( script=pype_tasks.TASK_CONSENSUS_GATHER_SCRIPT, inputs={ 'gathered': gathered_fn, }, outputs={ 'preads_fofn': preads_fofn_fn, }, parameters=parameters, #{}, rule_writer=rule_writer, dist=Dist(local=True), )) rdir = os.path.join(rawread_dir, 'report') pre_assembly_report_fn = os.path.join(rdir, 'pre_assembly_stats.json') params = dict(parameters) params['length_cutoff_user'] = general_config['length_cutoff'] params['genome_length'] = general_config['genome_size'] # note different name; historical wf.addTask(gen_task( script=pype_tasks.TASK_REPORT_PRE_ASSEMBLY_SCRIPT, inputs={'length_cutoff': length_cutoff_fn, 'raw_reads_db': r_db_tan_fn, 'preads_fofn': preads_fofn_fn, 'config': general_config_fn, }, outputs={'pre_assembly_report': pre_assembly_report_fn, }, parameters=params, rule_writer=rule_writer, dist=Dist(local=True), )) if general_config['target'] == 'pre-assembly': LOG.info('Quitting after stage-0 for "pre-assembly" target.') sys.exit(0) # build pread database if general_config['input_type'] == 'preads': """ preads_fofn_plf = makePypeLocalFile(os.path.join( pread_dir, 'preads-fofn-abs', os.path.basename(general_config['input_fofn']))) make_fofn_abs_task = PypeTask(inputs={'i_fofn': input_fofn_plf}, outputs={'o_fofn': preads_fofn_plf}, parameters={}, ) fofn_abs_task = make_fofn_abs_task( pype_tasks.task_make_fofn_abs_preads) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) """ raise Exception('TODO') pdb_build_done = os.path.join(pread_dir, 'pdb_build_done') run_jobs_fn = os.path.join(pread_dir, 'run_jobs.sh') preads_db_fn = os.path.join(pread_dir, 'build', 'preads.db') length_cutoff_pr_fn = os.path.join(pread_dir, 'build', 'length_cutoff') wf.addTask(gen_task( script=pype_tasks.TASK_DB_BUILD_SCRIPT, inputs={ 'config': general_config_fn, 'input_fofn': preads_fofn_fn, }, outputs={ 'length_cutoff': length_cutoff_pr_fn, 'db': preads_db_fn, # Also .preads.*, of course. 
}, parameters=dict( ), rule_writer=rule_writer, dist=Dist(NPROC=1), )) # run daligner wf.max_jobs = config['job.step.pda'].get('njobs', default_njobs) daligner_all_units_fn = os.path.join( pread_dir, 'daligner-split', 'all-units-of-work.json') daligner_bash_template_fn = os.path.join( pread_dir, 'daligner-split', 'daligner_bash_template.sh') params = dict(parameters) params['skip_checks'] = int(general_config.get('skip_checks', 0)) params['wildcards'] = 'dal1_id' wf.addTask(gen_task( script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT, inputs={ 'config': general_config_fn, 'db': preads_db_fn, #not tan, yet 'length_cutoff': length_cutoff_pr_fn, }, outputs={ 'split': daligner_all_units_fn, 'bash_template': daligner_bash_template_fn }, parameters=params, rule_writer=rule_writer, dist=Dist(local=True, NPROC=4), # really, NPROC=1, but we need to know the max )) gathered_fn = os.path.join(pread_dir, 'daligner-gathered', 'gathered-done-files.json') gen_parallel_tasks( wf, rule_writer, daligner_all_units_fn, gathered_fn, run_dict=dict( bash_template_fn=daligner_bash_template_fn, script=pype_tasks.TASK_DB_DALIGNER_APPLY_SCRIPT, # for snakemake stuff inputs={ 'units_of_work': os.path.join(pread_dir, 'daligner-chunks/{dal1_id}/some-units-of-work.json'), }, outputs={ 'results': os.path.join(pread_dir, 'daligner-runs/{dal1_id}/some-done-files.json'), }, parameters={}, ), dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.pda']), ) gathered_las_fn = os.path.join(pread_dir, 'daligner-combine', 'gathered-las.json') wf.addTask(gen_task( script=pype_tasks.TASK_DB_DALIGNER_COMBINE_SCRIPT, inputs={ 'config': general_config_fn, 'db': preads_db_fn, #r_db_tan_fn, 'gathered': gathered_fn, }, outputs={ 'las_paths': gathered_las_fn, }, parameters={}, rule_writer=rule_writer, #dist=Dist(NPROC=1, MB=4000, job_dict=config['job.step.pda']) dist=Dist(local=True), )) # Merge .las files. 
wf.max_jobs = config['job.step.pla'].get('njobs', default_njobs) las_merge_all_units_fn = os.path.join(pread_dir, 'las-merge-split', 'all-units-of-work.json') bash_template_fn = os.path.join(pread_dir, 'las-merge-split', 'las-merge-bash-template.sh') params = dict(parameters) params['db_prefix'] = 'preads' params['wildcards'] = 'mer1_id' wf.addTask(gen_task( script=pype_tasks.TASK_DB_LAMERGE_SPLIT_SCRIPT, inputs={ 'config': general_config_fn, 'las_paths': gathered_las_fn, }, outputs={ 'split': las_merge_all_units_fn, 'bash_template': bash_template_fn, }, parameters=params, rule_writer=rule_writer, dist=Dist(local=True), )) gathered_fn = os.path.join(pread_dir, 'las-merge-gathered', 'gathered.json') gen_parallel_tasks( wf, rule_writer, las_merge_all_units_fn, gathered_fn, run_dict=dict( bash_template_fn=bash_template_fn, script=pype_tasks.TASK_DB_LAMERGE_APPLY_SCRIPT, # for snakemake inputs={ 'units_of_work': os.path.join(pread_dir, 'las-merge-chunks/{mer0_id}/some-units-of-work.json'), }, outputs={ 'results': os.path.join(pread_dir, 'las-merge-runs/{mer0_id}/some-las-paths.json'), }, parameters={}, ), dist=Dist(NPROC=1, job_dict=config['job.step.la']), ) p_id2las_fn = os.path.join(pread_dir, 'las-merge-combine', 'block2las.json') las_fofn_fn = os.path.join(pread_dir, 'las-merge-combine', 'las_fofn.json') wf.addTask(gen_task( script=pype_tasks.TASK_DB_LAMERGE_COMBINE_SCRIPT, inputs={ 'config': general_config_fn, 'gathered': gathered_fn, }, outputs={ 'block2las': p_id2las_fn, 'las_paths': las_fofn_fn, }, parameters={}, rule_writer=rule_writer, dist=Dist(local=True), )) wf.max_jobs = config['job.step.asm'].get('njobs', default_njobs) db2falcon_dir = os.path.join(pread_dir, 'db2falcon') db2falcon_done_fn = os.path.join(db2falcon_dir, 'db2falcon_done') preads4falcon_fn = os.path.join(db2falcon_dir, 'preads4falcon.fasta') wf.addTask(gen_task( script=pype_tasks.TASK_RUN_DB_TO_FALCON_SCRIPT, inputs={'p_id2las': p_id2las_fn, 'preads_db': preads_db_fn, }, outputs={'job_done': db2falcon_done_fn, 'preads4falcon': preads4falcon_fn, }, parameters={}, rule_writer=rule_writer, dist=Dist(NPROC=4, job_dict=config['job.step.asm']), )) falcon_asm_done_fn = os.path.join(falcon_asm_dir, 'falcon_asm_done') for key in ('overlap_filtering_setting', 'length_cutoff_pr', 'fc_ovlp_to_graph_option'): parameters[key] = general_config[key] wf.addTask(gen_task( script=pype_tasks.TASK_RUN_FALCON_ASM_SCRIPT, inputs={'db2falcon_done': db2falcon_done_fn, 'db_file': preads_db_fn, 'preads4falcon_fasta': preads4falcon_fn, 'las_fofn': las_fofn_fn, 'config': general_config_fn, }, outputs={'falcon_asm_done': falcon_asm_done_fn}, parameters=parameters, rule_writer=rule_writer, dist=Dist(NPROC=4, job_dict=config['job.step.asm']), )) wf.refreshTargets() with io.cd('0-rawreads'): # for backwards-compatibility io.symlink('las-merge-combine', 'las-gather')
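# The closing io.cd()/io.symlink() pair keeps old consumers of '0-rawreads/las-gather'
# working. An equivalent using only the stdlib (a sketch; 'io' above is the
# falcon_kit helper module, not Python's io):
import os

def link_las_gather(rawread_dir='0-rawreads'):
    cwd = os.getcwd()
    os.chdir(rawread_dir)
    try:
        if not os.path.islink('las-gather'):
            os.symlink('las-merge-combine', 'las-gather')
    finally:
        os.chdir(cwd)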
def dump_pread_ids(self): pread_db = fn( self.pread_db ) pread_id_file = fn( self.pread_id_file ) os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file) )
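# os.system() above silently ignores failures. A stricter variant of the same
# DBshow pipeline (a sketch using only stdlib calls; pipefail requires bash):
import subprocess

def dump_ids_checked(db_fn, id_fn):
    cmd = "set -o pipefail; DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (db_fn, id_fn)
    subprocess.check_call(cmd, shell=True, executable='/bin/bash')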
def create_tasks_gc(fofn_pfn, referenceset_pfn, parameters): """Create a gc task for each chunk, plus a gathering task. Here is the convoluted workflow: 1. For each gc instance "chunk": A. variantCaller writes .fasta B. We create a contigset for the .fasta 2. We keep the contigset output filenames in a FOFN (from run_gc_scatter) and pass that to run_gc_gather(). 3. We read each contigset and add them to a gathered ContigSet. 4. We "consolidate" their underlying .fasta "resources", assuming their filenames match except extension. 5. Finally, we write the gathered contigset. Whew! We also gather fastq here, for convenience. """ tasks = list() contigsets = dict() fastqs = dict() # Assume the gc chunk paths in the fofn are all relative to the dir of the fofn. for i, alignmentset_bn in enumerate(open(fn(fofn_pfn)).read().split()): alignmentset_fn = os.path.join(os.path.dirname(fn(fofn_pfn)), alignmentset_bn) wdir = 'run-gc-{:02}'.format(i) mkdirs(wdir) # Assume CWD is correct. alignmentset_pfn = makePypeLocalFile( alignmentset_fn) # New pfn, because it was not a pfn before. polished_fastq_pfn = makePypeLocalFile( os.path.join(wdir, 'consensus.fastq')) variants_gff_pfn = makePypeLocalFile(os.path.join( wdir, 'variants.gff')) consensus_contigset_pfn = makePypeLocalFile( os.path.join(wdir, 'consensus.contigset.xml')) """Also produces: consensus.fasta consensus.fasta.fai And note that these file names are important, as pbcoretools gathering expects a particular pattern. """ contigsets['contigset_{:02d}'.format(i)] = consensus_contigset_pfn fastqs['fastq_{:02d}'.format(i)] = polished_fastq_pfn make_task = PypeTask( inputs={ "alignmentset": alignmentset_pfn, "referenceset": referenceset_pfn, }, outputs={ "polished_fastq": polished_fastq_pfn, "variants_gff": variants_gff_pfn, "consensus_contigset": consensus_contigset_pfn, }, parameters=parameters, ) task = make_task(start_task.task_genomic_consensus) tasks.append(task) contigset_pfn = makePypeLocalFile('run-gc-gather/contigset.xml') gathered_fastq_pfn = makePypeLocalFile('run-gc-gather/gathered.fastq') inputs = dict(contigsets) inputs.update(fastqs) log.debug('inputs to gc_gather:{}'.format(pprint.pformat(contigsets))) make_task = PypeTask( inputs=inputs, outputs={ "ds_out": contigset_pfn, "fastq_out": gathered_fastq_pfn, }, parameters=parameters, ) task = make_task(start_task.task_gc_gather) tasks.append(task) return tasks, contigset_pfn, gathered_fastq_pfn
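# The zero-padded names above ('run-gc-00', 'contigset_00', ...) keep chunk
# order stable under lexical sorting (the docstring notes that pbcoretools
# gathering expects a particular naming pattern):
names = ['contigset_{:02d}'.format(i) for i in (0, 1, 10)]
assert names == sorted(names)  # '...00' < '...01' < '...10'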
def main(argv=sys.argv): global LOG LOG = support.setup_logger(None) if len(sys.argv) < 2: print >> sys.stderr, 'you need to provide a configuration file to specify the cluster running environment' sys.exit(1) config_fn = sys.argv[1] config_absbasedir = os.path.dirname(os.path.abspath(config_fn)) config = ConfigParser.ConfigParser() config.read(config_fn) job_type = 'SGE' if config.has_option('General', 'job_type'): job_type = config.get('General', 'job_type') sge_track_reads = ' -pe smp 12 -q bigmem' if config.has_option('Unzip', 'sge_track_reads'): sge_track_reads = config.get('Unzip', 'sge_track_reads') sge_quiver = ' -pe smp 24 -q bigmem ' if config.has_option('Unzip', 'sge_quiver'): sge_quiver = config.get('Unzip', 'sge_quiver') smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/' if config.has_option('Unzip', 'smrt_bin'): smrt_bin = config.get('Unzip', 'smrt_bin') input_bam_fofn = 'input_bam.fofn' if config.has_option('Unzip', 'input_bam_fofn'): input_bam_fofn = config.get('Unzip', 'input_bam_fofn') if not os.path.isabs(input_bam_fofn): input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn) quiver_concurrent_jobs = 8 if config.has_option('Unzip', 'quiver_concurrent_jobs'): quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs') config = { 'job_type': job_type, 'sge_quiver': sge_quiver, 'sge_track_reads': sge_track_reads, 'input_bam_fofn': input_bam_fofn, 'smrt_bin': smrt_bin } LOG.info('config={}'.format(pprint.pformat(config))) #support.job_type = 'SGE' #tmp hack until we have a configuration parser wf = PypeProcWatcherWorkflow(max_jobs=quiver_concurrent_jobs, ) abscwd = os.path.abspath('.') parameters = { 'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'), 'config': config } hasm_done_plf = makePypeLocalFile( './3-unzip/1-hasm/hasm_done') # by convention track_reads_h_done_plf = makePypeLocalFile( os.path.join(parameters['wd'], 'track_reads_h_done')) make_track_reads_task = PypeTask( inputs={'hasm_done': hasm_done_plf}, outputs={'job_done': track_reads_h_done_plf}, parameters=parameters, ) track_reads_task = make_track_reads_task(task_track_reads) #sge_track_reads = config['sge_track_reads'] wf.addTask(track_reads_task) scattered_quiver_plf = makePypeLocalFile( '4-quiver/quiver_scatter/scattered.json') make_task = PypeTask( inputs={ 'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'), 'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'), 'track_reads_h_done': track_reads_h_done_plf, }, outputs={ 'scattered_quiver_json': scattered_quiver_plf, }, parameters={'config': config}, ) wf.addTask(make_task(task_scatter_quiver)) wf.refreshTargets() p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs( scattered_quiver_plf) gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt') gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt') gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done') mkdir('4-quiver/cns_gather') with open(fn(gathered_p_ctg_plf), 'w') as ifs: for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out): ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn)) with open(fn(gathered_h_ctg_plf), 'w') as ifs: for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out): ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn)) make_task = PypeTask( inputs=job_done_plfs, outputs={ 'job_done': gather_done_plf, }, parameters={}, ) wf.addTask(make_task(task_gather_quiver)) wf.refreshTargets() cns_p_ctg_fasta_plf = makePypeLocalFile( '4-quiver/cns_output/cns_p_ctg.fasta') cns_p_ctg_fastq_plf
= makePypeLocalFile( '4-quiver/cns_output/cns_p_ctg.fastq') cns_h_ctg_fasta_plf = makePypeLocalFile( '4-quiver/cns_output/cns_h_ctg.fasta') cns_h_ctg_fastq_plf = makePypeLocalFile( '4-quiver/cns_output/cns_h_ctg.fastq') zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done') make_task = PypeTask( inputs={ 'gathered_p_ctg': gathered_p_ctg_plf, 'gathered_h_ctg': gathered_h_ctg_plf, 'gather_done': gather_done_plf, }, outputs={ 'cns_p_ctg_fasta': cns_p_ctg_fasta_plf, 'cns_p_ctg_fastq': cns_p_ctg_fastq_plf, 'cns_h_ctg_fasta': cns_h_ctg_fasta_plf, 'cns_h_ctg_fastq': cns_h_ctg_fastq_plf, 'job_done': zcat_done_plf, }, ) wf.addTask(make_task(task_cns_zcat)) wf.refreshTargets()
def main(argv=sys.argv): global LOG LOG = support.setup_logger(None) if len(sys.argv) < 2: print>>sys.stderr, 'you need to provide a configuration file to specify the cluster running environment' sys.exit(1) config_fn = sys.argv[1] config_absbasedir = os.path.dirname(os.path.abspath(config_fn)) config = ConfigParser.ConfigParser() config.read(config_fn) job_type = 'SGE' if config.has_option('General', 'job_type'): job_type = config.get('General', 'job_type') job_queue = 'default' if config.has_option('General', 'job_queue'): job_queue = config.get('General', 'job_queue') pwatcher_type = 'fs_based' if config.has_option('General', 'pwatcher_type'): pwatcher_type = config.get('General', 'pwatcher_type') sge_track_reads = ' -pe smp 12 -q bigmem' if config.has_option('Unzip', 'sge_track_reads'): sge_track_reads = config.get('Unzip', 'sge_track_reads') sge_quiver = ' -pe smp 24 -q bigmem ' if config.has_option('Unzip', 'sge_quiver'): sge_quiver = config.get('Unzip', 'sge_quiver') smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/' if config.has_option('Unzip', 'smrt_bin'): smrt_bin = config.get('Unzip', 'smrt_bin') input_bam_fofn = 'input_bam.fofn' if config.has_option('Unzip', 'input_bam_fofn'): input_bam_fofn = config.get('Unzip', 'input_bam_fofn') if not os.path.isabs(input_bam_fofn): input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn) quiver_concurrent_jobs = 8 if config.has_option('Unzip', 'quiver_concurrent_jobs'): quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs') config = {'job_type': job_type, 'job_queue': job_queue, 'sge_quiver': sge_quiver, 'sge_track_reads': sge_track_reads, 'input_bam_fofn': input_bam_fofn, 'pwatcher_type': pwatcher_type, 'smrt_bin': smrt_bin} LOG.info('config={}'.format(pprint.pformat(config))) #support.job_type = 'SGE' #tmp hack until we have a configuration parser wf = PypeProcWatcherWorkflow( max_jobs=quiver_concurrent_jobs, job_type=config['job_type'], job_queue=config.get('job_queue'), sge_option=config.get('sge_option'), watcher_type=config.get('pwatcher_type'), #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'), use_tmpdir=config.get('use_tmpdir'), ) abscwd = os.path.abspath('.') parameters = { 'sge_option': config['sge_track_reads'], } input_bam_fofn_fn = config['input_bam_fofn'] input_bam_fofn_plf = makePypeLocalFile(input_bam_fofn_fn) hasm_done_plf = makePypeLocalFile('./3-unzip/1-hasm/hasm_done') # by convention track_reads_h_done_plf = makePypeLocalFile('./4-quiver/reads/track_reads_h_done') make_track_reads_task = PypeTask(inputs = { 'input_bam_fofn': input_bam_fofn_plf, 'hasm_done': hasm_done_plf}, outputs = {'job_done': track_reads_h_done_plf}, parameters = parameters, ) track_reads_task = make_track_reads_task(task_track_reads) wf.addTask(track_reads_task) scattered_quiver_plf = makePypeLocalFile('4-quiver/quiver_scatter/scattered.json') parameters = { 'config': config, } make_task = PypeTask( inputs = { 'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'), 'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'), 'track_reads_h_done': track_reads_h_done_plf, }, outputs = { 'scattered_quiver_json': scattered_quiver_plf, }, parameters = parameters, ) wf.addTask(make_task(task_scatter_quiver)) wf.refreshTargets() p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(wf, scattered_quiver_plf) gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt') gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt') gather_done_plf =
makePypeLocalFile('4-quiver/cns_gather/job_done') mkdir('4-quiver/cns_gather') with open(fn(gathered_p_ctg_plf), 'w') as ifs: for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out): ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn)) with open(fn(gathered_h_ctg_plf), 'w') as ifs: for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out): ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn)) make_task = PypeTask( inputs = job_done_plfs, outputs = { 'job_done': gather_done_plf, }, parameters = {}, ) wf.addTask(make_task(task_gather_quiver)) wf.refreshTargets() cns_p_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fasta') cns_p_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fastq') cns_h_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fasta') cns_h_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fastq') zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done') make_task = PypeTask( inputs = { 'gathered_p_ctg': gathered_p_ctg_plf, 'gathered_h_ctg': gathered_h_ctg_plf, 'gather_done': gather_done_plf, }, outputs = { 'cns_p_ctg_fasta': cns_p_ctg_fasta_plf, 'cns_p_ctg_fastq': cns_p_ctg_fastq_plf, 'cns_h_ctg_fasta': cns_h_ctg_fasta_plf, 'cns_h_ctg_fastq': cns_h_ctg_fastq_plf, 'job_done': zcat_done_plf, }, ) wf.addTask(make_task(task_cns_zcat)) wf.refreshTargets()
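# Both main() variants parse an INI file shaped like this hypothetical example
# (option names are exactly the ones queried above; values are placeholders):
EXAMPLE_UNZIP_CFG = """\
[General]
job_type = SGE
job_queue = default
pwatcher_type = fs_based

[Unzip]
input_bam_fofn = input_bam.fofn
smrt_bin = /opt/smrtcmds/bin/
sge_track_reads = -pe smp 12 -q bigmem
sge_quiver = -pe smp 24 -q bigmem
quiver_concurrent_jobs = 8
"""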
phased_reads = makePypeLocalFile(os.path.join(asm_dir, "all_phased_reads")) for las_key, las_file in all_raw_las_files.items(): las_fn = fn(las_file) idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number idx = int(idx.split(".")[1]) rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx)) make_dump_rawread_to_ctg = PypeTask( inputs = { "las_file": las_file, "rawread_db": rawread_db, "read_to_contig_map": read_to_contig_map, "rawread_id_file": rawread_id_file, "pread_id_file": pread_id_file, "phased_reads" : phased_reads}, outputs = { "rawread_to_contig_file": rawread_to_contig_file }, TaskType = PypeThreadTaskBase, URL = "task://localhost/r_read_to_contigs.%s" % idx ) dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg) wf.addTask( dump_rawread_to_ctg_task )
def dump_rawread_ids(self): rawread_db = fn(self.rawread_db) rawread_id_file = fn(self.rawread_id_file) os.system( "DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" % (rawread_db, rawread_id_file))
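# The bare 'LD_LIBRARY_PATH= ' above clears the variable for the awk stage
# only, so vendored libraries on LD_LIBRARY_PATH cannot break the system awk.
# The same effect via subprocess, applied to every stage of the pipeline
# (a superset of the inline assignment; a sketch using only stdlib calls):
import os, subprocess

def run_with_clean_ld(cmd):
    env = dict(os.environ)
    env['LD_LIBRARY_PATH'] = ''  # keep vendored libs away from system tools
    subprocess.check_call(cmd, shell=True, env=env)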
def run(wf, config, input_config_fn, input_fofn_plf, ): """ Preconditions (for now): * fc_run_logger * run_support.logger """ rawread_dir = os.path.abspath('./0-rawreads') pread_dir = os.path.abspath('./1-preads_ovl') falcon_asm_dir = os.path.abspath('./2-asm-falcon') script_dir = os.path.abspath('./scripts') sge_log_dir = os.path.abspath('./sge_log') for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir): support.make_dirs(d) exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs concurrent_jobs = config['pa_concurrent_jobs'] wf.max_jobs = concurrent_jobs rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-fofn-abs', os.path.basename(config['input_fofn']))) make_fofn_abs_task = PypeTask(inputs = {'i_fofn': input_fofn_plf}, outputs = {'o_fofn': rawread_fofn_plf}, parameters = {}, ) fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) if config['input_type'] == 'raw': #### import sequences into daligner DB sleep_done = makePypeLocalFile( os.path.join( rawread_dir, 'sleep_done') ) rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, 'rdb_build_done') ) run_jobs = makePypeLocalFile( os.path.join( rawread_dir, 'run_jobs.sh') ) parameters = {'work_dir': rawread_dir, 'sge_option': config['sge_option_da'], 'config_fn': input_config_fn, 'config': config} length_cutoff_plf = makePypeLocalFile(os.path.join(rawread_dir, 'length_cutoff')) raw_reads_db_plf = makePypeLocalFile(os.path.join(rawread_dir, '%s.db' % 'raw_reads')) make_build_rdb_task = PypeTask(inputs = {'input_fofn': rawread_fofn_plf}, outputs = {'rdb_build_done': rdb_build_done, 'raw_reads_db': raw_reads_db_plf, 'length_cutoff': length_cutoff_plf, 'run_jobs': run_jobs, }, parameters = parameters, ) build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb) wf.addTasks([build_rdb_task]) wf.refreshTargets([rdb_build_done]) raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf)) #### run daligner scattered_plf = os.path.join(rawread_dir, 'daligner-scatter', 'scattered.json') make_daligner_scatter = PypeTask( inputs = { 'run_jobs_fn': run_jobs, 'db_build_done': rdb_build_done, }, outputs = { 'scatter_fn': scattered_plf, }, parameters = { 'db_prefix': 'raw_reads', 'nblock': raw_reads_nblock, 'pread_aln': False, 'config': config, }, ) task = make_daligner_scatter(pype_tasks.task_daligner_scatter) wf.addTask(task) wf.refreshTargets(exitOnFailure=exitOnFailure) daligner_tasks, daligner_out = create_daligner_tasks(rawread_dir, scattered_plf) wf.addTasks(daligner_tasks) r_gathered_las_plf = makePypeLocalFile(os.path.join(rawread_dir, 'raw-gather', 'gathered_las.txt')) parameters = { 'nblock': raw_reads_nblock, } make_daligner_gather = PypeTask( inputs = daligner_out, outputs = {'gathered': r_gathered_las_plf}, parameters = parameters, ) check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather) wf.addTask(check_r_da_task) wf.refreshTargets(exitOnFailure=exitOnFailure) # Merge .las files. 
scattered_plf = os.path.join(rawread_dir, 'merge-scatter', 'scattered.json') make_task = PypeTask( inputs = { 'run_jobs': run_jobs, 'gathered_las': r_gathered_las_plf, }, outputs = { 'scattered': scattered_plf, }, parameters = { 'db_prefix': 'raw_reads', 'config': config, }, ) task = make_task(pype_tasks.task_merge_scatter) wf.addTask(task) wf.refreshTargets(exitOnFailure=exitOnFailure) merge_tasks, p_ids_merged_las = create_merge_tasks(rawread_dir, scattered_plf) wf.addTasks(merge_tasks) task, _, las_fopfn_plf = create_merge_gather_task(os.path.join(rawread_dir, 'merge-gather'), p_ids_merged_las) wf.addTask(task) wf.refreshTargets(exitOnFailure=exitOnFailure) if config['target'] == 'overlapping': sys.exit(0) # Produce new FOFN of preads fasta, based on consensus of overlaps. scattered_plf = os.path.join(rawread_dir, 'cns-scatter', 'scattered.json') make_task = PypeTask( inputs = { 'gathered': las_fopfn_plf, 'db': raw_reads_db_plf, }, outputs = { 'scattered': scattered_plf, }, parameters = { 'db_prefix': 'raw_reads', 'config': config, }, ) task = make_task(pype_tasks.task_consensus_scatter) wf.addTask(task) wf.refreshTargets(exitOnFailure=exitOnFailure) tasks, consensus_out = create_consensus_tasks(rawread_dir, scattered_plf) wf.addTasks(tasks) wf.refreshTargets(exitOnFailure=exitOnFailure) task, preads_fofn_plf = create_consensus_gather_task(os.path.join(rawread_dir, 'preads'), consensus_out) wf.addTask(task) rdir = os.path.join(rawread_dir, 'report') pre_assembly_report_plf = makePypeLocalFile(os.path.join(rdir, 'pre_assembly_stats.json')) parameters = dict(config) parameters['cwd'] = rdir make_task = PypeTask( inputs = {'length_cutoff_fn': length_cutoff_plf, 'raw_reads_db': raw_reads_db_plf, 'preads_fofn': preads_fofn_plf, }, outputs = {'pre_assembly_report': pre_assembly_report_plf, }, parameters = parameters, ) task = make_task(pype_tasks.task_report_pre_assembly) wf.addTask(task) concurrent_jobs = config['cns_concurrent_jobs'] wf.max_jobs = concurrent_jobs wf.refreshTargets(exitOnFailure=exitOnFailure) if config['target'] == 'pre-assembly': log.info('Quitting after stage-0 for "pre-assembly" target.') sys.exit(0) # build pread database if config['input_type'] == 'preads': preads_fofn_plf = makePypeLocalFile(os.path.join(pread_dir, 'preads-fofn-abs', os.path.basename(config['input_fofn']))) make_fofn_abs_task = PypeTask(inputs = {'i_fofn': rawread_fofn_plf}, outputs = {'o_fofn': preads_fofn_plf}, parameters = {}, ) fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_preads) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, 'pdb_build_done') ) parameters = {'work_dir': pread_dir, 'sge_option': config['sge_option_pda'], 'config_fn': input_config_fn, 'config': config} run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh')) preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db')) # Also .preads.*, of course. 
make_build_pdb_task = PypeTask(inputs = {'preads_fofn': preads_fofn_plf }, outputs = {'pdb_build_done': pdb_build_done, 'preads_db': preads_db, 'run_jobs': run_jobs, }, parameters = parameters, ) build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb) wf.addTasks([build_pdb_task]) wf.refreshTargets([pdb_build_done]) preads_nblock = support.get_nblock(fn(preads_db)) #### run daligner config['sge_option_da'] = config['sge_option_pda'] scattered_plf = os.path.join(pread_dir, 'daligner-scatter', 'scattered.json') make_daligner_scatter = PypeTask( inputs = { 'run_jobs_fn': run_jobs, 'db_build_done': pdb_build_done, }, outputs = { 'scatter_fn': scattered_plf, }, parameters = { 'db_prefix': 'preads', 'nblock': preads_nblock, 'pread_aln': True, 'config': config, }, ) task = make_daligner_scatter(pype_tasks.task_daligner_scatter) wf.addTask(task) wf.refreshTargets(exitOnFailure=exitOnFailure) daligner_tasks, daligner_out = create_daligner_tasks(pread_dir, scattered_plf) wf.addTasks(daligner_tasks) p_gathered_las_plf = makePypeLocalFile(os.path.join(pread_dir, 'gathered-las', 'gathered-las.txt')) parameters = { 'nblock': preads_nblock, } make_daligner_gather = PypeTask( inputs = daligner_out, outputs = {'gathered': p_gathered_las_plf}, parameters = parameters, ) check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather) wf.addTask(check_p_da_task) wf.refreshTargets(exitOnFailure=exitOnFailure) # Merge .las files. config['sge_option_la'] = config['sge_option_pla'] scattered_plf = os.path.join(pread_dir, 'merge-scatter', 'scattered.json') make_task = PypeTask( inputs = { 'run_jobs': run_jobs, 'gathered_las': p_gathered_las_plf, }, outputs = { 'scattered': scattered_plf, }, parameters = { 'db_prefix': 'preads', 'config': config, }, ) task = make_task(pype_tasks.task_merge_scatter) wf.addTask(task) wf.refreshTargets(exitOnFailure=exitOnFailure) merge_tasks, p_ids_merged_las = create_merge_tasks(pread_dir, scattered_plf) wf.addTasks(merge_tasks) task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las) wf.addTask(task) concurrent_jobs = config['ovlp_concurrent_jobs'] wf.max_jobs = concurrent_jobs wf.refreshTargets(exitOnFailure=exitOnFailure) db2falcon_dir = os.path.join(pread_dir, 'db2falcon') db2falcon_done = makePypeLocalFile(os.path.join(db2falcon_dir, 'db2falcon_done')) preads4falcon_plf = makePypeLocalFile(os.path.join(db2falcon_dir, 'preads4falcon.fasta')) make_run_db2falcon = PypeTask( inputs = {'las_fofn_plf': las_fofn_plf, 'preads_db': preads_db, }, outputs = {'db2falcon_done': db2falcon_done, 'preads4falcon': preads4falcon_plf, }, parameters = {'wd': db2falcon_dir, 'config': config, 'sge_option': config['sge_option_fc'], }, ) wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon)) falcon_asm_done = makePypeLocalFile( os.path.join(falcon_asm_dir, 'falcon_asm_done')) make_run_falcon_asm = PypeTask( inputs = {'db2falcon_done': db2falcon_done, 'db_file': preads_db, 'preads4falcon': preads4falcon_plf, 'las_fofn': las_fofn_plf, }, outputs = {'falcon_asm_done': falcon_asm_done}, parameters = {'wd': falcon_asm_dir, 'config': config, 'pread_dir': pread_dir, 'sge_option': config['sge_option_fc'], }, ) wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm)) wf.refreshTargets() return falcon_asm_done
def run( wf, config, rule_writer, config_fn, input_fofn_plf, ): """ Preconditions (for now): * LOG * run_support.logger """ parsed_config = io.deserialize(config_fn) if parsed_config != config: msg = 'Config from {!r} != passed config'.format(config_fn) raise Exception(msg) general_config = config['General'] general_config_fn = os.path.join(os.path.dirname(config_fn), 'General_config.json') io.serialize(general_config_fn, general_config) # Some tasks use this. rawread_dir = '0-rawreads' pread_dir = '1-preads_ovl' falcon_asm_dir = '2-asm-falcon' for d in (rawread_dir, pread_dir, falcon_asm_dir): support.make_dirs(d) # only matter for parallel jobs job_defaults = config['job.defaults'] exitOnFailure = bool(job_defaults.get('stop_all_jobs_on_failure', False)) default_njobs = int(job_defaults.get('njobs', 7)) wf.max_jobs = default_njobs assert general_config['input_type'] in ( 'raw', 'preads'), 'Invalid input_type=={!r}'.format( general_config['input_type']) # Store config as JSON, available to many tasks. if general_config['input_type'] == 'raw': parameters = {} # import sequences into daligner DB # calculate length_cutoff (if specified as -1) # split DB # run DBdust r_db_dust_fn = os.path.join(rawread_dir, 'build', 'raw_reads.db') length_cutoff_fn = os.path.join(rawread_dir, 'build', 'length_cutoff') wf.addTask( gen_task( script=pype_tasks.TASK_DB_BUILD_SCRIPT, inputs={ 'config': general_config_fn, 'input_fofn': fn(input_fofn_plf), }, outputs={ 'length_cutoff': length_cutoff_fn, 'db': r_db_dust_fn, # Also .raw_reads.*, of course. And dust track. }, parameters=dict(), rule_writer=rule_writer, dist=Dist(NPROC=1), )) # run TANmask tan_uows_fn = os.path.join(rawread_dir, 'tan-split', 'tan-uows.json') tan_bash_template_fn = os.path.join(rawread_dir, 'tan-split', 'bash_template.sh') wf.addTask( gen_task( script=pype_tasks.TASK_DB_TAN_SPLIT_SCRIPT, inputs={ 'config': general_config_fn, 'db': r_db_dust_fn, }, outputs={ 'split': tan_uows_fn, 'bash_template': tan_bash_template_fn, }, parameters={}, rule_writer=rule_writer, dist=Dist(NPROC=1), )) gathered_fn = os.path.join(rawread_dir, 'tan-gathered', 'gathered-done-files.json') gen_parallel_tasks( wf, rule_writer, tan_uows_fn, gathered_fn, run_dict=dict( bash_template_fn=tan_bash_template_fn, script= 'fubar-TODO', #pype_tasks.TASK_DB_TAN_APPLY_SCRIPT, # for snakemake stuff inputs={ 'units_of_work': '0-rawreads/tan-chunks/{tan0_id}/some-units-of-work.json', }, outputs={ #'job_done': '0-rawreads/{dal0_id}/daligner.done', 'results': '0-rawreads/tan-runs/{tan0_id}/some-done-files.json', }, parameters={}, ), dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']), ) r_db_tan_fn = os.path.join(rawread_dir, 'tan-combine', 'raw_reads.db') wf.addTask( gen_task( script=pype_tasks.TASK_DB_TAN_COMBINE_SCRIPT, inputs={ 'config': general_config_fn, 'db': r_db_dust_fn, 'gathered': gathered_fn, }, outputs={ 'new_db': r_db_tan_fn, }, parameters={}, rule_writer=rule_writer, dist=Dist(local=True), )) # run daligner wf.max_jobs = config['job.step.da'].get('njobs', default_njobs) #rawreads_db_fn = os.path.join(rawread_dir, 'raw_reads.db') daligner_all_units_fn = os.path.join(rawread_dir, 'daligner-split', 'all-units-of-work.json') daligner_bash_template_fn = os.path.join(rawread_dir, 'daligner-split', 'daligner_bash_template.sh') params = dict(parameters) #params['db_prefix'] = 'raw_reads' #params['pread_aln'] = 0 params['skip_checks'] = int(general_config.get('skip_checks', 0)) params['wildcards'] = 'dal0_id' wf.addTask( gen_task( 
script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT, inputs={ 'config': general_config_fn, 'db': r_db_tan_fn, 'length_cutoff': length_cutoff_fn, }, outputs={ 'split': daligner_all_units_fn, 'bash_template': daligner_bash_template_fn }, parameters=params, rule_writer=rule_writer, dist=Dist( local=True, NPROC=4), # really, NPROC=1, but we need to know the max )) gathered_fn = os.path.join(rawread_dir, 'daligner-gathered', 'gathered-done-files.json') gen_parallel_tasks( wf, rule_writer, daligner_all_units_fn, gathered_fn, run_dict=dict( bash_template_fn=daligner_bash_template_fn, script=pype_tasks. TASK_DB_DALIGNER_APPLY_SCRIPT, # for snakemake stuff inputs={ 'units_of_work': os.path.join( rawread_dir, 'daligner-chunks/{dal0_id}/some-units-of-work.json'), }, outputs={ 'results': os.path.join( rawread_dir, 'daligner-runs/{dal0_id}/some-done-files.json'), }, parameters={}, ), dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.da']), ) r_gathered_las_fn = os.path.join(rawread_dir, 'daligner-combine', 'gathered-las.json') wf.addTask( gen_task( script=pype_tasks.TASK_DB_DALIGNER_COMBINE_SCRIPT, inputs={ 'config': general_config_fn, 'db': r_db_tan_fn, 'gathered': gathered_fn, }, outputs={ 'las_paths': r_gathered_las_fn, }, parameters={}, rule_writer=rule_writer, #dist=Dist(NPROC=1, MB=4000, job_dict=config['job.step.da']) dist=Dist(local=True), )) # Merge .las files. wf.max_jobs = config['job.step.la'].get('njobs', default_njobs) las_merge_all_units_fn = os.path.join(rawread_dir, 'las-merge-split', 'all-units-of-work.json') bash_template_fn = os.path.join(rawread_dir, 'las-merge-split', 'las-merge-bash-template.sh') params = dict(parameters) params['db_prefix'] = 'raw_reads' params['wildcards'] = 'mer0_id' wf.addTask( gen_task( script=pype_tasks.TASK_DB_LAMERGE_SPLIT_SCRIPT, inputs={ 'config': general_config_fn, 'las_paths': r_gathered_las_fn, }, outputs={ 'split': las_merge_all_units_fn, 'bash_template': bash_template_fn, }, parameters=params, rule_writer=rule_writer, dist=Dist(local=True), )) gathered_fn = os.path.join(rawread_dir, 'las-merge-gathered', 'gathered.json') gen_parallel_tasks( wf, rule_writer, las_merge_all_units_fn, gathered_fn, run_dict=dict( bash_template_fn=bash_template_fn, script=pype_tasks. TASK_DB_LAMERGE_APPLY_SCRIPT, # for snakemake inputs={ #'las_paths': './0-rawreads/merge-scripts/{mer0_id}/las_paths.json', #'merge_script': './0-rawreads/merge-scripts/{mer0_id}/merge-script.sh', #'merged_las_json': './0-rawreads/merge-scripts/{mer0_id}/merged_las.json', 'units_of_work': '0-rawreads/las-merge-chunks/{mer0_id}/some-units-of-work.json', }, outputs={ #'merged_las': './0-rawreads/{mer0_id}/merged.las', #'job_done': './0-rawreads/{mer0_id}/merge.done', 'results': '0-rawreads/las-merge-runs/{mer0_id}/some-las-paths.json', }, parameters={}, ), dist=Dist(NPROC=1, job_dict=config['job.step.la']), ) p_id2las_fn = os.path.join(rawread_dir, 'las-merge-combine', 'p_id2las.json') las_fofn_fn = os.path.join(rawread_dir, 'las-merge-combine', 'las_fofn.json') wf.addTask( gen_task( script=pype_tasks.TASK_DB_LAMERGE_COMBINE_SCRIPT, inputs={ 'config': general_config_fn, 'gathered': gathered_fn, }, outputs={ 'block2las': p_id2las_fn, 'las_paths': las_fofn_fn, }, parameters={}, rule_writer=rule_writer, dist=Dist(local=True), )) if general_config['target'] == 'overlapping': sys.exit(0) # Produce new FOFN of preads fasta, based on consensus of overlaps. 
        # Produce a new FOFN of preads fasta, based on consensus of overlaps.
        wf.max_jobs = config['job.step.cns'].get('njobs', default_njobs)
        split_fn = os.path.join(rawread_dir, 'cns-split', 'split.json')
        bash_template_fn = os.path.join(
            rawread_dir, 'cns-split', 'consensus-bash-template.sh')
        params = dict(parameters)
        params['wildcards'] = 'cns0_id,cns0_id2'
        wf.addTask(gen_task(
            script=pype_tasks.TASK_CONSENSUS_SPLIT_SCRIPT,
            inputs={
                'p_id2las': p_id2las_fn,
                'raw_reads_db': r_db_tan_fn,
                'length_cutoff': length_cutoff_fn,
                'config': general_config_fn,
            },
            outputs={
                'split': split_fn,
                'bash_template': bash_template_fn,
            },
            parameters=params,
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

        gathered_fn = os.path.join(rawread_dir, 'cns-gather', 'gathered.json')
        gen_parallel_tasks(
            wf, rule_writer,
            split_fn, gathered_fn,
            run_dict=dict(
                bash_template_fn=bash_template_fn,
                script=pype_tasks.TASK_CONSENSUS_TASK_SCRIPT,  # for snakemake only
                inputs={
                    #'las': '0-rawreads/cns-split/{cns0_id}/merged.{cns0_id2}.las',
                    #'db': r_db_tan_fn,
                    #'length_cutoff': length_cutoff_fn,
                    #'config': general_config_fn,
                    'units_of_work': '0-rawreads/cns-chunks/{cns0_id}/some-units-of-work.json',
                },
                outputs={
                    #'fasta': '0-rawreads/consensus/{cns0_id}/consensus.{cns0_id2}.fasta',
                    'results': '0-rawreads/cns-runs/{cns0_id}/some-done-files.json',
                },
                parameters={},
            ),
            dist=Dist(NPROC=6, job_dict=config['job.step.cns']),
        )

        preads_fofn_fn = os.path.join(rawread_dir, 'preads', 'input_preads.fofn')
        wf.addTask(gen_task(
            script=pype_tasks.TASK_CONSENSUS_GATHER_SCRIPT,
            inputs={
                'gathered': gathered_fn,
            },
            outputs={
                'preads_fofn': preads_fofn_fn,
            },
            parameters=parameters,  # was: {}
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_fn = os.path.join(rdir, 'pre_assembly_stats.json')
        params = dict(parameters)
        params['length_cutoff_user'] = general_config['length_cutoff']
        # Note the different name; historical.
        params['genome_length'] = general_config['genome_size']
        wf.addTask(gen_task(
            script=pype_tasks.TASK_REPORT_PRE_ASSEMBLY_SCRIPT,
            inputs={
                'length_cutoff': length_cutoff_fn,
                'raw_reads_db': r_db_tan_fn,
                'preads_fofn': preads_fofn_fn,
                'config': general_config_fn,
            },
            outputs={
                'pre_assembly_report': pre_assembly_report_fn,
            },
            parameters=params,
            rule_writer=rule_writer,
            dist=Dist(local=True),
        ))

        if general_config['target'] == 'pre-assembly':
            LOG.info('Quitting after stage-0 for "pre-assembly" target.')
            sys.exit(0)

    # Build the pread database.
    if general_config['input_type'] == 'preads':
        """
        preads_fofn_plf = makePypeLocalFile(os.path.join(
            pread_dir, 'preads-fofn-abs',
            os.path.basename(general_config['input_fofn'])))
        make_fofn_abs_task = PypeTask(inputs={'i_fofn': input_fofn_plf},
                                      outputs={'o_fofn': preads_fofn_plf},
                                      parameters={},
                                      )
        fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])
        """
        raise Exception('TODO')

    pdb_build_done = os.path.join(pread_dir, 'pdb_build_done')
    run_jobs_fn = os.path.join(pread_dir, 'run_jobs.sh')
    preads_db_fn = os.path.join(pread_dir, 'build', 'preads.db')
    length_cutoff_pr_fn = os.path.join(pread_dir, 'build', 'length_cutoff')
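    # preads_fofn_fn, consumed by the DB-build task just below, is a plain
    # file-of-file-names: one fasta path per line. A minimal sketch of its
    # expected contents (paths hypothetical):
    #
    #   /abs/path/0-rawreads/cns-runs/0000/consensus.0000.fasta
    #   /abs/path/0-rawreads/cns-runs/0001/consensus.0001.fasta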
    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_BUILD_SCRIPT,
        inputs={
            'config': general_config_fn,
            'input_fofn': preads_fofn_fn,
        },
        outputs={
            'length_cutoff': length_cutoff_pr_fn,
            'db': preads_db_fn,  # Also .preads.*, of course.
        },
        parameters=dict(),
        rule_writer=rule_writer,
        dist=Dist(NPROC=1),
    ))

    # Run daligner.
    wf.max_jobs = config['job.step.pda'].get('njobs', default_njobs)
    daligner_all_units_fn = os.path.join(
        pread_dir, 'daligner-split', 'all-units-of-work.json')
    daligner_bash_template_fn = os.path.join(
        pread_dir, 'daligner-split', 'daligner_bash_template.sh')
    params = dict(parameters)
    params['skip_checks'] = int(general_config.get('skip_checks', 0))
    params['wildcards'] = 'dal1_id'
    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_DALIGNER_SPLIT_SCRIPT,
        inputs={
            'config': general_config_fn,
            'db': preads_db_fn,  # not tan, yet
            'length_cutoff': length_cutoff_pr_fn,
        },
        outputs={
            'split': daligner_all_units_fn,
            'bash_template': daligner_bash_template_fn,
        },
        parameters=params,
        rule_writer=rule_writer,
        dist=Dist(local=True, NPROC=4),  # really, NPROC=1, but we need to know the max
    ))

    gathered_fn = os.path.join(
        pread_dir, 'daligner-gathered', 'gathered-done-files.json')
    gen_parallel_tasks(
        wf, rule_writer,
        daligner_all_units_fn, gathered_fn,
        run_dict=dict(
            bash_template_fn=daligner_bash_template_fn,
            script=pype_tasks.TASK_DB_DALIGNER_APPLY_SCRIPT,  # for snakemake stuff
            inputs={
                'units_of_work': os.path.join(
                    pread_dir, 'daligner-chunks/{dal1_id}/some-units-of-work.json'),
            },
            outputs={
                'results': os.path.join(
                    pread_dir, 'daligner-runs/{dal1_id}/some-done-files.json'),
            },
            parameters={},
        ),
        dist=Dist(NPROC=4, MB=4000, job_dict=config['job.step.pda']),
    )

    gathered_las_fn = os.path.join(
        pread_dir, 'daligner-combine', 'gathered-las.json')
    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_DALIGNER_COMBINE_SCRIPT,
        inputs={
            'config': general_config_fn,
            'db': preads_db_fn,  # not r_db_tan_fn
            'gathered': gathered_fn,
        },
        outputs={
            'las_paths': gathered_las_fn,
        },
        parameters={},
        rule_writer=rule_writer,
        #dist=Dist(NPROC=1, MB=4000, job_dict=config['job.step.pda'])
        dist=Dist(local=True),
    ))
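    # Wildcard naming convention, as used throughout this function: stage-0
    # tasks use dal0_id/mer0_id/tan0_id/cns0_id, stage-1 (pread) tasks use
    # dal1_id/mer1_id, so the chunk directories of the two stages cannot
    # collide even though the layouts under 0-rawreads/ and 1-preads_ovl/
    # are otherwise identical.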
    # Merge .las files.
    wf.max_jobs = config['job.step.pla'].get('njobs', default_njobs)
    las_merge_all_units_fn = os.path.join(
        pread_dir, 'las-merge-split', 'all-units-of-work.json')
    bash_template_fn = os.path.join(
        pread_dir, 'las-merge-split', 'las-merge-bash-template.sh')
    params = dict(parameters)
    params['db_prefix'] = 'preads'
    params['wildcards'] = 'mer1_id'
    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_LAMERGE_SPLIT_SCRIPT,
        inputs={
            'config': general_config_fn,
            'las_paths': gathered_las_fn,
        },
        outputs={
            'split': las_merge_all_units_fn,
            'bash_template': bash_template_fn,
        },
        parameters=params,
        rule_writer=rule_writer,
        dist=Dist(local=True),
    ))

    gathered_fn = os.path.join(
        pread_dir, 'las-merge-gathered', 'gathered.json')
    gen_parallel_tasks(
        wf, rule_writer,
        las_merge_all_units_fn, gathered_fn,
        run_dict=dict(
            bash_template_fn=bash_template_fn,
            script=pype_tasks.TASK_DB_LAMERGE_APPLY_SCRIPT,  # for snakemake
            inputs={
                # Was {mer0_id}; this stage splits on 'mer1_id' (see params above).
                'units_of_work': os.path.join(
                    pread_dir, 'las-merge-chunks/{mer1_id}/some-units-of-work.json'),
            },
            outputs={
                'results': os.path.join(
                    pread_dir, 'las-merge-runs/{mer1_id}/some-las-paths.json'),
            },
            parameters={},
        ),
        dist=Dist(NPROC=1, job_dict=config['job.step.la']),
    )

    p_id2las_fn = os.path.join(
        pread_dir, 'las-merge-combine', 'block2las.json')
    las_fofn_fn = os.path.join(
        pread_dir, 'las-merge-combine', 'las_fofn.json')
    wf.addTask(gen_task(
        script=pype_tasks.TASK_DB_LAMERGE_COMBINE_SCRIPT,
        inputs={
            'config': general_config_fn,
            'gathered': gathered_fn,
        },
        outputs={
            'block2las': p_id2las_fn,
            'las_paths': las_fofn_fn,
        },
        parameters={},
        rule_writer=rule_writer,
        dist=Dist(local=True),
    ))

    wf.max_jobs = config['job.step.asm'].get('njobs', default_njobs)
    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done_fn = os.path.join(db2falcon_dir, 'db2falcon_done')
    preads4falcon_fn = os.path.join(db2falcon_dir, 'preads4falcon.fasta')
    wf.addTask(gen_task(
        script=pype_tasks.TASK_RUN_DB_TO_FALCON_SCRIPT,
        inputs={
            'p_id2las': p_id2las_fn,
            'preads_db': preads_db_fn,
        },
        outputs={
            'job_done': db2falcon_done_fn,
            'preads4falcon': preads4falcon_fn,
        },
        parameters={},
        rule_writer=rule_writer,
        dist=Dist(NPROC=4, job_dict=config['job.step.asm']),
    ))

    falcon_asm_done_fn = os.path.join(falcon_asm_dir, 'falcon_asm_done')
    for key in ('overlap_filtering_setting', 'length_cutoff_pr',
                'fc_ovlp_to_graph_option'):
        parameters[key] = general_config[key]
    wf.addTask(gen_task(
        script=pype_tasks.TASK_RUN_FALCON_ASM_SCRIPT,
        inputs={
            'db2falcon_done': db2falcon_done_fn,
            'db_file': preads_db_fn,
            'preads4falcon_fasta': preads4falcon_fn,
            'las_fofn': las_fofn_fn,
            'config': general_config_fn,
        },
        outputs={'falcon_asm_done': falcon_asm_done_fn},
        parameters=parameters,
        rule_writer=rule_writer,
        dist=Dist(NPROC=4, job_dict=config['job.step.asm']),
    ))

    wf.refreshTargets()

    with io.cd('0-rawreads'):
        # for backwards-compatibility
        io.symlink('las-merge-combine', 'las-gather')
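# A minimal sketch of how run() is typically driven (hypothetical harness; the
# real entry point constructs the workflow, config, and rule_writer elsewhere):
#
#   config = io.deserialize(config_fn)  # must equal the config passed in
#   run(wf, config, rule_writer, config_fn, input_fofn_plf)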