def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    """Build the read-to-contig map under <asm_dir>/read_maps.

    Runs a small two-stage workflow (blocking at each refreshTargets):
      1. dump raw-read ids and pread ids from the two DAZZ_DB databases;
      2. combine those id files with the assembly graph files
         (sg_edges_list, utg_data, ctg_paths) into 'read_to_contig_map'.
    """
    read_map_dir = os.path.abspath(os.path.join(asm_dir, 'read_maps'))
    make_dirs(read_map_dir)
    wf = PypeProcWatcherWorkflow(max_jobs=12, )
    # Dead string below preserves the previously-configured workflow kwargs.
    """ job_type=config['job_type'], job_queue=config['job_queue'], sge_option=config.get('sge_option', ''), watcher_type=config['pwatcher_type'], watcher_directory=config['pwatcher_directory']) """
    # Stage 1a: dump raw-read ids.
    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, 'raw_reads.db'))
    rawread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids'))
    task = PypeTask(inputs={'rawread_db': rawread_db},
                    outputs={'rawread_id_file': rawread_id_file},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/dump_rawread_ids')
    wf.addTask(task(dump_rawread_ids))
    # Stage 1b: dump pread ids.
    pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
    pread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids'))
    task = PypeTask(inputs={'pread_db': pread_db},
                    outputs={'pread_id_file': pread_id_file},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/dump_pread_ids')
    wf.addTask(task(dump_pread_ids))
    wf.refreshTargets()  # block
    # Stage 2: combine id files with assembly-graph outputs.
    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, 'sg_edges_list'))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, 'utg_data'))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, 'ctg_paths'))
    inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'sg_edges_list': sg_edges_list,
        'utg_data': utg_data,
        'ctg_paths': ctg_paths
    }
    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, 'get_ctg_read_map', 'read_to_contig_map'))
    task = PypeTask(inputs=inputs,
                    outputs={'read_to_contig_map': read_to_contig_map},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/get_ctg_read_map')
    wf.addTask(task(generate_read_to_ctg_map))
    wf.refreshTargets()  # block
def create_tasks_pbalign(chunk_json_pfn, referenceset_pfn, parameters):
    """Create a pbalign task for each chunk, plus a gathering task.

    Returns (tasks, gathered_alignmentset_pfn), where the second element is
    the single alignmentset produced by the gather task.
    """
    tasks = list()
    gathering = dict()
    chunk_dir = os.path.dirname(fn(chunk_json_pfn))
    for i, subreadset_fn in enumerate(
            sorted(
                yield_pipeline_chunk_names_from_json(open(fn(chunk_json_pfn)),
                                                     '$chunk.subreadset_id'))):
        wdir = 'run-pbalign-{:02d}'.format(i)
        # Chunk names in the JSON are taken relative to the JSON's directory.
        subreadset_fn = os.path.join(chunk_dir,
                                     os.path.basename(subreadset_fn))
        subreadset_pfn = makePypeLocalFile(subreadset_fn)
        unmapped_pfn = makePypeLocalFile(
            '{wdir}/unmapped.txt'.format(**locals()))
        alignmentset_pfn = makePypeLocalFile(
            '{wdir}/align.subreads.{i:02d}.alignmentset.xml'.format(
                **locals()))
        gathering['unmapped_{:02d}'.format(i)] = unmapped_pfn
        gathering['alignmentsets_{:02d}'.format(i)] = alignmentset_pfn
        # Also produces:
        #   aligned.subreads.i.alignmentset.bam
        #   aligned.subreads.i.alignmentset.bam.bai
        #   aligned.subreads.i.alignmentset.bam.pbi
        make_task = PypeTask(
            inputs={
                "chunk_json": chunk_json_pfn,
                "dataset": subreadset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "alignmentset": alignmentset_pfn,
                "unmapped": unmapped_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_pbalign)
        tasks.append(task)
    o_alignmentset_pfn = makePypeLocalFile(
        'run-pbalign_gather/aligned.subreads.alignmentset.xml')
    o_unmapped_pfn = makePypeLocalFile('run-pbalign_gather/unmapped.txt')
    make_task = PypeTask(
        inputs=gathering,
        outputs={
            "o_ds": o_alignmentset_pfn,
            "o_unmapped": o_unmapped_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_gather)
    tasks.append(task)
    # BUGFIX: return the gathered alignmentset, not the loop's last per-chunk
    # alignmentset. Callers feed this into gc-scatter/reporting as the
    # "gathered_alignmentset", and the old return value also left the gather
    # task out of the downstream dependency chain.
    return tasks, o_alignmentset_pfn
def create_consensus_tasks(basedir, scatter_fn):
    """Create one consensus task per section of the scattered JSON.

    Returns (consensus_tasks, consensus_out), where consensus_out maps
    'cjob_<id>' -> the section's 'out_file' output (for the gather step).
    """
    consensus_tasks = []
    consensus_out = {}
    content = json.loads(open(scatter_fn).read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        aligner = parameters['config']['aligner']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = int(parameters['job_id'])
        cns_label = 'cns_%05d' % int(p_id)
        wdir = os.path.join(basedir, 'preads', cns_label)
        # NOTE(review): input paths are wrapped as PypeLocalFiles only for
        # minialign; other aligners apparently pass raw paths — confirm this
        # asymmetry is intentional.
        if aligner == 'minialign':
            for input_fn, input_fpath in inputs.items():
                inputs[input_fn] = makePypeLocalFile(input_fpath)
        make_c_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        c_task = make_c_task(pype_tasks.task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out['cjob_%d' % p_id] = outputs['out_file']
    return consensus_tasks, consensus_out
def create_merge_tasks(basedir, scatter_fn):
    """Create one las-merge (daligner) or aligner task per scattered section.

    Returns (tasks, p_ids_merged_las), mapping each job_id to the task's
    merged-las / overlap output (kept as PypeLocalFiles since paths are
    relative).
    """
    tasks = []
    p_ids_merged_las = {}  # for consensus
    content = json.loads(open(scatter_fn).read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        aligner = parameters['config']['aligner']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = parameters['job_id']
        #merge_script = parameters['merge_script']
        #sge_option = parameters['sge_option']
        # NOTE(review): '%05d' assumes job_id is an int in the JSON — confirm
        # (cf. the explicit int() cast in create_consensus_tasks).
        wdir = os.path.join(basedir, 'm_%05d' % p_id)
        # NOTE(review): inputs become PypeLocalFiles only for minialign —
        # confirm other aligners really expect raw paths here.
        if aligner == 'minialign':
            for input_fn, input_fpath in inputs.items():
                inputs[input_fn] = makePypeLocalFile(input_fpath)
        make_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        task = make_task(pype_tasks.task_run_las_merge
                         if aligner == 'daligner'
                         else pype_tasks.task_run_aligner)
        tasks.append(task)
        # these are relative, so we need the PypeLocalFiles
        ovl_file = task.outputs[
            'merged_las' if aligner == 'daligner' else 'ovl_fn']
        p_ids_merged_las[p_id] = ovl_file
    return tasks, p_ids_merged_las
def create_daligner_tasks(basedir, scatter_fn):
    """Build one daligner task per section of the scattered-JSON file.

    Returns (tasks, tasks_out); tasks_out maps 'ajob_<uid>' to each task's
    'job_done' PypeLocalFile.
    """
    try:
        with open(scatter_fn) as stream:
            content = json.loads(stream.read())  # array of descriptions
    except Exception:
        msg = 'Failed to read JSON from {!r}'.format(scatter_fn)
        LOG.exception(msg)
        raise Exception(msg)
    tasks = []
    tasks_out = {}
    for section in content:
        parameters = section['parameters']
        inputs = section['inputs']
        inputs['scatter_fn'] = os.path.abspath(scatter_fn)
        outputs = section['outputs']
        URL = section['URL']
        job_uid = parameters['job_uid']
        wdir = os.path.join(basedir, 'job_%s' % job_uid)
        maker = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        dal_task = maker(pype_tasks.task_run_daligner)
        tasks.append(dal_task)
        # The 'job_done' path is relative, so keep the PypeLocalFile itself.
        tasks_out['ajob_%s' % job_uid] = dal_task.outputs['job_done']
    return tasks, tasks_out
def create_merge_tasks(basedir, scatter_fn):
    """Build one las-merge task per section of the scattered-JSON file.

    Returns (tasks, p_ids_merged_las); the dict maps each job_id to the
    task's 'merged_las' PypeLocalFile (paths are relative, so we keep the
    file objects rather than plain strings).
    """
    with open(scatter_fn) as stream:
        content = json.loads(stream.read())  # array of descriptions
    tasks = []
    p_ids_merged_las = {}  # for consensus
    for section in content:
        parameters = section['parameters']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = parameters['job_id']
        #merge_script = parameters['merge_script']
        #sge_option = parameters['sge_option']
        wdir = os.path.join(basedir, 'm_%05d' % p_id)
        maker = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        merge_task = maker(pype_tasks.task_run_las_merge)
        tasks.append(merge_task)
        p_ids_merged_las[p_id] = merge_task.outputs['merged_las']
    return tasks, p_ids_merged_las
def get_read_hctg_map(asm_dir, hasm_dir, read_to_contig_map_fn):
    """Run a one-task workflow mapping reads to haplotig contigs.

    Combines the previously-dumped raw-read/pread id files with the
    haplotig edge/id files, writing 'read_to_contig_map' at
    read_to_contig_map_fn. Blocks until complete.
    """
    wf = PypeProcWatcherWorkflow(
        max_jobs=12,  # TODO: Why was NumThreads ever set? There is only one task!
    )
    rawread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, 'read_maps/dump_rawread_ids/rawread_ids'))
    pread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, 'read_maps/dump_pread_ids/pread_ids'))
    h_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_h_ctg_edges'))
    p_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_p_ctg_edges'))
    h_ctg_ids = makePypeLocalFile(os.path.join(hasm_dir, "all_h_ctg_ids"))
    #make_dirs(os.path.dirname(os.path.abspath(read_to_contig_map_fn)) # Workflow does this.
    read_to_contig_map_plf = makePypeLocalFile(read_to_contig_map_fn)
    inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'h_ctg_edges': h_ctg_edges,
        'p_ctg_edges': p_ctg_edges,
        'h_ctg_ids': h_ctg_ids
    }
    make_task = PypeTask(
        inputs=inputs,
        outputs={'read_to_contig_map': read_to_contig_map_plf},
    )
    wf.addTask(make_task(generate_read_to_hctg_map))
    wf.refreshTargets()  # block
def create_quiver_jobs(scattered_quiver_plf):
    """Create one Quiver task per contig listed in the scattered-quiver JSON.

    NOTE(review): relies on module-level globals `wf`, `config`, and
    `ctg_types` — confirm these are set before this is called.

    Returns (p_ctg_out, h_ctg_out, job_done_plfs):
      * p_ctg_out / h_ctg_out: [(cns_fasta_fn, cns_fastq_fn), ...] split by
        contig type 'p' vs 'h';
      * job_done_plfs: ctg_id -> job-done sentinel PypeLocalFile.
    """
    scattered_quiver_fn = fn(scattered_quiver_plf)
    jobs = json.loads(open(scattered_quiver_fn).read())
    #ctg_ids = sorted(jobs['ref_seq_data'])
    p_ctg_out = []
    h_ctg_out = []
    job_done_plfs = {}
    for job in jobs:
        ctg_id = job['ctg_id']
        # Haplotig ids look like '<primary>-...' — group by the primary part.
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(
                os.getcwd(), './4-quiver/reads/'
                '{ctg_id}.sam'.format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id=ctg_id)))
        # Contigs without a SAM are skipped entirely.
        if os.path.exists(
                fn(read_sam
                   )):  # TODO(CD): Ask Jason what we should do if missing SAM.
            if ctg_types[ctg_id] == 'p':
                p_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            elif ctg_types[ctg_id] == 'h':
                h_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            else:
                LOG.warning(
                    'Type is {!r}, not "p" or "h". Why are we running Quiver?'.
                    format(ctg_types[ctg_id]))
            parameters = {
                'job_uid': 'q-' + ctg_id,
                'wd': wd,
                'config': config,
                'ctg_id': ctg_id
            }
            make_quiver_task = PypeTask(
                inputs={
                    'ref_fasta': ref_fasta,
                    'read_sam': read_sam,
                    'scattered_quiver': scattered_quiver_plf,
                },
                outputs={
                    'cns_fasta': cns_fasta,
                    'cns_fastq': cns_fastq,
                    'job_done': job_done
                },
                parameters=parameters,
            )
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)
            job_done_plfs['{}'.format(ctg_id)] = job_done
    #sge_quiver = config['sge_quiver']
    return p_ctg_out, h_ctg_out, job_done_plfs
def create_consensus_gather_task(wd, inputs):
    """Build the task that gathers consensus outputs into a preads FOFN.

    Happens only in stage-0. Returns (task, preads_fofn_plf).
    """
    fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))
    maker = PypeTask(
        inputs=inputs,  # consensus_out
        outputs={'preads_fofn': fofn_plf},
    )
    gather_task = maker(pype_tasks.task_cns_gather)
    return gather_task, fofn_plf
def create_ma_merge_gather_task(wd, inputs):
    """Build the task gathering merged overlaps into ovl.fofn.

    Returns (task, ovl_fofn_plf).
    """
    fofn_plf = makePypeLocalFile(os.path.join(wd, 'ovl.fofn'))
    maker = PypeTask(
        inputs=inputs,  # p_ids_merged_las
        outputs={'ovl_fofn': fofn_plf},
    )
    gather_task = maker(pype_tasks.task_ma_merge_gather)
    return gather_task, fofn_plf
def create_consensus_gather_task(wd, inputs):
    """Build the task that gathers consensus outputs into a preads FOFN.

    Happens only in stage-0 (old-style task API). Returns
    (task, preads_fofn_plf).
    """
    fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))
    maker = PypeTask(
        inputs=inputs,  # consensus_out
        outputs={'preads_fofn': fofn_plf},
        TaskType=MyFakePypeThreadTaskBase,
        URL='task://localhost/cns_gather',
    )
    gather_task = maker(pype_tasks.task_cns_gather)
    return gather_task, fofn_plf
def main():
    """Demo driver: configure logging, then run a tiny two-task workflow.

    NOTE(review): nesting of the logging_tree fallback was reconstructed
    from collapsed source — confirm it belongs inside the else-branch.
    """
    lfn = 'logging-cfg.json'
    if os.path.exists(lfn):
        # Optional dictConfig file overrides default logging.
        logging.config.dictConfig(json.load(open(lfn)))
    else:
        logging.basicConfig()
        logging.getLogger().setLevel(logging.NOTSET)
        try:
            # logging_tree is optional; used only for diagnostics.
            import logging_tree
            logging_tree.printout()
        except ImportError:
            pass
    log.debug('DEBUG LOGGING ON')
    log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format(
        JOB_TYPE, SLEEP_S))
    exitOnFailure = False
    concurrent_jobs = 2
    Workflow = PypeProcWatcherWorkflow
    wf = Workflow(job_type=JOB_TYPE)
    wf.max_jobs = concurrent_jobs
    par = dict(sleep_s=SLEEP_S)
    DIR = 'mytmp'
    makedirs(DIR)
    f0 = makePypeLocalFile('mytmp/f0')
    f1 = makePypeLocalFile('mytmp/f1')
    # Task 0 produces f0; task 1 consumes f0 and produces f1.
    make_task = PypeTask(
        inputs={},
        outputs={'f0': f0},
        parameters=par,
    )
    task = make_task(taskrun0)
    wf.addTasks([task])
    make_task = PypeTask(
        inputs={'f0': f0},
        outputs={'f1': f1},
        parameters=par,
    )
    task = make_task(taskrun1)
    wf.addTasks([task])
    wf.refreshTargets([task])
def create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters):
    """Create parallel bam2fasta tasks (one per chunk) plus a FOFN task.

    Returns (tasks, fasta_fofn_pfn); the FOFN lists the produced fastas.
    """
    tasks = list()
    next_inputs = dict()
    topdir = os.path.join(os.path.dirname(fn(split_subreadsets_fofn_pfn)),
                          'run-fastas2fofn')
    # Create the fastas in parallel.
    for i, chunk_fn in enumerate(
            open(fn(split_subreadsets_fofn_pfn)).read().splitlines()):
        wdir = os.path.join(topdir, 'fasta_job_{:03d}'.format(i))  # TODO: 02
        chunk_pfn = makePypeLocalFile(os.path.join(wdir, chunk_fn))
        fasta_done_fn = os.path.join(wdir,
                                     'chunk_{:03d}_done'.format(i))  # TODO: 02
        # By depending on a sentinel, we are allowed to delete fastas later.
        # Note: i might not match num in chunk_fn, but that is ok
        fasta_done_pfn = makePypeLocalFile(fasta_done_fn)
        make_task = PypeTask(
            inputs={
                "dataset": chunk_pfn,
            },
            outputs={
                "fasta_done": fasta_done_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_bam2fasta_dexta)
        tasks.append(task)
        next_inputs['fasta_{}_done'.format(i)] = fasta_done_pfn
        #fasta_fn = base_from_done(fasta_done_fn) + '.fasta' # By convention.
    # Create the FOFN of fastas.
    fasta_fofn_fn = os.path.join(topdir, 'fasta.fofn')
    fasta_fofn_pfn = makePypeLocalFile(fasta_fofn_fn)
    make_task = PypeTask(
        inputs=next_inputs,
        outputs={
            "fofn": fasta_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_fastas2fofn)
    tasks.append(task)
    return tasks, fasta_fofn_pfn
def run(
        wf,
        config,
):
    """Drive a small demo workflow: hey0 -> hey1, plus N 'touch' tasks.

    N comes from the environment (default 1).
    """
    exitOnFailure = True
    #try:
    #    # Make it always re-run.
    #    os.remove('out.txt')
    #except Exception:
    #    LOG.exception('could not remove out.txt')
    o0 = makePypeLocalFile('hey0/out.txt')
    maker = PypeTask(
        inputs={},
        outputs={'o0': o0},
        parameters={},
    )
    t0 = maker(mymod.say_hey0)
    o1 = makePypeLocalFile('hey1/out.txt')
    maker = PypeTask(
        inputs={'i0': o0},
        outputs={'o1': o1},
        parameters={},
    )
    t1 = maker(mymod.say_hey1)
    wf.addTasks([t0, t1])  # for new-simple-way, we could add just t1
    num_touch = int(os.environ.get('N', '1'))
    for idx in range(num_touch):
        maker = PypeTask(
            inputs={},
            outputs={
                'out': 'touched',
            },
            #outputs = {'out': 'hey-{}/touched'.format(i),},
            parameters={},
            wdir='hey-{}'.format(idx),
        )
        wf.addTask(maker(mymod.touchit))
    wf.refreshTargets(exitOnFailure=exitOnFailure)
def create_merge_gather_task(wd, inputs):
    """Build the task gathering merged .las files into las.fofn/las.fopfn.

    Returns (task, las_fofn_plf, las_fopfn_plf).
    """
    fofn_plf = makePypeLocalFile(os.path.join(wd, 'las.fofn'))
    fopfn_plf = makePypeLocalFile(os.path.join(wd, 'las.fopfn'))
    maker = PypeTask(
        inputs=inputs,  # p_ids_merged_las
        outputs={
            'las_fofn': fofn_plf,
            'las_fopfn': fopfn_plf,
        },
        TaskType=MyFakePypeThreadTaskBase,
    )  # URL = 'task://localhost/pmerge_gather')
    gather_task = maker(pype_tasks.task_merge_gather)
    return gather_task, fofn_plf, fopfn_plf
def create_task_old():
    """Build a one-input/one-output taskA instance (old-style helper)."""
    in_plf = makePypeLocalFile('./in/i1')
    out_plf = makePypeLocalFile('./run/dir1/o1.txt')
    maker = PypeTask(
        inputs={'i1': in_plf},
        outputs={'o1': out_plf},
        parameters={},
    )
    return maker(taskA)
def gen_task(script, inputs, outputs, parameters=None):
    """Wrap a bash script as a generic PypeTask.

    Args:
        script: bash text; stored under the '_bash_' parameter key.
        inputs/outputs: dicts of name -> filename. Names must be legal
            Python identifiers (validated below via namedtuple).
        parameters: optional dict of extra task parameters.

    Returns the constructed task (task_generic_bash_script).
    """
    # BUGFIX: the default used to be a shared mutable dict (parameters={})
    # that was mutated below, leaking '_bash_' across calls; copying also
    # avoids mutating a caller-supplied dict.
    parameters = dict(parameters) if parameters is not None else {}

    def validate_dict(mydict):
        "Python identifiers are illegal as keys."
        try:
            collections.namedtuple('validate', mydict.keys())
        except ValueError:
            LOG.exception(
                'Bad key name in task definition dict {!r}'.format(mydict))
            raise
    validate_dict(inputs)
    validate_dict(outputs)
    validate_dict(parameters)
    parameters['_bash_'] = script
    make_task = PypeTask(
        # .items() rather than .iteritems() for Python 2/3 compatibility.
        inputs={k: makePypeLocalFile(v) for k, v in inputs.items()},
        outputs={k: makePypeLocalFile(v) for k, v in outputs.items()},
        parameters=parameters,
    )
    return make_task(task_generic_bash_script)
def create_daligner_tasks(basedir, scatter_fn):
    """Build one daligner task per section of the scattered-JSON file.

    Returns (tasks, tasks_out); tasks_out maps 'ajob_<uid>' to each task's
    'job_done' PypeLocalFile.
    """
    with open(scatter_fn) as stream:
        content = json.loads(stream.read())  # array of descriptions
    tasks = []
    tasks_out = {}
    for section in content:
        parameters = section['parameters']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        job_uid = parameters['job_uid']
        wdir = os.path.join(basedir, 'job_%s' % job_uid)
        maker = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        dal_task = maker(pype_tasks.task_run_daligner)
        tasks.append(dal_task)
        # The 'job_done' path is relative, so keep the PypeLocalFile itself.
        tasks_out['ajob_%s' % job_uid] = dal_task.outputs['job_done']
    return tasks, tasks_out
def create_consensus_tasks(basedir, scatter_fn):
    """Create one consensus task per scattered-JSON section (old-style API).

    Returns (consensus_tasks, consensus_out), where consensus_out maps
    'cjob_<id>' to the section's 'out_file' output for gathering.
    """
    with open(scatter_fn) as stream:
        content = json.loads(stream.read())  # array of descriptions
    consensus_tasks = []
    consensus_out = {}
    for section in content:
        parameters = section['parameters']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = int(parameters['job_id'])
        cns_label = 'cns_%05d' % int(p_id)
        wdir = os.path.join(basedir, 'preads', cns_label)
        maker = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            TaskType=MyFakePypeThreadTaskBase,
            URL=URL,
            wdir=wdir,
        )
        consensus_tasks.append(maker(pype_tasks.task_run_consensus))
        consensus_out['cjob_%d' % p_id] = outputs['out_file']
    return consensus_tasks, consensus_out
def flow(config):
    """Top-level HGAP flow: filter -> scatter -> FALCON -> pbalign -> GC -> report.

    Each wf.refreshTargets() call blocks until the tasks added so far finish.
    """
    #import pdb; pdb.set_trace()
    parameters = config
    #exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs
    #wf.refreshTargets(exitOnFailure=exitOnFailure)
    #wf = PypeThreadWorkflow()
    #wf = PypeWorkflow()
    #wf = PypeWorkflow(job_type='local')
    log.debug('config=\n{}'.format(pprint.pformat(config)))
    # Set some defaults on the Workflow.
    concurrent_jobs = 24  # TODO: Configure this.
    wf = PypeWorkflow(
        job_type=config['hgap'].get('job_type'),
        job_queue=config['hgap'].get('job_queue'),
        watcher_type=config['hgap'].get('pwatcher_type', 'blocking'),
        #watcher_directory=config['pwatcher_directory'],
        max_jobs=config['hgap'].get('max_jobs', concurrent_jobs),
    )
    use_tmpdir = config['hgap'].get('use_tmpdir')
    if use_tmpdir:
        log.info('hgap.use_tmpdir={!r}'.format(use_tmpdir))
        # A string containing '/' is treated as an explicit tmpdir path.
        if use_tmpdir is not True and '/' in use_tmpdir:
            tempfile.tempdir = use_tmpdir
            log.info('Using tempfile.tempdir={}'.format(tempfile.tempdir))
        else:
            log.info('Keeping tempfile.tempdir={}'.format(tempfile.tempdir))
    # Stage: filter the input subreads.
    dataset_pfn = makePypeLocalFile(config['pbsmrtpipe']['input_files'][0])
    filtered_pfn = makePypeLocalFile('run-filterbam/filtered.subreadset.xml')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
        },
        outputs={
            "filtered": filtered_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_filterbam)
    wf.addTask(task)
    # Stage: scatter the filtered subreads into chunks.
    split_subreadsets_fofn_pfn = makePypeLocalFile(
        'run-bam_scatter/chunked_subreadsets.fofn')
    make_task = PypeTask(
        inputs={
            "dataset": filtered_pfn,
        },
        outputs={
            "split_subreadsets_fofn": split_subreadsets_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_bam_scatter)
    wf.addTask(task)
    wf.refreshTargets()
    # Stage: convert chunks to fastas, gathered into a FOFN.
    tasks, input_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn,
                                                  parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()
    # Stage: prepare FALCON config from the fasta FOFN.
    fc_cfg_pfn = makePypeLocalFile('run-falcon/fc.cfg')
    fc_json_config_pfn = makePypeLocalFile('run-falcon/fc.json')
    make_task = PypeTask(
        inputs={
            "input_fofn": input_fofn_pfn,
        },
        outputs={
            "fc_cfg": fc_cfg_pfn,
            "fc_json_config": fc_json_config_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_prepare_falcon)
    wf.addTask(task)
    wf.refreshTargets()
    input_config_fn = fn(fc_cfg_pfn)
    with sys.cd('run-falcon'):
        falcon_kit.mains.run1.fc_run_logger = falcon_kit.run_support.logger = logging.getLogger(
            'falcon')
        fc_cfg = falcon_kit.run_support.get_dict_from_old_falcon_cfg(
            falcon_kit.run_support.parse_config(input_config_fn))
        # FALCON takes over the workflow for a while.
        # (For debugging, it is still possible to restart just fc_run, if desired.)
        falcon_asm_done_pfn = falcon_kit.mains.run1.run(
            wf,
            fc_cfg,
            input_config_fn,
            input_fofn_plf=input_fofn_pfn,  # _pfn should be _plf, but oh well
        )
    wf.max_jobs = concurrent_jobs  # in case Falcon changed this
    # Here is a hard-linking task to help us attach falcon into the dependency graph.
    falcon_link_done_pfn = makePypeLocalFile(
        'run-falcon_link/falcon_link_done')
    make_task = PypeTask(
        inputs={
            "falcon_asm_done": falcon_asm_done_pfn,
        },
        outputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_falcon_link)
    wf.addTask(task)
    # The rest of the workflow will operate on datasets, not fasta directly.
    referenceset_pfn = makePypeLocalFile(
        'run-fasta2referenceset/asm.referenceset.xml')
    make_task = PypeTask(
        inputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        outputs={
            "referenceset": referenceset_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_fasta2referenceset)
    wf.addTask(task)
    wf.refreshTargets()
    # scatter the subreads for pbalign
    """Produces: pbalign_chunk.json chunk_subreadset_*.subreadset.xml """
    pbalign_chunk_json_pfn = makePypeLocalFile(
        'run-pbalign-scatter/pbalign_chunk.json')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_json": pbalign_chunk_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_scatter)
    wf.addTask(task)
    wf.refreshTargets()
    # After scattering, we can specify the pbalign jobs.
    tasks, alignmentset_pfn = create_tasks_pbalign(pbalign_chunk_json_pfn,
                                                   referenceset_pfn,
                                                   parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()
    # scatter the alignmentset for genomic_consensus (variantCaller)
    """Produces: gc.chunks.fofn ???*.congitset.xml ???
    """
    gc_chunks_fofn_pfn = makePypeLocalFile('run-gc_scatter/gc.chunks.fofn')
    make_task = PypeTask(
        inputs={
            "alignmentset": alignmentset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_fofn": gc_chunks_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_scatter)
    wf.addTask(task)
    wf.refreshTargets()
    tasks, contigset_pfn, gathered_fastq_pfn = create_tasks_gc(
        gc_chunks_fofn_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()
    # Final report
    polished_assembly_report_json_pfn = makePypeLocalFile(
        'run-polished-assembly-report/polished_assembly_report.json')
    make_task = PypeTask(
        inputs={
            "referenceset": referenceset_pfn,
            "gathered_alignmentset": alignmentset_pfn,
            "polished_fastq": gathered_fastq_pfn,
        },
        outputs={
            "report_json": polished_assembly_report_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_polished_assembly_report)
    wf.addTask(task)
    wf.refreshTargets()
    # Symlink the report's plots into the current directory.
    par_dir = os.path.dirname(fn(polished_assembly_report_json_pfn))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality.png'))
    sys.symlink(os.path.join(par_dir,
                             'polished_coverage_vs_quality_thumb.png'))
    #return
    ##############
    # NOTE(review): everything below looks like leftover debug scaffolding.
    if not os.path.exists('foo.bar1'):
        sys.system('touch foo.bar1')
    foo_fn1 = makePypeLocalFile('foo.bar1')
    foo_fn2 = makePypeLocalFile('foo.bar2')
    make_task = PypeTask(
        inputs={
            "foo1": foo_fn1,
        },
        outputs={
            "foo2": foo_fn2,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_foo)
    wf.addTask(task)
    wf.refreshTargets()
def create_tasks_gc(fofn_pfn, referenceset_pfn, parameters):
    """Create a gc task for each chunk, plus a gathering task.

    Here is the convoluted workflow:
    1. For each gc instance "chunk":
       A. variantCaller writes .fasta
       B. We create a contigset for the .fasta
    2. We keep the contigset output filenames in a FOFN (from run_gc_scatter)
       and pass that to run_gc_gather().
    3. We read each contigset and add them to a gathered ContigSet.
    4. We "consolidate" their underlying .fasta "resources", assuming their
       filenames match except extenion.
    5. Finally, we write the gathered contigset. Whew!

    We also gather fastq here, for convenience.
    """
    tasks = list()
    contigsets = dict()
    fastqs = dict()
    # Assume fofn of gc chunks are all relative to the dir of the fofn.
    for i, alignmentset_bn in enumerate(open(fn(fofn_pfn)).read().split()):
        alignmentset_fn = os.path.join(os.path.dirname(fn(fofn_pfn)),
                                       alignmentset_bn)
        wdir = 'run-gc-{:02}'.format(i)
        mkdirs(wdir)  # Assume CWD is correct.
        alignmentset_pfn = makePypeLocalFile(
            alignmentset_fn)  # New pfn cuz it was not pfn before.
        polished_fastq_pfn = makePypeLocalFile(
            os.path.join(wdir, 'consensus.fastq'))
        variants_gff_pfn = makePypeLocalFile(os.path.join(
            wdir, 'variants.gff'))
        consensus_contigset_pfn = makePypeLocalFile(
            os.path.join(wdir, 'consensus.contigset.xml'))
        # Also produces: consensus.fasta, consensus.fasta.fai.
        # Note that these file names are important, as pbcoretools gathering
        # expects a particular pattern.
        contigsets['contigset_{:02d}'.format(i)] = consensus_contigset_pfn
        fastqs['fastq_{:02d}'.format(i)] = polished_fastq_pfn
        make_task = PypeTask(
            inputs={
                "alignmentset": alignmentset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "polished_fastq": polished_fastq_pfn,
                "variants_gff": variants_gff_pfn,
                "consensus_contigset": consensus_contigset_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_genomic_consensus)
        tasks.append(task)
    # Gather step: combine per-chunk contigsets and fastqs.
    contigset_pfn = makePypeLocalFile('run-gc-gather/contigset.xml')
    gathered_fastq_pfn = makePypeLocalFile('run-gc-gather/gathered.fastq')
    inputs = dict(contigsets)
    inputs.update(fastqs)
    log.debug('inputs to gc_gather:{}'.format(pprint.pformat(contigsets)))
    make_task = PypeTask(
        inputs=inputs,
        outputs={
            "ds_out": contigset_pfn,
            "fastq_out": gathered_fastq_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_gather)
    tasks.append(task)
    return tasks, contigset_pfn, gathered_fastq_pfn
rank += 1 phased_reads = makePypeLocalFile(os.path.join(asm_dir, "all_phased_reads")) for las_key, las_file in all_raw_las_files.items(): las_fn = fn(las_file) idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number idx = int(idx.split(".")[1]) rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx)) make_dump_rawread_to_ctg = PypeTask( inputs = { "las_file": las_file, "rawread_db": rawread_db, "read_to_contig_map": read_to_contig_map, "rawread_id_file": rawread_id_file, "pread_id_file": pread_id_file, "phased_reads" : phased_reads}, outputs = { "rawread_to_contig_file": rawread_to_contig_file }, TaskType = PypeThreadTaskBase, URL = "task://localhost/r_read_to_contigs.%s" % idx ) dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg) wf.addTask( dump_rawread_to_ctg_task ) for las_key, las_file in all_pread_las_files.items(): las_fn = fn(las_file) idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number idx = int(idx.split(".")[1]) pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx)) make_dump_pread_to_ctg = PypeTask( inputs = { "las_file": las_file, "pread_db": pread_db, "read_to_contig_map": read_to_contig_map,
def run(
        wf,
        config,
        input_config_fn,
        input_fofn_plf,
):
    """
    Drive FALCON stages 0-2: raw-read overlap/consensus, pread overlap,
    and draft assembly. Returns the 'falcon_asm_done' sentinel PypeLocalFile.

    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath('./0-rawreads')
    pread_dir = os.path.abspath('./1-preads_ovl')
    falcon_asm_dir = os.path.abspath('./2-asm-falcon')
    script_dir = os.path.abspath('./scripts')
    sge_log_dir = os.path.abspath('./sge_log')
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir,
              sge_log_dir):
        support.make_dirs(d)
    exitOnFailure = config[
        'stop_all_jobs_on_failure']  # only matter for parallel jobs
    wf.max_jobs = config['default_concurrent_jobs']
    # Normalize the input FOFN to absolute paths.
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, 'raw-fofn-abs',
                     os.path.basename(config['input_fofn'])))
    make_fofn_abs_task = PypeTask(
        inputs={'i_fofn': input_fofn_plf},
        outputs={'o_fofn': rawread_fofn_plf},
        parameters={},
    )
    fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])
    if config['input_type'] == 'raw':
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, 'sleep_done'))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, 'rdb_build_done'))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, 'run_jobs.sh'))
        parameters = {
            'work_dir': rawread_dir,
            'sge_option': config['sge_option_da'],
            'config_fn': input_config_fn,
            'config': config
        }
        length_cutoff_plf = makePypeLocalFile(
            os.path.join(rawread_dir, 'length_cutoff'))
        raw_reads_db_plf = makePypeLocalFile(
            os.path.join(rawread_dir, '%s.db' % 'raw_reads'))
        make_build_rdb_task = PypeTask(
            inputs={'input_fofn': rawread_fofn_plf},
            outputs={
                'rdb_build_done': rdb_build_done,
                'raw_reads_db': raw_reads_db_plf,
                'length_cutoff': length_cutoff_plf,
                'run_jobs': run_jobs,
            },
            parameters=parameters,
        )
        build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb)
        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])
        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        wf.max_jobs = config['da_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'daligner-scatter',
                                     'scattered.json')
        make_daligner_scatter = PypeTask(
            inputs={
                'run_jobs_fn': run_jobs,
                'db_build_done': rdb_build_done,
            },
            outputs={
                'scatter_fn': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'nblock': raw_reads_nblock,
                'pread_aln': False,
                'config': config,
            },
        )
        task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        daligner_tasks, daligner_out = create_daligner_tasks(
            rawread_dir, scattered_plf)
        wf.addTasks(daligner_tasks)
        r_gathered_las_plf = makePypeLocalFile(
            os.path.join(rawread_dir, 'raw-gather', 'gathered_las.txt'))
        parameters = {
            'nblock': raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
            inputs=daligner_out,
            outputs={'gathered': r_gathered_las_plf},
            parameters=parameters,
        )
        check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        # Merge .las files.
        wf.max_jobs = config['la_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'merge-scatter',
                                     'scattered.json')
        make_task = PypeTask(
            inputs={
                'run_jobs': run_jobs,
                'gathered_las': r_gathered_las_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_merge_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        merge_tasks, p_ids_merged_las = create_merge_tasks(
            rawread_dir, scattered_plf)
        wf.addTasks(merge_tasks)
        task, _, las_fopfn_plf = create_merge_gather_task(
            os.path.join(rawread_dir, 'merge-gather'), p_ids_merged_las)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        if config['target'] == 'overlapping':
            sys.exit(0)
        # Produce new FOFN of preads fasta, based on consensus of overlaps.
        wf.max_jobs = config['cns_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'cns-scatter',
                                     'scattered.json')
        make_task = PypeTask(
            inputs={
                'gathered': las_fopfn_plf,
                'db': raw_reads_db_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_consensus_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        tasks, consensus_out = create_consensus_tasks(rawread_dir,
                                                      scattered_plf)
        wf.addTasks(tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        task, preads_fofn_plf = create_consensus_gather_task(
            os.path.join(rawread_dir, 'preads'), consensus_out)
        wf.addTask(task)
        # Pre-assembly stats report.
        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_plf = makePypeLocalFile(
            os.path.join(rdir, 'pre_assembly_stats.json'))
        parameters = dict(config)
        parameters['cwd'] = rdir
        make_task = PypeTask(
            inputs={
                'length_cutoff_fn': length_cutoff_plf,
                'raw_reads_db': raw_reads_db_plf,
                'preads_fofn': preads_fofn_plf,
            },
            outputs={
                'pre_assembly_report': pre_assembly_report_plf,
            },
            parameters=parameters,
        )
        task = make_task(pype_tasks.task_report_pre_assembly)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        if config['target'] == 'pre-assembly':
            log.info('Quitting after stage-0 for "pre-assembly" target.')
            sys.exit(0)
    # build pread database
    if config['input_type'] == 'preads':
        # Preads supplied directly: just normalize the FOFN.
        preads_fofn_plf = makePypeLocalFile(
            os.path.join(pread_dir, 'preads-fofn-abs',
                         os.path.basename(config['input_fofn'])))
        make_fofn_abs_task = PypeTask(
            inputs={'i_fofn': rawread_fofn_plf},
            outputs={'o_fofn': preads_fofn_plf},
            parameters={},
        )
        fofn_abs_task = make_fofn_abs_task(
            pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])
    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, 'pdb_build_done'))
    parameters = {
        'work_dir': pread_dir,
        'sge_option': config['sge_option_pda'],
        'config_fn': input_config_fn,
        'config': config
    }
    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(
        pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(
        inputs={'preads_fofn': preads_fofn_plf},
        outputs={
            'pdb_build_done': pdb_build_done,
            'preads_db': preads_db,
            'run_jobs': run_jobs,
        },
        parameters=parameters,
    )
    build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])
    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    wf.max_jobs = config['pda_concurrent_jobs']
    config['sge_option_da'] = config['sge_option_pda']
    scattered_plf = os.path.join(pread_dir, 'daligner-scatter',
                                 'scattered.json')
    make_daligner_scatter = PypeTask(
        inputs={
            'run_jobs_fn': run_jobs,
            'db_build_done': pdb_build_done,
        },
        outputs={
            'scatter_fn': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'nblock': preads_nblock,
            'pread_aln': True,
            'config': config,
        },
    )
    task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)
    daligner_tasks, daligner_out = create_daligner_tasks(
        pread_dir, scattered_plf)
    wf.addTasks(daligner_tasks)
    p_gathered_las_plf = makePypeLocalFile(
        os.path.join(pread_dir, 'gathered-las', 'gathered-las.txt'))
    parameters = {
        'nblock': preads_nblock,
    }
    make_daligner_gather = PypeTask(
        inputs=daligner_out,
        outputs={'gathered': p_gathered_las_plf},
        parameters=parameters,
    )
    check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)
    # Merge .las files.
    wf.max_jobs = config['pla_concurrent_jobs']
    config['sge_option_la'] = config['sge_option_pla']
    scattered_plf = os.path.join(pread_dir, 'merge-scatter',
                                 'scattered.json')
    make_task = PypeTask(
        inputs={
            'run_jobs': run_jobs,
            'gathered_las': p_gathered_las_plf,
        },
        outputs={
            'scattered': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'config': config,
        },
    )
    task = make_task(pype_tasks.task_merge_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)
    merge_tasks, p_ids_merged_las = create_merge_tasks(pread_dir,
                                                       scattered_plf)
    wf.addTasks(merge_tasks)
    task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(
        os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)
    # Draft assembly (called 'fc_' for now)
    wf.max_jobs = config['fc_concurrent_jobs']
    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done = makePypeLocalFile(
        os.path.join(db2falcon_dir, 'db2falcon_done'))
    preads4falcon_plf = makePypeLocalFile(
        os.path.join(db2falcon_dir, 'preads4falcon.fasta'))
    make_run_db2falcon = PypeTask(
        inputs={
            'las_fofn_plf': las_fofn_plf,
            'preads_db': preads_db,
        },
        outputs={
            'db2falcon_done': db2falcon_done,
            'preads4falcon': preads4falcon_plf,
        },
        parameters={
            'wd': db2falcon_dir,
            'config': config,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon))
    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, 'falcon_asm_done'))
    make_run_falcon_asm = PypeTask(
        inputs={
            'db2falcon_done': db2falcon_done,
            'db_file': preads_db,
            'preads4falcon': preads4falcon_plf,
            'las_fofn': las_fofn_plf,
        },
        outputs={'falcon_asm_done': falcon_asm_done},
        parameters={
            'wd': falcon_asm_dir,
            'config': config,
            'pread_dir': pread_dir,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm))
    wf.refreshTargets()
    return falcon_asm_done
def unzip_all(config):
    """Run the whole unzip pipeline for one assembly.

    Stages, each ended by a blocking wf.refreshTargets():
      1. track reads  -> ./3-unzip/reads/ctg_list (one contig id per line)
      2. per-contig blasr alignment under ./3-unzip/0-phasing/<ctg>/blasr
      3. per-contig phasing under ./3-unzip/0-phasing/<ctg>/phasing
      4. "hasm" haplotype assembly under ./3-unzip/1-hasm

    config: dict of settings; keys indexed directly below are required,
    keys fetched via .get() are optional.
    """
    unzip_blasr_concurrent_jobs = config['unzip_blasr_concurrent_jobs']
    unzip_phasing_concurrent_jobs = config['unzip_phasing_concurrent_jobs']
    # The workflow starts capped at the blasr concurrency; the cap is
    # re-assigned to the phasing concurrency before stage 3 below.
    wf = PypeProcWatcherWorkflow(
        max_jobs=unzip_blasr_concurrent_jobs,
        job_type=config['job_type'],
        job_queue=config.get('job_queue'),
        sge_option=config.get('sge_option'),
        watcher_type=config.get('pwatcher_type'),
        #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
        use_tmpdir=config.get('use_tmpdir'),
    )

    ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
    falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
    wdir = os.path.abspath('./3-unzip/reads')
    parameters = {
        'wd': wdir,
        'config': config,
        'sge_option': config['sge_track_reads'],
    }

    job_done = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_done'))
    make_track_reads_task = PypeTask(
        inputs={'falcon_asm_done': falcon_asm_done},
        outputs={
            'job_done': job_done,
            'ctg_list_file': ctg_list_file
        },
        parameters=parameters,
        wdir=wdir,
    )
    track_reads_task = make_track_reads_task(task_track_reads)
    wf.addTask(track_reads_task)
    # Block here so ctg_list exists before it is read just below.
    wf.refreshTargets()  # force refresh now, will put proper dependence later

    # ctg_list (written by the track-reads task) holds one contig id per line.
    ctg_ids = []
    with open('./3-unzip/reads/ctg_list') as f:
        for row in f:
            row = row.strip()
            ctg_ids.append(row)

    aln1_outs = {}
    all_ctg_out = {}
    # Stage 2: align each contig's read set back to its reference with blasr.
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))
        # outputs
        wd = os.path.join(
            os.getcwd(),
            './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))
        #mkdir(wd)
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(blasr_dir,
                         'aln_{ctg_id}_done'.format(ctg_id=ctg_id)))
        parameters = {
            'job_uid': 'aln-' + ctg_id,
            'wd': blasr_dir,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_blasr_aln'],
        }
        make_blasr_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'read_fasta': read_fasta
            },
            outputs={
                'ctg_aln_out': ctg_aln_out,
                'job_done': job_done
            },
            parameters=parameters,
        )
        blasr_task = make_blasr_task(task_run_blasr)
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(blasr_task)
    wf.refreshTargets()  # block until all alignment jobs are done

    # Stage 3: phasing, under its own concurrency cap.
    wf.max_jobs = unzip_phasing_concurrent_jobs
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))
        # outputs
        wd = os.path.join(
            os.getcwd(),
            './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))
        phasing_dir = os.path.join(wd, 'phasing')
        job_done = makePypeLocalFile(
            os.path.join(phasing_dir,
                         'p_{ctg_id}_done'.format(ctg_id=ctg_id)))
        rid_to_phase_out = makePypeLocalFile(
            os.path.join(
                wd, 'rid_to_phase.{ctg_id}'.format(ctg_id=ctg_id)))  # TODO: ???
        # Collected here for the stage-4 gather task, although it is not
        # declared as an output of the phasing task itself.
        all_ctg_out['r2p.{ctg_id}'.format(
            ctg_id=ctg_id)] = rid_to_phase_out  # implicit output?
        parameters = {
            'job_uid': 'ha-' + ctg_id,
            'wd': wd,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_phasing'],
        }
        make_phasing_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'aln_bam': ctg_aln_out
            },
            outputs={'job_done': job_done},
            parameters=parameters,
        )
        phasing_task = make_phasing_task(task_phasing)
        wf.addTask(phasing_task)
    wf.refreshTargets()  # block until all phasing jobs are done

    # Stage 4: hasm.
    hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
    #mkdir(hasm_wd)
    rid_to_phase_all = makePypeLocalFile(
        os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
    task = PypeTask(
        inputs=all_ctg_out,
        outputs={'rid_to_phase_all': rid_to_phase_all},
    )(get_rid_to_phase_all)
    wf.addTask(task)

    # NOTE(review): this re-uses (and mutates) the `parameters` dict left
    # over from the last phasing-loop iteration, so that iteration's
    # job_uid/ctg_id/wd entries (except the two overwritten here) leak
    # into the hasm task — confirm this is intended.
    parameters['wd'] = hasm_wd
    parameters['sge_option'] = config['sge_hasm']
    job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
    make_hasm_task = PypeTask(
        inputs={'rid_to_phase_all': rid_to_phase_all},
        outputs={'job_done': job_done},
        parameters=parameters,
    )
    hasm_task = make_hasm_task(task_hasm)
    wf.addTask(hasm_task)

    wf.refreshTargets()
def main(argv=sys.argv):
    """Entry point for the quiver (consensus) stage.

    Reads an INI config file (path = argv[1]), flattens the options into a
    plain settings dict, then wires and runs the pype workflow:
      track_reads_h -> scatter quiver jobs -> per-contig quiver ->
      gather -> zcat/concatenate consensus outputs.

    Python 2 only (print-chevron syntax, ConfigParser module).
    Exits with status 1 if no config path was given.
    """
    global LOG
    LOG = support.setup_logger(None)

    # NOTE: checks sys.argv directly, not the `argv` parameter.
    if len(sys.argv) < 2:
        print >> sys.stderr, 'you need to provide a configuration file to specific a couple cluster running environment'
        sys.exit(1)
    config_fn = sys.argv[1]
    # Relative paths from the config are resolved against the config
    # file's own directory, not the CWD.
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    config = ConfigParser.ConfigParser()
    config.read(config_fn)

    # Each option below falls back to a hard-coded default when missing.
    job_type = 'SGE'
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    sge_track_reads = ' -pe smp 12 -q bigmem'
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = ' -pe smp 24 -q bigmem '
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = 'input_bam.fofn'
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)

    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip',
                                               'quiver_concurrent_jobs')

    # Re-bind `config` from the ConfigParser to a plain dict holding just
    # the settings the tasks consume.
    config = {
        'job_type': job_type,
        'sge_quiver': sge_quiver,
        'sge_track_reads': sge_track_reads,
        'input_bam_fofn': input_bam_fofn,
        'smrt_bin': smrt_bin
    }
    LOG.info('config={}'.format(pprint.pformat(config)))

    #support.job_type = 'SGE' #tmp hack until we have a configuration parser

    wf = PypeProcWatcherWorkflow(max_jobs=quiver_concurrent_jobs, )

    abscwd = os.path.abspath('.')
    parameters = {
        'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'),
        'config': config
    }
    hasm_done_plf = makePypeLocalFile(
        './3-unzip/1-hasm/hasm_done')  # by convention
    track_reads_h_done_plf = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_h_done'))
    make_track_reads_task = PypeTask(
        inputs={'hasm_done': hasm_done_plf},
        outputs={'job_done': track_reads_h_done_plf},
        parameters=parameters,
    )
    track_reads_task = make_track_reads_task(task_track_reads)
    #sge_track_reads = config['sge_track_reads']
    wf.addTask(track_reads_task)

    scattered_quiver_plf = makePypeLocalFile(
        '4-quiver/quiver_scatter/scattered.json')
    make_task = PypeTask(
        inputs={
            'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
            'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
            'track_reads_h_done': track_reads_h_done_plf,
        },
        outputs={
            'scattered_quiver_json': scattered_quiver_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_scatter_quiver))
    # Block: scattered.json must exist before create_quiver_jobs reads it.
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(
        scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')
    # Write "<fasta> <fastq>" pairs, one per line, for the gather task.
    with open(fn(gathered_p_ctg_plf), 'w') as ifs:
        for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
            ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
    with open(fn(gathered_h_ctg_plf), 'w') as ifs:
        for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
            ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    make_task = PypeTask(
        inputs=job_done_plfs,
        outputs={
            'job_done': gather_done_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
        inputs={
            'gathered_p_ctg': gathered_p_ctg_plf,
            'gathered_h_ctg': gathered_h_ctg_plf,
            'gather_done': gather_done_plf,
        },
        outputs={
            'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
            'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
            'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
            'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
            'job_done': zcat_done_plf,
        },
    )
    wf.addTask(make_task(task_cns_zcat))
    wf.refreshTargets()
def phasing(args):
    """Wire and run the per-contig phasing sub-workflow.

    Builds a linear chain of four PypeTasks under <base_dir>/<ctg_id>/:
      make_het_call -> generate_association_table ->
      get_phased_blocks -> get_phased_reads
    and runs them serially (max_jobs=1).

    args: argparse-style namespace providing .bam, .fasta, .ctg_id,
    .base_dir and .samtools.
    """
    bam_fn = args.bam
    fasta_fn = args.fasta
    ctg_id = args.ctg_id
    base_dir = args.base_dir
    samtools = args.samtools

    # Pull the (uppercased) reference sequence for just this contig;
    # the record id is the first whitespace-delimited token of the name.
    # NOTE(review): if ctg_id is absent from the fasta, ref_seq stays ""
    # and the het_call task is still scheduled — confirm intended.
    ref_seq = ""
    for r in FastaReader(fasta_fn):
        rid = r.name.split()[0]
        if rid != ctg_id:
            continue
        ref_seq = r.sequence.upper()

    # Serial execution: the tasks form a strict dependency chain anyway.
    wf = PypeProcWatcherWorkflow(
        max_jobs=1,
    )

    bam_file = makePypeLocalFile(bam_fn)
    vmap_file = makePypeLocalFile(
        os.path.join(base_dir, ctg_id, 'het_call', "variant_map"))
    vpos_file = makePypeLocalFile(
        os.path.join(base_dir, ctg_id, 'het_call', "variant_pos"))
    q_id_map_file = makePypeLocalFile(
        os.path.join(base_dir, ctg_id, 'het_call', "q_id_map"))

    parameters = {}
    parameters["ctg_id"] = ctg_id
    parameters["ref_seq"] = ref_seq
    parameters["base_dir"] = base_dir
    parameters["samtools"] = samtools

    # PypeTask(...) returns a decorator; applying it to the function
    # (defined elsewhere in this file) yields the concrete task.
    make_het_call_task = PypeTask(
        inputs = {"bam_file": bam_file},
        outputs = {"vmap_file": vmap_file,
                   "vpos_file": vpos_file,
                   "q_id_map_file": q_id_map_file},
        parameters = parameters,
    ) (make_het_call)
    wf.addTasks([make_het_call_task])

    atable_file = makePypeLocalFile(
        os.path.join(base_dir, ctg_id, 'g_atable', "atable"))
    parameters = {}
    parameters["ctg_id"] = ctg_id
    parameters["base_dir"] = base_dir
    generate_association_table_task = PypeTask(
        inputs = {"vmap_file": vmap_file},
        outputs = {"atable_file": atable_file},
        parameters = parameters,
    ) (generate_association_table)
    wf.addTasks([generate_association_table_task])

    phased_variant_file = makePypeLocalFile(
        os.path.join(base_dir, ctg_id, 'get_phased_blocks',
                     "phased_variants"))
    get_phased_blocks_task = PypeTask(
        inputs = {"vmap_file": vmap_file, "atable_file": atable_file},
        outputs = {"phased_variant_file": phased_variant_file},
    ) (get_phased_blocks)
    wf.addTasks([get_phased_blocks_task])

    phased_read_file = makePypeLocalFile(
        os.path.join(base_dir, ctg_id, "phased_reads"))
    get_phased_reads_task = PypeTask(
        inputs = {"vmap_file": vmap_file,
                  "q_id_map_file": q_id_map_file,
                  "phased_variant_file": phased_variant_file},
        outputs = {"phased_read_file": phased_read_file},
        parameters = {"ctg_id": ctg_id},
    ) (get_phased_reads)
    wf.addTasks([get_phased_reads_task])

    wf.refreshTargets()