if config["input_type"] == "raw": #### import sequences into daligner DB input_h5_fofn = makePypeLocalFile( os.path.abspath( config["input_fofn_fn"] ) ) rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) parameters = {"work_dir": rawread_dir, "config": config} make_buid_rdb_task = PypeTask(inputs = {"input_fofn": input_h5_fofn}, outputs = {"rdb_build_done": rdb_build_done}, parameters = parameters, TaskType = PypeThreadTaskBase) buid_rdb_task = make_buid_rdb_task(build_rdb) wf.addTasks([buid_rdb_task]) wf.refreshTargets([rdb_build_done]) db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" )) #### run daligner daligner_tasks, daligner_out = create_daligner_tasks( rawread_dir, "raw_reads", db_file, rdb_build_done, config) wf.addTasks(daligner_tasks) #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") ) @PypeTask( inputs = daligner_out, outputs = {"da_done":r_da_done}, TaskType = PypeThreadTaskBase,
outputs = {"fasta_dump_done": fasta_dump_done, "target_fa_fofn": target_fa_fofn, "query_fa_fofn": query_fa_fofn}, parameters = parameters, TaskType = PypeThreadTaskBase) def h5fofn_to_fasta(self): os.system("h5fofn_to_fasta.py %s %s --min_length 500 --min_seed_length %d --min_read_score %f" %\ (fn(self.input_fofn), self.parameters["fasta_dir"], self.parameters["min_length"], self.parameters["min_read_score"])) os.system("""find %s -name "*_t.fa" | sort > %s""" % (self.parameters["fasta_dir"], fn(self.target_fa_fofn))) os.system("""find %s -name "*_q.fa" | sort > %s""" % (self.parameters["fasta_dir"], fn(self.query_fa_fofn))) os.system("touch %s" % fn(self.fasta_dump_done)) wf.addTasks([h5fofn_to_fasta]) #we need to force the execution of the graph at this point to ensure generating correct downstream graph wf.refreshTargets([fasta_dump_done]) #### Task to split the fofn file into small chunks for parallel processing split_fofn_done = makePypeLocalFile(os.path.abspath( os.path.join( dist_map_dir, "split_fofn_done") ) ) @PypeTask(inputs = {"target_fa_fofn": target_fa_fofn, "query_fa_fofn": query_fa_fofn}, outputs = {"split_fofn_done": split_fofn_done}, parameters = {"config":config, "dist_map_dir": dist_map_dir}, TaskType = PypeThreadTaskBase) def split_fofn_task(self): query_chunk_size = self.parameters["config"]["q_chunk_size"] target_chunk_size = self.parameters["config"]["t_chunk_size"]
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Drive the full fc_run assembly pipeline from a config file.

    The pipeline is built as a pypeFLOW thread-workflow whose dependencies
    are file-based "done" targets, in three on-disk stages:
      * 0-rawreads:   raw-read DB build -> daligner -> merge -> consensus
      * 1-preads_ovl: pread DB build -> daligner -> merge -> DB2Falcon
      * 2-asm-falcon: final falcon assembly

    :param prog_name: program name; unused in this body.
    :param input_config_fn: path of the configuration file to parse.
    :param logger_config_fn: optional logging config, handed to
        support.setup_logger.

    Calls sys.exit(0) early when config["target"] is "overlapping" or
    "pre-assembly".
    """
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)
    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    # Old-style .cfg content is normalized into a plain dict.
    config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
    # Fixed stage-directory layout, created up front.
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)
    exitOnFailure = config['stop_all_jobs_on_failure'] # only matters for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    # Copy the input file-of-filenames into 0-rawreads via
    # task_make_fofn_abs_raw (defined elsewhere in this module).
    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf},
                                  outputs = {"o_fofn": rawread_fofn_plf},
                                  parameters = {},
                                  TaskType = PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        # NOTE(review): sleep_done is created but never used in this function.
        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") )
        run_jobs = makePypeLocalFile( os.path.join( rawread_dir, "run_jobs.sh") )
        parameters = {"work_dir": rawread_dir, "config": config}
        raw_reads_db = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
        # task_build_rdb produces the dazzler DB plus the HPCdaligner
        # run_jobs.sh script that downstream task-creation parses.
        make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf},
                                       outputs = {"rdb_build_done": rdb_build_done,
                                                  "raw_reads.db": raw_reads_db,  # note: output key contains a dot
                                                  "run_jobs": run_jobs},
                                       parameters = parameters,
                                       TaskType = PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)
        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        # Number of DB blocks decides how many daligner outputs to gather.
        raw_reads_nblock = support.get_nblock(fn(raw_reads_db))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config)
        wf.addTasks(daligner_tasks)
        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )
        parameters = {
                "nblock": raw_reads_nblock,
        }
        # Gather/barrier task: touches da_done once all daligner jobs finish.
        make_daligner_gather = PypeTask(
                    inputs = daligner_out,
                    outputs = {"da_done":r_da_done},
                    parameters = parameters,
                    TaskType = PypeThreadTaskBase,
                    URL = "task://localhost/rda_check" )
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks( merge_tasks )
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "overlapping":
            # Overlaps only; no consensus or assembly requested.
            sys.exit(0)

        # Error-correction (consensus) of raw reads into preads.
        consensus_tasks, consensus_out = create_consensus_tasks(rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks( consensus_tasks )

        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

        @PypeTask( inputs = consensus_out,
                   outputs = {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/cns_check" )
        def check_r_cns_task(self):
            # Collect all consensus FASTA outputs into the pread fofn
            # (sorted for a deterministic order), then mark the stage done.
            with open(fn(self.pread_fofn),  "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)

        # Consensus stage runs with its own concurrency budget.
        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

    if config["target"] == "pre-assembly":
        # Error-corrected reads were the goal; stop before assembly.
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        # Preads were supplied directly: normalize their fofn into 1-preads_ovl.
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf},
                                     outputs = {"o_fofn": pread_fofn},
                                     parameters = {},
                                     TaskType = PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") )
    parameters = {"work_dir": pread_dir, "config": config}
    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db')) # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(inputs = {"pread_fofn": pread_fofn },
                                    outputs = {"pdb_build_done": pdb_build_done,
                                               "preads_db": preads_db,
                                               "run_jobs": run_jobs},
                                    parameters = parameters,
                                    TaskType = PypeThreadTaskBase,
                                    URL = "task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    # Pread overlapping reuses the daligner machinery with pread SGE options.
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", pdb_build_done, config, pread_aln=True)
    wf.addTasks(daligner_tasks)
    p_da_done = makePypeLocalFile(os.path.join( pread_dir, "da_done"))
    parameters = {
            "nblock": preads_nblock,
    }
    make_daligner_gather = PypeTask(
                inputs = daligner_out,
                outputs = {"da_done":p_da_done},
                parameters = parameters,
                TaskType = PypeThreadTaskBase,
                URL = "task://localhost/pda_check" )
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks( merge_tasks )
    p_merge_done = makePypeLocalFile(os.path.join( pread_dir, "p_merge_done"))

    @PypeTask( inputs = merge_out,
               outputs = {"p_merge_done": p_merge_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pmerge_check" )
    def check_p_merge_check_task(self):
        # Barrier: all merge outputs exist; mark the merge stage done.
        system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)

    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Export the pread DB to preads4falcon.fasta (task_run_db2falcon).
    db2falcon_done = makePypeLocalFile( os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(
               inputs = {"p_merge_done": p_merge_done,},
               outputs = {"db2falcon_done": db2falcon_done},
               parameters = {"wd": pread_dir,
                             "config": config,
                            },
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/db2falcon" )
    wf.addTask(make_run_db2falcon(task_run_db2falcon))

    # Final assembly stage in 2-asm-falcon.
    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )
    make_run_falcon_asm = PypeTask(
               inputs = {"db2falcon_done": db2falcon_done, "db_file": preads_db},
               outputs = {"falcon_asm_done": falcon_asm_done},
               parameters = {"wd": falcon_asm_dir,
                             "config": config,
                             "pread_dir": pread_dir},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/falcon" )
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()
}, parameters=parameters, TaskType=PypeThreadTaskBase) def h5fofn_to_fasta(self): os.system("h5fofn_to_fasta.py %s %s --min_length 500 --min_seed_length %d --min_read_score %f" %\ (fn(self.input_fofn), self.parameters["fasta_dir"], self.parameters["min_length"], self.parameters["min_read_score"])) os.system("""find %s -name "*_t.fa" | sort > %s""" % (self.parameters["fasta_dir"], fn(self.target_fa_fofn))) os.system("""find %s -name "*_q.fa" | sort > %s""" % (self.parameters["fasta_dir"], fn(self.query_fa_fofn))) os.system("touch %s" % fn(self.fasta_dump_done)) wf.addTasks([h5fofn_to_fasta]) #we need to force the execution of the graph at this point to ensure generating correct downstream graph wf.refreshTargets([fasta_dump_done]) #### Task to split the fofn file into small chunks for parallel processing split_fofn_done = makePypeLocalFile( os.path.abspath(os.path.join(dist_map_dir, "split_fofn_done"))) @PypeTask(inputs={ "target_fa_fofn": target_fa_fofn, "query_fa_fofn": query_fa_fofn }, outputs={"split_fofn_done": split_fofn_done}, parameters={ "config": config, "dist_map_dir": dist_map_dir
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Drive the fc_run assembly pipeline (earlier revision).

    Differences from the newer driver visible in this file: tasks are
    gathered by inline `@PypeTask` check functions that `touch` done-files,
    and refreshes poll with `updateFreq = wait_time`.

    :param prog_name: program name; unused in this body.
    :param input_config_fn: path of the configuration file to parse.
    :param logger_config_fn: optional logging config for support.setup_logger.

    NOTE(review): `wait_time` is referenced but not defined in this
    function — presumably a module-level global; confirm elsewhere in the file.
    """
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)
    fc_run_logger.info( "fc_run started with configuration %s", input_config_fn )
    config = support.get_config(support.parse_config(input_config_fn))
    # Fixed stage-directory layout, created up front.
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    # Normalize the input fofn into 0-rawreads (task_make_fofn_abs_raw).
    input_fofn_plf = makePypeLocalFile(os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf},
                                  outputs = {"o_fofn": rawread_fofn_plf},
                                  parameters = {},
                                  TaskType = PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        # NOTE(review): sleep_done is created but never used in this function.
        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") )
        run_jobs = makePypeLocalFile( os.path.join( rawread_dir, "run_jobs.sh") )
        parameters = {"work_dir": rawread_dir, "config": config}
        make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf},
                                       outputs = {"rdb_build_done": rdb_build_done,
                                                  "run_jobs": run_jobs},
                                       parameters = parameters,
                                       TaskType = PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)
        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", db_file, rdb_build_done, config)
        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs
        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )

        @PypeTask( inputs = daligner_out,
                   outputs = {"da_done":r_da_done},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/rda_check" )
        def check_r_da_task(self):
            # Barrier: all daligner outputs exist; mark the stage done.
            os.system("touch %s" % fn(self.da_done))
        wf.addTask(check_r_da_task)
        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed

        # Consensus stage runs with its own concurrency budget.
        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks( merge_tasks )
        if config["target"] == "overlapping":
            # Overlaps only; run the merges, then stop.
            wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            sys.exit(0)
        wf.addTasks( consensus_tasks )

        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

        @PypeTask( inputs = consensus_out,
                   outputs = {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/cns_check" )
        def check_r_cns_task(self):
            # Collect consensus FASTA outputs into the pread fofn (sorted),
            # then mark the consensus stage done.
            with open(fn(self.pread_fofn),  "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            os.system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs

    if config["target"] == "pre-assembly":
        # Error-corrected reads were the goal; stop before assembly.
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        # Preads were supplied directly: normalize their fofn into 1-preads_ovl.
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf},
                                     outputs = {"o_fofn": pread_fofn},
                                     parameters = {},
                                     TaskType = PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") )
    parameters = {"work_dir": pread_dir, "config": config}
    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    make_build_pdb_task = PypeTask(inputs = { "pread_fofn": pread_fofn },
                                    outputs = { "pdb_build_done": pdb_build_done,
                                                "run_jobs": run_jobs},
                                    parameters = parameters,
                                    TaskType = PypeThreadTaskBase,
                                    URL = "task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" ))
    #### run daligner
    # Pread overlapping: own concurrency budget and pread SGE options.
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", db_file, pdb_build_done, config, pread_aln= True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs
    p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") )

    @PypeTask( inputs = daligner_out,
               outputs = {"da_done":p_da_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pda_check" )
    def check_p_da_task(self):
        # Barrier: all pread daligner outputs exist.
        os.system("touch %s" % fn(self.da_done))
    wf.addTask(check_p_da_task)

    # NOTE(review): consensus_tasks/consensus_out are returned but unused here.
    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks( merge_tasks )
    #wf.refreshTargets(updateFreq = 30) #all
    p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") )

    @PypeTask( inputs = merge_out,
               outputs = {"p_merge_done":p_merge_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pmerge_check" )
    def check_p_merge_check_task(self):
        # Barrier: all merge outputs exist; mark the merge stage done.
        os.system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq = wait_time) #all

    # Final assembly stage in 2-asm-falcon (task_run_falcon_asm).
    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )
    make_run_falcon_asm = PypeTask( inputs = {"p_merge_done": p_merge_done, "db_file":db_file},
                                    outputs = {"falcon_asm_done":falcon_asm_done},
                                    parameters = {"wd": falcon_asm_dir,
                                                  "config": config,
                                                  "pread_dir": pread_dir},
                                    TaskType = PypeThreadTaskBase,
                                    URL = "task://localhost/falcon" )
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets(updateFreq = wait_time) #all
ch.setFormatter(formatter) logger.addHandler(ch) inputs = {"input": makePypeLocalFile("/tmp/test1_input")} outputs = {"output": makePypeLocalFile("/tmp/test1_output")} os.system("touch /tmp/test1_input") @PypeTask(inputs = inputs, outputs = outputs, TaskType = PypeThreadTaskBase) def f(self): i = 0 while 1: time.sleep(0.1) if self.shutdown_event != None and self.shutdown_event.is_set(): break if i > 10: break i += 1 if self.shutdown_event == None or not self.shutdown_event.is_set(): os.system("touch %s" % fn(self.output)) wf = PypeThreadWorkflow() wf.addTasks([f]) wf.refreshTargets()
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Drive the fc_run assembly pipeline (old revision).

    This revision uses unqualified helpers (setup_logger, get_config,
    make_dirs, build_rdb, ...) and an inline run_falcon_asm_task that
    writes and submits a shell script for the final assembly.

    :param prog_name: program name; unused in this body.
    :param input_config_fn: path of the configuration file to parse.
    :param logger_config_fn: optional logging config for setup_logger.

    NOTE(review): `wait_time` and `fc_run_logger` are used but not defined
    in this function — presumably module-level globals; confirm elsewhere.
    """
    setup_logger(logger_config_fn)
    fc_run_logger.info( "fc_run started with configuration %s", input_config_fn )
    config = get_config(parse_config(input_config_fn))
    # Fixed stage-directory layout, created up front.
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        make_dirs(d)
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    # Normalize the input fofn into 0-rawreads (make_fofn_abs_raw).
    input_fofn_plf = makePypeLocalFile(os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf},
                                  outputs = {"o_fofn": rawread_fofn_plf},
                                  parameters = {},
                                  TaskType = PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        # NOTE(review): sleep_done is created but never used in this function.
        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") )
        parameters = {"work_dir": rawread_dir, "config": config}
        make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf},
                                       outputs = {"rdb_build_done": rdb_build_done},
                                       parameters = parameters,
                                       TaskType = PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(build_rdb)
        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks( rawread_dir, "raw_reads", db_file, rdb_build_done, config)
        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs
        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )

        @PypeTask( inputs = daligner_out,
                   outputs = {"da_done":r_da_done},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/rda_check" )
        def check_r_da_task(self):
            # Barrier: all daligner outputs exist; mark the stage done.
            os.system("touch %s" % fn(self.da_done))
        wf.addTask(check_r_da_task)
        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed

        # Consensus stage runs with its own concurrency budget.
        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( rawread_dir, "raw_reads", r_da_done, config )
        wf.addTasks( merge_tasks )
        if config["target"] == "overlapping":
            # Overlaps only; run the merges, then stop.
            wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            sys.exit(0)
        wf.addTasks( consensus_tasks )

        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

        @PypeTask( inputs = consensus_out,
                   outputs = {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/cns_check" )
        def check_r_cns_task(self):
            # Collect consensus FASTA outputs into the pread fofn (sorted),
            # then mark the consensus stage done.
            with open(fn(self.pread_fofn),  "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            os.system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs

    if config["target"] == "pre-assembly":
        # Error-corrected reads were the goal; stop before assembly.
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        # Preads were supplied directly: normalize their fofn into 1-preads_ovl.
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf},
                                     outputs = {"o_fofn": pread_fofn},
                                     parameters = {},
                                     TaskType = PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") )
    parameters = {"work_dir": pread_dir, "config": config}
    make_build_pdb_task = PypeTask( inputs = { "pread_fofn": pread_fofn },
                                    outputs = { "pdb_build_done": pdb_build_done },
                                    parameters = parameters,
                                    TaskType = PypeThreadTaskBase,
                                    URL = "task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" ))
    #### run daligner
    # Pread overlapping: own concurrency budget and pread SGE options.
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks( pread_dir, "preads", db_file, pdb_build_done, config, pread_aln= True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs
    p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") )

    @PypeTask( inputs = daligner_out,
               outputs = {"da_done":p_da_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pda_check" )
    def check_p_da_task(self):
        # Barrier: all pread daligner outputs exist.
        os.system("touch %s" % fn(self.da_done))
    wf.addTask(check_p_da_task)

    # NOTE(review): consensus_tasks/consensus_out are returned but unused here.
    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( pread_dir, "preads", p_da_done, config )
    wf.addTasks( merge_tasks )
    #wf.refreshTargets(updateFreq = 30) #all
    p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") )

    @PypeTask( inputs = merge_out,
               outputs = {"p_merge_done":p_merge_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pmerge_check" )
    def check_p_merge_check_task(self):
        # Barrier: all merge outputs exist; mark the merge stage done.
        os.system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq = wait_time) #all

    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )

    @PypeTask( inputs = {"p_merge_done": p_merge_done, "db_file":db_file},
               outputs = {"falcon_asm_done":falcon_asm_done},
               parameters = {"wd": falcon_asm_dir,
                             "config": config,
                             "pread_dir": pread_dir},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/falcon" )
    def run_falcon_asm_task(self):
        # Build a shell script that runs the final assembly pipeline
        # (DB2Falcon export, overlap filtering, graph construction, contig
        # output), submit it via run_script, and block on the done-file.
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join( wd )
        script_fn = os.path.join( script_dir ,"run_falcon_asm.sh" )
        script = []
        script.append( "set -vex" )
        # Leave a *.exit marker on any exit so the poller can detect failure.
        script.append( "trap 'touch %s.exit' EXIT" % fn(self.falcon_asm_done) )
        script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
        script.append( "cd %s" % pread_dir )
        # Write preads4falcon.fasta, in 1-preads_ovl:
        script.append( "DB2Falcon -U preads")
        script.append( "cd %s" % wd )
        script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        script.append( """fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\
                (fn(db_file), overlap_filtering_setting, length_cutoff_pr) )
        script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append( """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr) # TODO: drop this logfile
        # Write 'p_ctg.fa' and 'a_ctg.fa':
        script.append( """fc_graph_to_contig.py""" )
        script.append( """touch %s""" % fn(self.falcon_asm_done))
        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))
        job_data = make_job_data(self.URL, script_fn)
        job_data["sge_option"] = config["sge_option_fc"]
        run_script(job_data, job_type = config["job_type"])
        wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_data['job_name'])
    wf.addTask( run_falcon_asm_task )
    wf.refreshTargets(updateFreq = wait_time) #all
def main(*argv):
    """Drive the HGAP/falcon pipeline (oldest revision; HGAP.py style).

    :param argv: command-line arguments; argv[1] must be the config file
        path ("example: HGAP.py HGAP_run.cfg"). Exits with status 1 when
        no config file is given.

    NOTE(review): `wait_time` is referenced but not defined in this
    function — presumably a module-level global; confirm elsewhere.
    """
    setup_logger()
    if len(argv) < 2:
        print "you need to specify a configuration file"
        print "example: HGAP.py HGAP_run.cfg"
        sys.exit(1)

    # Fixed stage-directory layout, created up front (errors ignored:
    # the directories may already exist).
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        try:
            os.makedirs(d)
        except:
            pass

    config = get_config(argv[1])
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        input_h5_fofn = makePypeLocalFile( os.path.abspath( config["input_fofn_fn"] ) )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") )
        parameters = {"work_dir": rawread_dir, "config": config}
        # NOTE(review): "buid" (sic) is the original spelling of these names.
        make_buid_rdb_task = PypeTask(inputs = {"input_fofn": input_h5_fofn},
                                      outputs = {"rdb_build_done": rdb_build_done},
                                      parameters = parameters,
                                      TaskType = PypeThreadTaskBase)
        buid_rdb_task = make_buid_rdb_task(build_rdb)
        wf.addTasks([buid_rdb_task])
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks( rawread_dir, "raw_reads", db_file, rdb_build_done, config)
        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs
        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )

        @PypeTask( inputs = daligner_out,
                   outputs = {"da_done":r_da_done},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/rda_check" )
        def check_r_da_task(self):
            # Barrier: all daligner outputs exist; mark the stage done.
            os.system("touch %s" % fn(self.da_done))
        wf.addTask(check_r_da_task)
        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed

        # Consensus stage runs with its own concurrency budget.
        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( rawread_dir, "raw_reads", r_da_done, config )
        wf.addTasks( merge_tasks )
        if config["target"] == "overlapping":
            # Overlaps only; run the merges, then stop.
            wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            exit(0)
        wf.addTasks( consensus_tasks )

        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

        @PypeTask( inputs = consensus_out,
                   outputs = {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/cns_check" )
        def check_r_cns_task(self):
            # Collect consensus outputs (*.fa here, unlike later revisions)
            # into the pread fofn, then mark the consensus stage done.
            with open(fn(self.pread_fofn),  "w") as f:
                fn_list = glob.glob("%s/preads/out*.fa" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            os.system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs

    if config["target"] == "pre-assembly":
        exit(0)

    if config["input_type"] == "preads":
        # Preads supplied directly: copy their fofn into place once.
        if not os.path.exists( "%s/input_preads.fofn" % pread_dir):
            os.system( "cp %s %s/input_preads.fofn" % (os.path.abspath( config["input_fofn_fn"] ), pread_dir) )

    pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )
    rdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "rdb_build_done") )

    @PypeTask( inputs = { "pread_fofn": pread_fofn },
               outputs = { "rdb_build_done": rdb_build_done },
               parameters = {"config": config, "pread_dir": pread_dir},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/build_p_rdb")
    def build_p_rdb_task(self):
        # Normalize each pread FASTA (length filter, ACGT-only, renamed
        # headers, 80-column wrapping) and import it into a dazzler DB,
        # then DBsplit and generate run_jobs.sh via HPCdaligner.
        config = self.parameters["config"]
        pread_dir = self.parameters["pread_dir"]
        fa_serial = 0
        for fa_fn in open(fn(self.pread_fofn)).readlines():
            fa_fn = fa_fn.strip()
            c = 0
            fa_serial += 1
            with open("%s/preads_norm_%05d.fasta" % (pread_dir, fa_serial), "w") as p_norm:
                f = FastaReader(fa_fn)
                for r in f:
                    if len(r.sequence) < config["length_cutoff_pr"]:
                        continue
                    # NOTE(review): `name` is computed but never used below.
                    name = r.name
                    name = name.replace("_","")
                    # Skip reads containing any non-ACGT character.
                    ignore_read = False
                    for cc in r.sequence:
                        if cc not in ["A","C","G","T"]:
                            ignore_read = True
                            break
                    if ignore_read:
                        continue
                    print >> p_norm, ">prolog_%05d/%d/%d_%d" % (fa_serial, c, 0, len(r.sequence) )
                    # Wrap the sequence at 80 columns.
                    # NOTE(review): if len(r.sequence) < 80 the loop body never
                    # runs and `i` is unbound on the next line — latent bug,
                    # masked only when length_cutoff_pr >= 80; confirm.
                    for i in range(0, len(r.sequence)/80):
                        print >> p_norm, r.sequence[ i *80 : (i + 1) * 80]
                    print >> p_norm, r.sequence[(i+1)*80:]
                    c += 1
            os.system("cd %s; fasta2DB preads preads_norm_%05d.fasta" % (pread_dir, fa_serial) )
        os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"]))
        os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"]))
        os.system("cd %s; touch rdb_build_done" % pread_dir)
    wf.addTask(build_p_rdb_task)
    wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs

    db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" ))
    #### run daligner
    # Pread overlapping: own concurrency budget and pread SGE options.
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks( pread_dir, "preads", db_file, rdb_build_done, config, pread_aln= True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs
    p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") )

    @PypeTask( inputs = daligner_out,
               outputs = {"da_done":p_da_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pda_check" )
    def check_p_da_task(self):
        # Barrier: all pread daligner outputs exist.
        os.system("touch %s" % fn(self.da_done))
    wf.addTask(check_p_da_task)

    # NOTE(review): consensus_tasks/consensus_out are returned but unused here.
    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( pread_dir, "preads", p_da_done, config )
    wf.addTasks( merge_tasks )
    #wf.refreshTargets(updateFreq = 30) #all
    p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") )

    @PypeTask( inputs = merge_out,
               outputs = {"p_merge_done":p_merge_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pmerge_check" )
    def check_p_merge_check_task(self):
        # Barrier: all merge outputs exist; mark the merge stage done.
        os.system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq = wait_time) #all

    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )

    @PypeTask( inputs = {"p_merge_done": p_merge_done},
               outputs = {"falcon_asm_done":falcon_asm_done},
               parameters = {"wd": falcon_asm_dir,
                             "config": config,
                             "pread_dir": pread_dir},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/falcon" )
    def run_falcon_asm_task(self):
        # Build a shell script for the final assembly (DB2Falcon export,
        # overlap filtering, graph construction, contig output), submit it
        # to SGE via run_script, and block on the done-file.
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join( wd )
        script_fn = os.path.join( script_dir ,"run_falcon_asm.sh" )
        script = []
        script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
        script.append( "cd %s" % pread_dir )
        script.append( "DB2Falcon preads")
        script.append( "cd %s" % wd )
        script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        script.append( """fc_ovlp_filter.py --fofn las.fofn %s \
--n_core 24 --min_len %d > preads.ovl""" % (overlap_filtering_setting, length_cutoff_pr) )
        script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append( """fc_ovlp_to_graph.py preads.ovl > fc.log""" )
        script.append( """fc_graph_to_contig.py""" )
        script.append( """touch %s\n""" % fn(self.falcon_asm_done))
        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))
        # Unique job name so repeated submissions don't collide on SGE.
        job_name = self.URL.split("/")[-1]
        job_name += "-"+str(uuid.uuid1())[:8]
        job_data = {"job_name": job_name,
                    "cwd": wd,
                    "sge_option": config["sge_option_fc"],
                    "script_fn": script_fn }
        run_script(job_data, job_type = "SGE")
        wait_for_file( fn(self.falcon_asm_done), task=self, job_name=job_name )
    wf.addTask( run_falcon_asm_task )
    wf.refreshTargets(updateFreq = wait_time) #all
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Drive the FALCON assembly pipeline end to end.

    Stages wired here, in order:
      1. normalize the input FOFN (task_make_fofn_abs_raw),
      2. if input_type == "raw": build the raw-read daligner DB, run
         daligner, merge, and consensus to produce error-corrected preads,
      3. if input_type == "preads": import pre-corrected reads directly,
      4. build the pread DB, overlap the preads with daligner, merge,
         and run the final falcon assembly task.

    config["target"] of "overlapping" or "pre-assembly" stops early via
    sys.exit(0).

    :param prog_name: program name; not otherwise used in this body.
    :param input_config_fn: path of the config file handed to support.parse_config.
    :param logger_config_fn: optional logging config for support.setup_logger.
    """
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = support.get_dict_from_old_falcon_cfg(
        support.parse_config(input_config_fn))

    # Fixed working-directory layout, rooted at the current directory.
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    # Re-write the input FOFN into 0-rawreads with absolute paths.
    input_fofn_plf = makePypeLocalFile(
        os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        # NOTE(review): sleep_done is created but never used in this
        # function -- looks like dead code; confirm before removing.
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {"work_dir": rawread_dir, "config": config}
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={"rdb_build_done": rdb_build_done,
                                                "run_jobs": run_jobs},
                                       parameters=parameters,
                                       TaskType=PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)
        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", db_file, rdb_build_done, config)
        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs
        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        # Gather task: touches da_done once every daligner job has finished.
        @PypeTask(inputs=daligner_out,
                  outputs={"da_done": r_da_done},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/rda_check")
        def check_r_da_task(self):
            system("touch %s" % fn(self.da_done))
        wf.addTask(check_r_da_task)
        # NOTE(review): wait_time is not defined in this function --
        # presumably a module-level setting; confirm.
        wf.refreshTargets(updateFreq=wait_time)  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        if config["target"] == "overlapping":
            # Overlap-only run: finish the merge jobs, then stop the process.
            wf.refreshTargets(updateFreq=wait_time)  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            sys.exit(0)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        # Gather task: lists the consensus FASTA outputs into
        # input_preads.fofn for the next stage.
        @PypeTask(inputs=consensus_out,
                  outputs={"cns_done": r_cns_done,
                           "pread_fofn": pread_fofn},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq=wait_time)  # larger number better for more jobs
        if config["target"] == "pre-assembly":
            sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        # Pre-corrected input: normalize the user FOFN straight into 1-preads_ovl.
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}
    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={"pdb_build_done": pdb_build_done,
                                            "run_jobs": run_jobs},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads"))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    # The pread-overlap stage reuses the generic daligner SGE options.
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(
        fn(run_jobs), pread_dir, "preads", db_file, pdb_build_done, config,
        pread_aln=True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs
    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    # Gather task for the pread daligner jobs.
    @PypeTask(inputs=daligner_out,
              outputs={"da_done": p_da_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pda_check")
    def check_p_da_task(self):
        system("touch %s" % fn(self.da_done))
    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
        fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    #wf.refreshTargets(updateFreq = 30) #all
    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    # Gather task for the pread merge jobs.
    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq=wait_time)  #all

    # Final assembly: consumes the merged pread overlaps plus the preads DB.
    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(inputs={"p_merge_done": p_merge_done,
                                           "db_file": db_file},
                                   outputs={"falcon_asm_done": falcon_asm_done},
                                   parameters={"wd": falcon_asm_dir,
                                               "config": config,
                                               "pread_dir": pread_dir},
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/falcon")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets(updateFreq=wait_time)  #all
#### import sequences into daligner DB input_h5_fofn = makePypeLocalFile( os.path.abspath(config["input_fofn_fn"])) rdb_build_done = makePypeLocalFile( os.path.join(rawread_dir, "rdb_build_done")) parameters = {"work_dir": rawread_dir, "config": config} make_buid_rdb_task = PypeTask( inputs={"input_fofn": input_h5_fofn}, outputs={"rdb_build_done": rdb_build_done}, parameters=parameters, TaskType=PypeThreadTaskBase) buid_rdb_task = make_buid_rdb_task(build_rdb) wf.addTasks([buid_rdb_task]) wf.refreshTargets([rdb_build_done]) db_file = makePypeLocalFile( os.path.join(rawread_dir, "%s.db" % "raw_reads")) #### run daligner daligner_tasks, daligner_out = create_daligner_tasks( rawread_dir, "raw_reads", db_file, rdb_build_done, config) wf.addTasks(daligner_tasks) #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done")) @PypeTask(inputs=daligner_out, outputs={"da_done": r_da_done},
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Older driver for the FALCON assembly pipeline.

    Same overall flow as the refactored version: normalize the input FOFN,
    optionally build/correct raw reads (input_type == "raw"), then build
    the pread DB, overlap the preads, and run the final assembly script
    via an inline task.  "overlapping"/"pre-assembly" targets exit early.

    :param prog_name: program name; not otherwise used in this body.
    :param input_config_fn: config path handed to parse_config/get_config.
    :param logger_config_fn: optional logging configuration.
    """
    # NOTE(review): fc_run_logger is used below but never assigned here --
    # presumably setup_logger installs the module-level logger; confirm.
    setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = get_config(parse_config(input_config_fn))

    # Fixed working-directory layout, rooted at the current directory.
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        make_dirs(d)

    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    # Re-write the input FOFN into 0-rawreads with absolute paths.
    input_fofn_plf = makePypeLocalFile(
        os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        # NOTE(review): sleep_done is created but never used in this
        # function -- looks like dead code; confirm before removing.
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        parameters = {"work_dir": rawread_dir, "config": config}
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={"rdb_build_done": rdb_build_done},
                                       parameters=parameters,
                                       TaskType=PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(build_rdb)
        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            rawread_dir, "raw_reads", db_file, rdb_build_done, config)
        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs
        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        # Gather task: touches da_done once every daligner job has finished.
        @PypeTask(inputs=daligner_out,
                  outputs={"da_done": r_da_done},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/rda_check")
        def check_r_da_task(self):
            os.system("touch %s" % fn(self.da_done))
        wf.addTask(check_r_da_task)
        # NOTE(review): wait_time is not defined in this function --
        # presumably a module-level setting; confirm.
        wf.refreshTargets(updateFreq=wait_time)  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
            rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        if config["target"] == "overlapping":
            # Overlap-only run: finish the merge jobs, then stop the process.
            wf.refreshTargets(updateFreq=wait_time)  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            sys.exit(0)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        # Gather task: lists the consensus FASTA outputs into
        # input_preads.fofn for the next stage.
        @PypeTask(inputs=consensus_out,
                  outputs={"cns_done": r_cns_done,
                           "pread_fofn": pread_fofn},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            os.system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq=wait_time)  # larger number better for more jobs
        if config["target"] == "pre-assembly":
            sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        # Pre-corrected input: normalize the user FOFN straight into 1-preads_ovl.
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={"pdb_build_done": pdb_build_done},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads"))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    # The pread-overlap stage reuses the generic daligner SGE options.
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(
        pread_dir, "preads", db_file, pdb_build_done, config, pread_aln=True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs
    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    # Gather task for the pread daligner jobs.
    @PypeTask(inputs=daligner_out,
              outputs={"da_done": p_da_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pda_check")
    def check_p_da_task(self):
        os.system("touch %s" % fn(self.da_done))
    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
        pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    #wf.refreshTargets(updateFreq = 30) #all
    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    # Gather task for the pread merge jobs.
    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        os.system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq=wait_time)  #all

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))

    # Final assembly task: writes and submits run_falcon_asm.sh, then
    # blocks until its sentinel file appears.
    @PypeTask(inputs={"p_merge_done": p_merge_done,
                      "db_file": db_file},
              outputs={"falcon_asm_done": falcon_asm_done},
              parameters={"wd": falcon_asm_dir,
                          "config": config,
                          "pread_dir": pread_dir},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/falcon")
    def run_falcon_asm_task(self):
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join(wd)
        script_fn = os.path.join(script_dir, "run_falcon_asm.sh")
        # Assemble the shell script line by line.
        script = []
        script.append("set -vex")
        # Leave a marker file on any exit so failures are detectable.
        script.append("trap 'touch %s.exit' EXIT" % fn(self.falcon_asm_done))
        script.append("source {install_prefix}/bin/activate".format(
            install_prefix=install_prefix))
        script.append("cd %s" % pread_dir)
        # Write preads4falcon.fasta, in 1-preads_ovl:
        script.append("DB2Falcon -U preads")
        script.append("cd %s" % wd)
        script.append("""find %s/las_files -name "*.las" > las.fofn """ % pread_dir)
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        # NOTE: db_file here is captured from the enclosing main1 scope.
        script.append(
            """fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\
            (fn(db_file), overlap_filtering_setting, length_cutoff_pr))
        script.append("ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append(
            """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr)  # TODO: drop this logfile
        # Write 'p_ctg.fa' and 'a_ctg.fa':
        script.append("""fc_graph_to_contig.py""")
        script.append("""touch %s""" % fn(self.falcon_asm_done))
        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))
        # Unique job name so resubmissions do not collide.
        job_name = self.URL.split("/")[-1]
        job_name += "-" + str(uuid.uuid4())[:8]
        job_data = {"job_name": job_name,
                    "cwd": wd,
                    "sge_option": config["sge_option_fc"],
                    "script_fn": script_fn}
        run_script(job_data, job_type=config["job_type"])
        wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_name)
    wf.addTask(run_falcon_asm_task)
    wf.refreshTargets(updateFreq=wait_time)  #all
bam_file = makePypeLocalFile(bam_fn) vmap_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "variant_map") ) vpos_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "variant_pos") ) q_id_map_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "q_id_map") ) parameters = {} parameters["ctg_id"] = ctg_id parameters["ref_seq"] = ref_seq parameters["base_dir"] = base_dir make_het_call_task = PypeTask( inputs = { "bam_file": bam_file }, outputs = { "vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file }, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/het_call") (make_het_call) wf.addTasks([make_het_call_task]) atable_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "atable") ) parameters = {} parameters["ctg_id"] = ctg_id parameters["base_dir"] = base_dir generate_association_table_task = PypeTask( inputs = { "vmap_file": vmap_file }, outputs = { "atable_file": atable_file }, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/g_atable") (generate_association_table) wf.addTasks([generate_association_table_task])
def phasing(args):
    """Build and run the per-contig read-phasing workflow.

    Pulls the reference sequence for ``args.ctg_id`` out of ``args.fasta``
    (upper-cased; the last matching record wins, empty string if absent),
    then chains four single-threaded pypeFLOW tasks whose intermediate
    files live under ``args.base_dir/<ctg_id>/``:

        het_call -> association table -> phased blocks -> phased reads

    ``refreshTargets()`` at the end executes the whole chain.
    """
    bam_fn = args.bam
    fasta_fn = args.fasta
    ctg_id = args.ctg_id
    base_dir = args.base_dir

    # Scan the FASTA for the requested contig; no early break, so a
    # later duplicate record would overwrite an earlier one.
    ref_seq = ""
    for rec in FastaReader(fasta_fn):
        if rec.name.split()[0] != ctg_id:
            continue
        ref_seq = rec.sequence.upper()

    def ctg_file(leaf):
        # Every intermediate product sits in base_dir/<ctg_id>/<leaf>.
        return makePypeLocalFile(os.path.join(base_dir, ctg_id, leaf))

    # Serial execution: exactly one worker thread.
    PypeThreadWorkflow.setNumThreadAllowed(1, 1)
    workflow = PypeThreadWorkflow()

    bam_file = makePypeLocalFile(bam_fn)
    vmap_file = ctg_file("variant_map")
    vpos_file = ctg_file("variant_pos")
    q_id_map_file = ctg_file("q_id_map")

    # Stage 1: call heterozygous variants from the alignments.
    het_call_task = PypeTask(
        inputs={"bam_file": bam_file},
        outputs={"vmap_file": vmap_file,
                 "vpos_file": vpos_file,
                 "q_id_map_file": q_id_map_file},
        parameters={"ctg_id": ctg_id,
                    "ref_seq": ref_seq,
                    "base_dir": base_dir},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/het_call")(make_het_call)
    workflow.addTasks([het_call_task])

    # Stage 2: build the variant association table.
    atable_file = ctg_file("atable")
    atable_task = PypeTask(
        inputs={"vmap_file": vmap_file},
        outputs={"atable_file": atable_file},
        parameters={"ctg_id": ctg_id,
                    "base_dir": base_dir},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/g_atable")(generate_association_table)
    workflow.addTasks([atable_task])

    # Stage 3: derive phased variant blocks.
    phased_variant_file = ctg_file("phased_variants")
    phased_blocks_task = PypeTask(
        inputs={"vmap_file": vmap_file,
                "atable_file": atable_file},
        outputs={"phased_variant_file": phased_variant_file},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_phased_blocks")(get_phased_blocks)
    workflow.addTasks([phased_blocks_task])

    # Stage 4: assign reads to phases.
    phased_read_file = ctg_file("phased_reads")
    phased_reads_task = PypeTask(
        inputs={"vmap_file": vmap_file,
                "q_id_map_file": q_id_map_file,
                "phased_variant_file": phased_variant_file},
        outputs={"phased_read_file": phased_read_file},
        parameters={"ctg_id": ctg_id},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_phased_reads")(get_phased_reads)
    workflow.addTasks([phased_reads_task])

    workflow.refreshTargets()
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Drive the FALCON assembly pipeline (version with failure control).

    Differences from older drivers visible in this body: config parsing is
    wrapped so failures are logged before re-raising; parallel stages honor
    config['stop_all_jobs_on_failure'] via refreshTargets(exitOnFailure=...);
    daligner fan-out is sized by the DB block count (support.get_nblock);
    and a pre-assembly stats report plus a dedicated DB2Falcon task are
    scheduled before the final assembly.

    :param prog_name: program name; not otherwise used in this body.
    :param input_config_fn: config path handed to support.parse_config.
    :param logger_config_fn: optional logging config for support.setup_logger.
    """
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    try:
        config = support.get_dict_from_old_falcon_cfg(
            support.parse_config(input_config_fn))
    except Exception:
        # Log the offending config path before propagating.
        fc_run_logger.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise

    # Fixed working-directory layout, rooted at the current directory.
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config['stop_all_jobs_on_failure']  # only matter for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    # Re-write the input FOFN into 0-rawreads with absolute paths.
    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        # NOTE(review): sleep_done is created but never used in this
        # function -- looks like dead code; confirm before removing.
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {"work_dir": rawread_dir, "config": config}
        raw_reads_db_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={"rdb_build_done": rdb_build_done,
                                                "raw_reads_db": raw_reads_db_plf,
                                                "run_jobs": run_jobs,
                                                },
                                       parameters=parameters,
                                       TaskType=PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)
        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        # Number of DB blocks drives how many daligner jobs are created.
        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done,
            nblock=raw_reads_nblock, config=config)
        wf.addTasks(daligner_tasks)
        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        # Gather task: touches da_done once every daligner job has finished.
        parameters = {
            "nblock": raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
            inputs=daligner_out,
            outputs={"da_done": r_da_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/rda_check")
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "overlapping":
            # Overlap-only run: stop after the merges have completed.
            sys.exit(0)
        consensus_tasks, consensus_out = create_consensus_tasks(
            rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        # Gather task: lists the consensus FASTA outputs into
        # input_preads.fofn for the next stage.
        @PypeTask(inputs=consensus_out,
                  outputs={"cns_done": r_cns_done,
                           "pread_fofn": pread_fofn},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)

        # Pre-assembly statistics report.
        length_cutoff_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "length_cutoff"))
        pre_assembly_report_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "pre_assembly_stats.json"))  #tho technically it needs pread_fofn
        make_task = PypeTask(
            inputs={"length_cutoff_fn": length_cutoff_plf,
                    "raw_reads_db": raw_reads_db_plf,
                    "preads_fofn": pread_fofn, },
            outputs={"pre_assembly_report": pre_assembly_report_plf, },
            parameters=config,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/report_pre_assembly")
        task = make_task(task_report_pre_assembly)
        wf.addTask(task)

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        if config["target"] == "pre-assembly":
            sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        # Pre-corrected input: normalize the user FOFN straight into 1-preads_ovl.
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}
    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(
        os.path.join(pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={"pdb_build_done": pdb_build_done,
                                            "preads_db": preads_db,
                                            "run_jobs": run_jobs},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    # The pread-overlap stage reuses the generic daligner SGE options.
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(
        fn(run_jobs), pread_dir, "preads", pdb_build_done,
        nblock=preads_nblock, config=config, pread_aln=True)
    wf.addTasks(daligner_tasks)
    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    # Gather task for the pread daligner jobs.
    parameters = {
        "nblock": preads_nblock,
    }
    make_daligner_gather = PypeTask(
        inputs=daligner_out,
        outputs={"da_done": p_da_done},
        parameters=parameters,
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/pda_check")
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, merge_out, _ = create_merge_tasks(
        fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    # Gather task for the pread merge jobs.
    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)

    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Export the pread DB to FASTA for the assembler (DB2Falcon).
    db2falcon_done = makePypeLocalFile(os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(inputs={"p_merge_done": p_merge_done, },
                                  outputs={"db2falcon_done": db2falcon_done},
                                  parameters={"wd": pread_dir,
                                              "config": config,
                                              },
                                  TaskType=PypeThreadTaskBase,
                                  URL="task://localhost/db2falcon")
    wf.addTask(make_run_db2falcon(task_run_db2falcon))

    # Final assembly: consumes the exported preads plus the preads DB.
    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(inputs={"db2falcon_done": db2falcon_done,
                                           "db_file": preads_db},
                                   outputs={"falcon_asm_done": falcon_asm_done},
                                   parameters={"wd": falcon_asm_dir,
                                               "config": config,
                                               "pread_dir": pread_dir},
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/falcon")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()
parameters["ctg_id"] = ctg_id parameters["ref_seq"] = ref_seq parameters["base_dir"] = base_dir make_het_call_task = PypeTask( inputs={"bam_file": bam_file}, outputs={ "vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file }, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/het_call")(make_het_call) wf.addTasks([make_het_call_task]) atable_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, "atable")) parameters = {} parameters["ctg_id"] = ctg_id parameters["base_dir"] = base_dir generate_association_table_task = PypeTask( inputs={"vmap_file": vmap_file}, outputs={"atable_file": atable_file}, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/g_atable")(generate_association_table) wf.addTasks([generate_association_table_task]) phased_variant_file = makePypeLocalFile(
def main(*argv):
    """Drive the full HGAP/FALCON pipeline.

    argv[1] must be the path to a run configuration file. Builds a
    PypeThreadWorkflow that (depending on config["input_type"]) imports raw
    reads, runs daligner + consensus to produce preads, then overlaps the
    preads and runs the final FALCON string-graph assembly. All heavy work is
    delegated to shell commands / SGE jobs; this function only wires the DAG.
    Exits early via exit(0) when config["target"] is "overlapping" or
    "pre-assembly".
    """
    setup_logger()
    if len(argv) < 2:
        print "you need to specify a configuration file"
        print "example: HGAP.py HGAP_run.cfg"
        sys.exit(1)

    # Canonical working directories for each pipeline stage.
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        try:
            os.makedirs(d)
        # NOTE(review): bare except silently swallows every failure, not just
        # "directory already exists" — should be narrowed to OSError.
        except:
            pass

    config = get_config(argv[1])
    # Pre-assembly stage concurrency.
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        input_h5_fofn = makePypeLocalFile( os.path.abspath(config["input_fofn_fn"]))
        rdb_build_done = makePypeLocalFile( os.path.join(rawread_dir, "rdb_build_done"))
        parameters = {"work_dir": rawread_dir, "config": config}
        make_buid_rdb_task = PypeTask(  # (sic: "buid" — kept for consistency)
            inputs={"input_fofn": input_h5_fofn},
            outputs={"rdb_build_done": rdb_build_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase)
        buid_rdb_task = make_buid_rdb_task(build_rdb)
        wf.addTasks([buid_rdb_task])
        # Force DB build now so downstream task generation sees a real DB.
        wf.refreshTargets([rdb_build_done])
        db_file = makePypeLocalFile( os.path.join(rawread_dir, "%s.db" % "raw_reads"))

        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            rawread_dir, "raw_reads", db_file, rdb_build_done, config)
        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs
        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        @PypeTask(inputs=daligner_out,
                  outputs={"da_done": r_da_done},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/rda_check")
        def check_r_da_task(self):
            # Gather sentinel: all daligner outputs exist once this fires.
            os.system("touch %s" % fn(self.da_done))
        wf.addTask(check_r_da_task)
        wf.refreshTargets(
            updateFreq=wait_time
        )  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed

        # Consensus stage uses its own concurrency budget.
        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
            rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        if config["target"] == "overlapping":
            # Overlap-only run: execute merges and stop.
            wf.refreshTargets(
                updateFreq=wait_time
            )  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            exit(0)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile( os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={
                      "cns_done": r_cns_done,
                      "pread_fofn": pread_fofn
                  },
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            # Collect all consensus FASTA outputs (sorted for determinism)
            # into the pread file-of-filenames consumed by the next stage.
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fa" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            os.system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)
        wf.refreshTargets(
            updateFreq=wait_time)  # larger number better for more jobs

        if config["target"] == "pre-assembly":
            exit(0)

    if config["input_type"] == "preads":
        # User supplied preads directly: copy their fofn into place once.
        if not os.path.exists("%s/input_preads.fofn" % pread_dir):
            os.system("cp %s %s/input_preads.fofn" %
                      (os.path.abspath(config["input_fofn_fn"]), pread_dir))
        pread_fofn = makePypeLocalFile( os.path.join(pread_dir, "input_preads.fofn"))

    rdb_build_done = makePypeLocalFile( os.path.join(pread_dir, "rdb_build_done"))

    @PypeTask(inputs={"pread_fofn": pread_fofn},
              outputs={"rdb_build_done": rdb_build_done},
              parameters={
                  "config": config,
                  "pread_dir": pread_dir
              },
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/build_p_rdb")
    def build_p_rdb_task(self):
        # Normalize every pread FASTA (length filter, ACGT-only, fixed-width
        # 80-column wrapping, synthetic DAZZ_DB-style names) and import each
        # normalized file into the "preads" DAZZ_DB, then split it and emit
        # the daligner job script.
        config = self.parameters["config"]
        pread_dir = self.parameters["pread_dir"]
        fa_serial = 0
        # NOTE(review): file handle from open() is never closed explicitly.
        for fa_fn in open(fn(self.pread_fofn)).readlines():
            fa_fn = fa_fn.strip()
            c = 0
            fa_serial += 1
            with open("%s/preads_norm_%05d.fasta" % (pread_dir, fa_serial), "w") as p_norm:
                f = FastaReader(fa_fn)
                for r in f:
                    if len(r.sequence) < config["length_cutoff_pr"]:
                        continue
                    name = r.name
                    name = name.replace("_", "")
                    # Drop any read containing a non-ACGT character.
                    ignore_read = False
                    for cc in r.sequence:
                        if cc not in ["A", "C", "G", "T"]:
                            ignore_read = True
                            break
                    if ignore_read:
                        continue
                    print >> p_norm, ">prolog_%05d/%d/%d_%d" % (
                        fa_serial, c, 0, len(r.sequence))
                    # Python-2 integer division: full 80-char rows, then tail.
                    for i in range(0, len(r.sequence) / 80):
                        print >> p_norm, r.sequence[i * 80:(i + 1) * 80]
                    # NOTE(review): uses `i` from the loop above — if the
                    # sequence is shorter than 80 the range is empty and `i`
                    # is unbound (NameError); presumably length_cutoff_pr
                    # guarantees >= 80 — confirm.
                    print >> p_norm, r.sequence[(i + 1) * 80:]
                    c += 1
            os.system("cd %s; fasta2DB preads preads_norm_%05d.fasta" %
                      (pread_dir, fa_serial))
        os.system("cd %s; DBsplit %s preads" %
                  (pread_dir, config["ovlp_DBsplit_option"]))
        os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" %
                  (pread_dir, config["ovlp_HPCdaligner_option"]))
        os.system("cd %s; touch rdb_build_done" % pread_dir)
    wf.addTask(build_p_rdb_task)
    wf.refreshTargets(
        updateFreq=wait_time)  # larger number better for more jobs

    db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads"))
    #### run daligner
    # Pread overlapping: its own concurrency budget and SGE queue options.
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(pread_dir,
                                                         "preads",
                                                         db_file,
                                                         rdb_build_done,
                                                         config,
                                                         pread_aln=True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs
    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    @PypeTask(inputs=daligner_out,
              outputs={"da_done": p_da_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pda_check")
    def check_p_da_task(self):
        # Gather sentinel for the pread daligner jobs.
        os.system("touch %s" % fn(self.da_done))
    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
        pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    #wf.refreshTargets(updateFreq = 30) #all
    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        # Gather sentinel for the pread merge jobs.
        os.system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq=wait_time)  #all

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))

    @PypeTask(inputs={"p_merge_done": p_merge_done},
              outputs={"falcon_asm_done": falcon_asm_done},
              parameters={
                  "wd": falcon_asm_dir,
                  "config": config,
                  "pread_dir": pread_dir
              },
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/falcon")
    def run_falcon_asm_task(self):
        # Write a shell script that runs overlap filtering and the FALCON
        # string-graph assembly, submit it to SGE, and block until its
        # done-file appears.
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join(wd)
        script_fn = os.path.join(script_dir, "run_falcon_asm.sh")
        script = []
        script.append("source {install_prefix}/bin/activate".format(
            install_prefix=install_prefix))
        script.append("cd %s" % pread_dir)
        script.append("DB2Falcon preads")
        script.append("cd %s" % wd)
        script.append("""find %s/las_files -name "*.las" > las.fofn """ % pread_dir)
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        # NOTE(review): --n_core is hard-coded to 24 here.
        script.append("""fc_ovlp_filter.py --fofn las.fofn %s \
--n_core 24 --min_len %d > preads.ovl""" % (overlap_filtering_setting, length_cutoff_pr))
        script.append("ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append("""fc_ovlp_to_graph.py preads.ovl > fc.log""")
        script.append("""fc_graph_to_contig.py""")
        script.append("""touch %s\n""" % fn(self.falcon_asm_done))
        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))
        # Unique job name so reruns don't collide in the scheduler.
        job_name = self.URL.split("/")[-1]
        job_name += "-" + str(uuid.uuid1())[:8]
        job_data = {
            "job_name": job_name,
            "cwd": wd,
            "sge_option": config["sge_option_fc"],
            "script_fn": script_fn
        }
        run_script(job_data, job_type="SGE")
        wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_name)
    wf.addTask(run_falcon_asm_task)
    wf.refreshTargets(updateFreq=wait_time)  #all