def create_daligner_tasks(run_jobs_fn, wd, db_prefix, rdb_build_done, nblock, config, pread_aln=False):
    """Build one daligner PypeTask per script emitted by bash.scripts_daligner.

    Returns (tasks, tasks_out) where tasks_out maps 'ajob_<uid>' to the
    job-done PypeLocalFile of the corresponding task.
    """
    skip_checks = config.get('skip_checks')
    fc_run_logger.info('Skip LAcheck after daligner? {}'.format(skip_checks))
    task_list = []
    done_by_key = {}
    script_iter = bash.scripts_daligner(
        run_jobs_fn, db_prefix, rdb_build_done, nblock, pread_aln, skip_checks)
    for uid, script in script_iter:
        job_dir = os.path.join(wd, "job_%s" % uid)
        done_plf = makePypeLocalFile(
            os.path.abspath(os.path.join(job_dir, "job_%s_done" % uid)))
        task_params = {
            "daligner_script": script,
            "cwd": job_dir,
            "job_uid": uid,
            "config": config,
            "sge_option": config["sge_option_da"],
            "db_prefix": db_prefix,
        }
        factory = PypeTask(
            inputs={"rdb_build_done": rdb_build_done},
            outputs={"job_done": done_plf},
            parameters=task_params,
            TaskType=MyFakePypeThreadTaskBase,
            URL="task://localhost/d_%s_%s" % (uid, db_prefix))
        task_list.append(factory(task_run_daligner))
        done_by_key["ajob_%s" % uid] = done_plf
    return task_list, done_by_key
def create_consensus_tasks(basedir, scatter_fn):
    """Instantiate consensus tasks from a scatter-file (JSON array of task descriptions).

    Returns (tasks, consensus_out) where consensus_out maps 'cjob_<id>' to
    the task's out_file.
    """
    tasks = []
    out_by_job = {}
    with open(scatter_fn) as sf:
        sections = json.loads(sf.read())  # array of descriptions
    for desc in sections:
        params = desc['parameters']
        task_inputs = desc['inputs']
        task_inputs['scatter_fn'] = scatter_fn  # each task re-reads its description
        task_outputs = desc['outputs']
        p_id = int(params['job_id'])
        cns_label = 'cns_%05d' % p_id
        factory = PypeTask(
            inputs=task_inputs,
            outputs=task_outputs,
            parameters=params,
            TaskType=MyFakePypeThreadTaskBase,
            URL=desc['URL'],
            wdir=os.path.join(basedir, 'preads', cns_label),
        )
        tasks.append(factory(pype_tasks.task_run_consensus))
        out_by_job['cjob_%d' % p_id] = task_outputs['out_file']
    return tasks, out_by_job
def create_daligner_tasks(run_jobs_fn, wd, db_prefix, rdb_build_done, config, pread_aln=False):
    """Create a daligner PypeTask for every script from bash.scripts_daligner.

    Returns (tasks, tasks_out); tasks_out maps 'ajob_<uid>' to the job-done
    PypeLocalFile for each task.
    """
    task_list = []
    done_by_key = {}
    for uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, rdb_build_done, pread_aln):
        job_dir = os.path.join(wd, "job_%s" % uid)
        done_plf = makePypeLocalFile(
            os.path.abspath(os.path.join(job_dir, "job_%s_done" % uid)))
        task_params = {
            "daligner_script": script,
            "cwd": job_dir,
            "job_uid": uid,
            "config": config,
            "db_prefix": db_prefix,
        }
        factory = PypeTask(
            inputs={"rdb_build_done": rdb_build_done},
            outputs={"job_done": done_plf},
            parameters=task_params,
            TaskType=MyFakePypeThreadTaskBase,
            URL="task://localhost/d_%s_%s" % (uid, db_prefix))
        task_list.append(factory(task_run_daligner))
        done_by_key["ajob_%s" % uid] = done_plf
    return task_list, done_by_key
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
    """Create an LAmerge PypeTask per merge script from bash.scripts_merge.

    Returns (merge_tasks, merge_out, p_ids_merge_job_done); the last item is
    a list of (p_id, job_done) pairs consumed by the consensus stage.
    """
    tasks = []
    done_by_key = {}
    id_done_pairs = []  # for consensus
    for block_id, script in bash.scripts_merge(config, db_prefix, run_jobs_fn):
        done_plf = makePypeLocalFile(
            os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, block_id, block_id)))
        task_params = {
            "merge_script": script,
            "cwd": os.path.join(wd, "m_%05d" % block_id),
            "job_id": block_id,
            "config": config,
        }
        factory = PypeTask(
            inputs={"input_dep": input_dep},
            outputs={"job_done": done_plf},
            parameters=task_params,
            TaskType=MyFakePypeThreadTaskBase,
            URL="task://localhost/m_%05d_%s" % (block_id, db_prefix))
        tasks.append(factory(task_run_las_merge))
        done_by_key["mjob_%d" % block_id] = done_plf
        id_done_pairs.append((block_id, done_plf))
    return tasks, done_by_key, id_done_pairs
def create_consensus_tasks(wd, db_prefix, config, p_ids_merge_job_done):
    """Create one consensus PypeTask per merged block.

    Unlike the merge tasks, consensus occurs in a single directory.
    Returns (tasks, consensus_out); consensus_out maps 'cjob_<id>' to the
    task's done-file.
    """
    preads_dir = os.path.join(wd, 'preads')
    mkdir(preads_dir)
    tasks = []
    done_by_key = {}
    for block_id, merge_done in p_ids_merge_job_done:
        fasta_plf = makePypeLocalFile(
            os.path.abspath("%s/preads/out.%05d.fasta" % (wd, block_id)))
        done_plf = makePypeLocalFile(
            os.path.abspath("%s/preads/c_%05d_done" % (wd, block_id)))
        task_params = {
            "cwd": preads_dir,
            "job_id": block_id,
            "prefix": db_prefix,
            "config": config,
        }
        factory = PypeTask(
            inputs={"job_done": merge_done},
            outputs={"out_file": fasta_plf, "out_done": done_plf},
            parameters=task_params,
            TaskType=MyFakePypeThreadTaskBase,
            URL="task://localhost/ct_%05d" % block_id)
        tasks.append(factory(task_run_consensus))
        done_by_key["cjob_%d" % block_id] = done_plf
    return tasks, done_by_key
def create_merge_tasks(basedir, scatter_fn):
    """Create LAmerge tasks from a scatter-file (JSON array of task descriptions).

    Returns (tasks, p_ids_merged_las) where p_ids_merged_las maps the integer
    block id to the merged-.las PypeLocalFile of the corresponding task.
    """
    tasks = []
    p_ids_merged_las = {}  # for consensus
    with open(scatter_fn) as f:
        content = json.loads(f.read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        # Cast like the sibling scatter-based create_consensus_tasks does:
        # JSON may deliver job_id as a string, and 'm_%05d' requires an int.
        p_id = int(parameters['job_id'])
        wdir = os.path.join(basedir, 'm_%05d' % p_id)
        make_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            TaskType=MyFakePypeThreadTaskBase,
            URL=URL,
            wdir=wdir,
        )
        task = make_task(pype_tasks.task_run_las_merge)
        tasks.append(task)
        # These paths are relative, so we need the PypeLocalFiles themselves.
        p_ids_merged_las[p_id] = task.outputs['merged_las']
    return tasks, p_ids_merged_las
def create_daligner_tasks(basedir, scatter_fn):
    """Instantiate daligner tasks described by a scatter-file (JSON list).

    Returns (tasks, tasks_out); tasks_out maps 'ajob_<uid>' to each task's
    job_done PypeLocalFile.
    """
    task_list = []
    done_by_key = {}
    with open(scatter_fn) as sf:
        sections = json.loads(sf.read())  # array of descriptions
    for desc in sections:
        params = desc['parameters']
        task_inputs = desc['inputs']
        task_inputs['scatter_fn'] = scatter_fn
        uid = params['job_uid']
        factory = PypeTask(
            inputs=task_inputs,
            outputs=desc['outputs'],
            parameters=params,
            TaskType=MyFakePypeThreadTaskBase,
            URL=desc['URL'],
            wdir=os.path.join(basedir, 'job_%s' % uid),
        )
        task = factory(pype_tasks.task_run_daligner)
        task_list.append(task)
        # These are relative, so we need the PypeLocalFiles from task.outputs.
        done_by_key['ajob_%s' % uid] = task.outputs['job_done']
    return task_list, done_by_key
def create_consensus_tasks(wd, db_prefix, config, p_ids_merge_job_done):
    """Create per-block consensus tasks plus a final gather/check task.

    Each block gets its own working directory under <wd>/preads/cns_<id>.
    Returns (consensus_tasks, preads_fofn_plf); the FOFN is produced by the
    trailing cns_check task.
    """
    tasks = []
    out_by_job = {}
    fasta_plfs = []
    for block_id, merge_done in p_ids_merge_job_done:
        cns_label = 'cns_%05d' % block_id
        cns_dir = os.path.join(wd, 'preads', cns_label)
        mkdir(cns_dir)
        done_plf = makePypeLocalFile(
            os.path.abspath("%s/%s_done" % (cns_dir, cns_label)))
        fasta_plf = makePypeLocalFile(
            os.path.abspath("%s/%s.fasta" % (cns_dir, cns_label)))
        fasta_plfs.append(fasta_plf)
        task_params = {
            "cwd": cns_dir,
            "job_id": block_id,
            "prefix": db_prefix,
            "sge_option": config["sge_option_cns"],
            "config": config,
        }
        factory = PypeTask(
            inputs={"job_done": merge_done},
            outputs={"out_file": fasta_plf, "out_done": done_plf},
            parameters=task_params,
            TaskType=MyFakePypeThreadTaskBase,
            URL="task://localhost/%s" % cns_label)
        tasks.append(factory(task_run_consensus))
        # Downstream gather wants the fasta, not the done-flag.
        out_by_job["cjob_%d" % block_id] = fasta_plf
    # Final check task: waits on every consensus output, then writes the FOFN.
    cns_done_plf = makePypeLocalFile(os.path.join(wd, 'preads', "cns_done"))
    preads_fofn_plf = makePypeLocalFile(
        os.path.join(wd, 'preads', "input_preads.fofn"))
    check_factory = PypeTask(
        inputs=out_by_job,
        outputs={"cns_done": cns_done_plf, "preads_fofn": preads_fofn_plf},
        TaskType=MyFakePypeThreadTaskBase,
        URL="task://localhost/cns_check")
    tasks.append(check_factory(check_r_cns_task))
    return tasks, preads_fofn_plf
def create_daligner_tasks(wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    """Legacy daligner-task builder: parses run_jobs.sh directly (Python 2).

    Reads the block count from <db_prefix>.db, pre-creates the m_* merge
    dirs, then creates one PypeTask per 'daligner' line in run_jobs.sh.
    Returns (tasks, tasks_out); tasks_out maps 'ajob_<uid>' to the done-file.
    """
    import hashlib
    tasks = []
    tasks_out = {}
    nblock = 1
    db_path = os.path.join(wd, "%s.db" % db_prefix)
    if os.path.exists(db_path):
        with open(db_path) as f:
            for l in f:
                l = l.strip().split()
                # Guard against blank lines; 'blocks = N' gives the block count.
                if len(l) >= 3 and l[0] == "blocks" and l[1] == "=":
                    nblock = int(l[2])
                    break
    # Pre-create merge directories for every block.
    for pid in xrange(1, nblock + 1):
        try:
            os.makedirs("%s/m_%05d" % (wd, pid))
        except OSError:
            pass
    with open(os.path.join(wd, "run_jobs.sh")) as f:
        for l in f:
            l = l.strip()
            # Stable 8-char uid derived from the script line itself.
            job_uid = hashlib.md5(l).hexdigest()[:8]
            l = l.split()
            if l and l[0] == "daligner":
                try:
                    os.makedirs(os.path.join(wd, "./job_%s" % job_uid))
                except OSError:
                    pass
                # Symlink the DB files into the per-job directory.
                # (Fixed a garbled 'db_pr efix' identifier here.)
                os.system(
                    "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ."
                    % (wd, job_uid, db_prefix, db_prefix, db_prefix))
                job_done = makePypeLocalFile(os.path.abspath(
                    "%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)))
                if pread_aln:
                    l[0] = "daligner_p"
                parameters = {"daligner_cmd": " ".join(l),
                              "cwd": os.path.join(wd, "job_%s" % job_uid),
                              "job_uid": job_uid,
                              "config": config,
                              "nblock": nblock,
                              "db_prefix": db_prefix}
                make_daligner_task = PypeTask(
                    inputs={"rdb_build_done": rdb_build_done},
                    outputs={"job_done": job_done},
                    parameters=parameters,
                    TaskType=PypeThreadTaskBase,
                    URL="task://localhost/d_%s_%s" % (job_uid, db_prefix))
                daligner_task = make_daligner_task(run_daligner)
                tasks.append(daligner_task)
                tasks_out["ajob_%s" % job_uid] = job_done
    return tasks, tasks_out
def create_consensus_gather_task(wd, inputs):
    """Gather all consensus outputs into a single preads FOFN.

    Happens only in stage-0. Returns (task, preads_fofn_plf).
    """
    fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))
    factory = PypeTask(
        inputs=inputs,  # consensus_out
        outputs={'preads_fofn': fofn_plf},
        TaskType=MyFakePypeThreadTaskBase,
        URL='task://localhost/cns_gather')
    gather_task = factory(pype_tasks.task_cns_gather)
    return gather_task, fofn_plf
def main():
    """Demo driver: configure logging, then run a tiny two-task workflow.

    taskrun0 produces mytmp/f0; taskrun1 consumes f0 and produces mytmp/f1.
    Behavior is tunable via env (JOB_TYPE, SLEEP_S) read at module level.
    """
    lfn = 'logging-cfg.json'
    if os.path.exists(lfn):
        # User supplied a full dictConfig logging configuration.
        logging.config.dictConfig(json.load(open(lfn)))
    else:
        # Fallback: everything to root at the lowest threshold.
        logging.basicConfig()
        logging.getLogger().setLevel(logging.NOTSET)
        # NOTE(review): reconstructed indentation — the logging_tree dump is
        # assumed to belong to this fallback branch; confirm against history.
        try:
            import logging_tree
            logging_tree.printout()
        except ImportError:
            pass
    log.debug('DEBUG LOGGING ON')
    log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format(
        JOB_TYPE, SLEEP_S))
    exitOnFailure = False  # assigned but not used below
    concurrent_jobs = 2
    #Workflow = pypeflow.controller.PypeThreadWorkflow
    Workflow = PypeProcWatcherWorkflow
    Workflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = Workflow(job_type=JOB_TYPE)
    par = dict(sleep_s=SLEEP_S)
    DIR = 'mytmp'
    makedirs(DIR)
    f0 = makePypeLocalFile('mytmp/f0')
    f1 = makePypeLocalFile('mytmp/f1')
    # First task: no inputs, produces f0.
    make_task = PypeTask(
        #inputs = {'f': f},
        outputs={'f0': f0},
        parameters=par,
        TaskType=MyFakePypeThreadTaskBase)
    task = make_task(taskrun0)
    wf.addTasks([task])
    # Second task: consumes f0, produces f1.
    make_task = PypeTask(inputs={'f0': f0},
                         outputs={'f1': f1},
                         parameters=par,
                         TaskType=MyFakePypeThreadTaskBase)
    task = make_task(taskrun1)
    wf.addTasks([task])
    wf.refreshTargets([task])
def create_daligner_tasks(run_jobs_fn, wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    """Create daligner tasks from parsed run_jobs.sh job descriptions (Python 2).

    Returns (tasks, tasks_out); tasks_out maps 'ajob_<uid>' to the done-file
    PypeLocalFile of each task.
    """
    tasks = []
    tasks_out = {}
    nblock = get_nblock(fn(db_file))
    xform_script = get_script_xformer(pread_aln)
    job_descs = get_daligner_job_descriptions(open(run_jobs_fn), db_prefix)
    for seq, (desc, raw_script) in enumerate(job_descs.iteritems()):
        # Sequential uid (md5-of-script was used historically).
        uid = '%08d' % seq
        support.make_dirs(os.path.join(wd, "./job_%s" % uid))
        # Symlink DB files into the per-job directory.
        call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (
            wd, uid, db_prefix, db_prefix, db_prefix)
        rc = system(call)
        if rc:
            raise Exception("Failure in system call: %r -> %d" % (call, rc))
        job_done = makePypeLocalFile(
            os.path.abspath("%s/job_%s/job_%s_done" % (wd, uid, uid)))
        script = xform_script(raw_script)
        task_params = {
            "daligner_cmd": script,
            "cwd": os.path.join(wd, "job_%s" % uid),
            "job_uid": uid,
            "config": config,
            "nblock": nblock,
            "db_prefix": db_prefix,
        }
        factory = PypeTask(
            inputs={"rdb_build_done": rdb_build_done},
            outputs={"job_done": job_done},
            parameters=task_params,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/d_%s_%s" % (uid, db_prefix))
        tasks.append(factory(task_run_daligner))
        tasks_out["ajob_%s" % uid] = job_done
    return tasks, tasks_out
def create_merge_gather_task(wd, inputs):
    """Gather all merged .las outputs into las.fofn / las.fopfn.

    Returns (task, las_fofn_plf, las_fopfn_plf).
    """
    fofn_plf = makePypeLocalFile(os.path.join(wd, 'las.fofn'))
    fopfn_plf = makePypeLocalFile(os.path.join(wd, 'las.fopfn'))
    factory = PypeTask(
        inputs=inputs,  # p_ids_merged_las
        outputs={
            'las_fofn': fofn_plf,
            'las_fopfn': fopfn_plf,
        },
        TaskType=MyFakePypeThreadTaskBase,
    )  # URL = 'task://localhost/pmerge_gather'
    gather_task = factory(pype_tasks.task_merge_gather)
    return gather_task, fofn_plf, fopfn_plf
"target_sa": t_sa } outputs = {"job_done": job_done} parameters = { "mapping_data_dir": mapping_data_dir, "q_sn": q_sn, "t_sn": t_sn, "config": config } #for testing #task_decorator = PypeTask(inputs = inputs, outputs = outputs, parameters = parameters, TaskType = PypeTaskBase ) #task() make_mapping_task = PypeTask( inputs=inputs, outputs=outputs, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/mapping_task_q%05d_t%05d" % (q_sn, t_sn)) mapping_task = make_mapping_task(blasr_align) wf.addTask(mapping_task) qf_out = os.path.join(mapping_data_dir, "qf%05d.m4" % q_sn) qf_out = makePypeLocalFile(qf_out) all_qf_out["qf_out_%s" % q_sn] = qf_out job_done = os.path.join(mapping_data_dir, "qf%05d_done" % q_sn) job_done = makePypeLocalFile(job_done) all_qf_out["qf_done_%s" % q_sn] = job_done parameters = {"mapping_data_dir": mapping_data_dir, "q_sn": q_sn}
def testDistributed(runmode, cleanup):
    """Stress-test workflow: 5 layers of randomly-wired shell tasks.

    runmode selects the task flavor ('internal', 'localshell', 'sge',
    'mixed'); cleanup == "1" seeds the root input files. Dumps the workflow
    graph to test.dot / test_short_name.dot / test.rdf (and test.mk unless
    runmode is 'internal') when done.
    """
    logger.info("test start")
    baseDir = "."
    import random
    random.seed(1984)  # deterministic topology across runs
    #PypeThreadWorkflow.setNumThreadAllowed(20,20)
    #wf = PypeThreadWorkflow()
    PypeMPWorkflow.setNumThreadAllowed(20, 20)
    wf = PypeMPWorkflow()
    allTasks = []
    for layer in range(5):
        fN = random.randint(3, 7)
        fin = [None] * fN
        fout = [None] * fN
        fmut = [None] * fN
        # Declare this layer's input/output/mutable file objects.
        for w in range(fN):
            fin[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer, w))
            fout[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer + 1, w))
            fmut[w] = makePypeLocalFile(baseDir + "/testdata/m_testfile_l%d_w%d.dat" % (layer + 1, w))
            #wf.addObjects([fin[w], fout[w], fmut[w]])
        for w in range(fN):
            inputDataObjs = {}
            outputDataObjs = {}
            mutableDataObjs = {}
            # Randomly wire 5 inputs from the previous layer.
            for i in range(5):
                inputDataObjs["infile%d" % i] = random.choice(fin)
            i = 0
            # NOTE: the assignment is commented out, so mutableDataObjs stays
            # empty; the loop only advances i (consuming the same RNG stream).
            for obj in random.sample(fmut, 2):
                #mutableDataObjs["outfile%d" % i] = obj
                i += 1
            outputDataObjs["outfile%d" % i] = fout[w]
            # Shell script appends a marker line to each output/mutable file.
            shellCmd = "sleep 1\n" + "\n".join(
                ["echo %d %d ... >> %s" % (layer, w, of.localFileName)
                 for of in outputDataObjs.values()]) + "\nsleep 10"
            shellCmd += "sleep 1\n" + "\n".join(
                ["echo %d %d ... >> %s" % (layer, w, of.localFileName)
                 for of in mutableDataObjs.values()]) + "\nsleep 10"
            shellFileName = baseDir + "/testdata/task_l%d_w%d.sh" % (layer, w)
            shfile = open(shellFileName, 'w')
            print(shellCmd, file=shfile)
            shfile.close()
            if runmode == "internal":
                # In-process task: random sleep, then touch all outputs.
                def t1(self):
                    runShellCmd(["sleep", "%d" % random.randint(0, 20)])
                    for of in self.outputDataObjs.values():
                        runShellCmd(["touch", of.localFileName])
                task = PypeTask(inputDataObjs=inputDataObjs,
                                outputDataObjs=outputDataObjs,
                                mutableDataObjs=mutableDataObjs,
                                URL="task://internal/task_l%d_w%d" % (layer, w),
                                TaskType=PypeThreadTaskBase)(t1)
            elif runmode == "localshell":
                task = PypeShellTask(inputDataObjs=inputDataObjs,
                                     outputDataObjs=outputDataObjs,
                                     mutableDataObjs=mutableDataObjs,
                                     URL="task://localshell/task_l%d_w%d" % (layer, w),
                                     TaskType=PypeThreadTaskBase)("%s" % shellFileName)
            elif runmode == "sge":
                task = PypeSGETask(inputDataObjs=inputDataObjs,
                                   outputDataObjs=outputDataObjs,
                                   mutableDataObjs=mutableDataObjs,
                                   URL="task://sge/task_l%d_w%d" % (layer, w),
                                   TaskType=PypeThreadTaskBase)("%s" % shellFileName)
            elif runmode == "mixed":
                #distributed = random.choice( (False, True) )
                # Every third task runs distributed.
                distributed = True if w % 3 == 0 else False
                task = PypeDistributibleTask(inputDataObjs=inputDataObjs,
                                             outputDataObjs=outputDataObjs,
                                             mutableDataObjs=mutableDataObjs,
                                             URL="task://sge/task_l%d_w%d" % (layer, w),
                                             distributed=distributed,
                                             TaskType=PypeThreadTaskBase)("%s" % shellFileName)
            wf.addTasks([task])
            allTasks.append(task)
    # Seed files with no prerequisites (workflow roots) when cleanup == "1".
    for URL in wf._pypeObjects:
        prereqJobURLs = [str(u) for u in
                         wf._RDFGraph.transitive_objects(URIRef(URL), pypeNS["prereq"])
                         if isinstance(wf._pypeObjects[str(u)], PypeLocalFile) and str(u) != URL]
        if len(prereqJobURLs) == 0:
            if cleanup == "1":
                os.system("echo start > %s" % wf._pypeObjects[URL].localFileName)
            pass
    wf.refreshTargets(allTasks)
    # Dump the dependency graph in several formats.
    dotFile = open("test.dot", "w")
    #print >>dotFile, wf.graphvizShortNameDot
    print(wf.graphvizDot, file=dotFile)
    dotFile.close()
    dotFile = open("test_short_name.dot", "w")
    print(wf.graphvizShortNameDot, file=dotFile)
    dotFile.close()
    rdfFile = open("test.rdf", "w")
    print(wf.RDFXML, file=rdfFile)
    rdfFile.close()
    if runmode != "internal":
        mkFile = open("test.mk", "w")
        print(wf.makeFileStr, file=mkFile)
        mkFile.close()
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Top-level FALCON pipeline driver (Python 2).

    Stage 0 (raw reads): build DB, daligner overlaps, LAmerge, consensus.
    Stage 1 (preads): build pread DB, overlap, merge, db2falcon, assembly.
    config["input_type"] selects whether stage 0 runs ('raw') or is skipped
    ('preads'); config["target"] can stop early ('overlapping',
    'pre-assembly').
    """
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)
    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = support.get_dict_from_old_falcon_cfg(
        support.parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)
    exitOnFailure = config[
        'stop_all_jobs_on_failure']  # only matter for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    Workflow = PypeProcWatcherWorkflow
    PypeProcWatcherWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeProcWatcherWorkflow(job_type=config['job_type'])
    # Normalize the input FOFN to absolute paths inside 0-rawreads.
    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=MyFakePypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])
    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {"work_dir": rawread_dir, "config": config}
        raw_reads_db = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={
                                           "rdb_build_done": rdb_build_done,
                                           "raw_reads_db": raw_reads_db,
                                           "run_jobs": run_jobs,
                                       },
                                       parameters=parameters,
                                       TaskType=MyFakePypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)
        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])
        raw_reads_nblock = support.get_nblock(fn(raw_reads_db))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config)
        wf.addTasks(daligner_tasks)
        # Gather/check task: waits on every daligner job.
        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))
        parameters = {
            "nblock": raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(inputs=daligner_out,
                                        outputs={"da_done": r_da_done},
                                        parameters=parameters,
                                        TaskType=MyFakePypeThreadTaskBase,
                                        URL="task://localhost/rda_check")
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        if config["target"] == "overlapping":
            sys.exit(0)
        consensus_tasks, consensus_out = create_consensus_tasks(
            rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks(consensus_tasks)
        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        # Inline check task: collects all consensus fastas into the pread FOFN.
        @PypeTask(inputs=consensus_out,
                  outputs={
                      "cns_done": r_cns_done,
                      "pread_fofn": pread_fofn
                  },
                  TaskType=MyFakePypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)
        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeProcWatcherWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        if config["target"] == "pre-assembly":
            sys.exit(0)
    # build pread database
    if config["input_type"] == "preads":
        # Preads were supplied directly; just normalize their FOFN.
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=MyFakePypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])
    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}
    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(
        pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={
                                       "pdb_build_done": pdb_build_done,
                                       "preads_db": preads_db,
                                       "run_jobs": run_jobs
                                   },
                                   parameters=parameters,
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])
    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    # Reuse the daligner/LAmerge machinery with pread-specific SGE options.
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs),
                                                         pread_dir,
                                                         "preads",
                                                         pdb_build_done,
                                                         config,
                                                         pread_aln=True)
    wf.addTasks(daligner_tasks)
    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))
    parameters = {
        "nblock": preads_nblock,
    }
    make_daligner_gather = PypeTask(inputs=daligner_out,
                                    outputs={"da_done": p_da_done},
                                    parameters=parameters,
                                    TaskType=MyFakePypeThreadTaskBase,
                                    URL="task://localhost/pda_check")
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)
    merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir,
                                                   "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    # Inline check task: marks completion of all pread merges.
    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=MyFakePypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeProcWatcherWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf.refreshTargets(exitOnFailure=exitOnFailure)
    # Export the pread DB to FALCON format, then run the final assembly.
    db2falcon_done = makePypeLocalFile(
        os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(inputs={
        "p_merge_done": p_merge_done,
    },
                                  outputs={"db2falcon_done": db2falcon_done},
                                  parameters={
                                      "wd": pread_dir,
                                      "config": config,
                                  },
                                  TaskType=MyFakePypeThreadTaskBase,
                                  URL="task://localhost/db2falcon")
    wf.addTask(make_run_db2falcon(task_run_db2falcon))
    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(
        inputs={
            "db2falcon_done": db2falcon_done,
            "db_file": preads_db
        },
        outputs={"falcon_asm_done": falcon_asm_done},
        parameters={
            "wd": falcon_asm_dir,
            "config": config,
            "pread_dir": pread_dir
        },
        TaskType=MyFakePypeThreadTaskBase,
        URL="task://localhost/falcon")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()
def main(argv=sys.argv):
    """Quiver (consensus-polishing) driver over FALCON-unzip output (Python 2).

    Reads cluster settings from the given cfg file, tracks reads, then runs
    one quiver task per contig found in 3-unzip/all_{p,h}_ctg.fa, finally
    concatenating the polished contigs into 4-quiver/cns_output/.
    """
    global fc_run_logger
    fc_run_logger = support.setup_logger(None)
    if len(sys.argv) < 2:
        print "you need to provide a configuration file to specific a couple cluster running environment"
        sys.exit(1)
    config_fn = sys.argv[1]
    config = ConfigParser.ConfigParser()
    config.read(config_fn)
    # Each option falls back to a hard-coded default when absent.
    job_type = "SGE"
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')
    sge_track_reads = " -pe smp 12 -q bigmem"
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')
    sge_quiver = " -pe smp 24 -q bigmem "
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')
    smrt_bin = "/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/"
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')
    input_bam_fofn = "input_bam.fofn"
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs')
    # Rebind config from ConfigParser to a plain dict for the tasks.
    config = {
        "job_type": job_type,
        "sge_quiver": sge_quiver,
        "sge_track_reads": sge_track_reads,
        "input_bam_fofn": input_bam_fofn,
        "smrt_bin": smrt_bin
    }
    support.job_type = "SGE"  #tmp hack until we have a configuration parser
    ctg_ids = []
    PypeThreadWorkflow.setNumThreadAllowed(quiver_concurrent_jobs,
                                           quiver_concurrent_jobs)
    wf = PypeThreadWorkflow()
    parameters = {"wd": os.path.abspath("."), "config": config}
    hasm_done = makePypeLocalFile("./3-unzip/1-hasm/hasm_done")
    job_done = makePypeLocalFile(
        os.path.join(parameters["wd"], "track_reads_h_done"))
    make_track_reads_task = PypeTask(inputs={"hasm_done": hasm_done},
                                     outputs={"job_done": job_done},
                                     parameters=parameters,
                                     TaskType=PypeThreadTaskBase,
                                     URL="task://localhost/track_reads_h")
    track_reads_task = make_track_reads_task(task_track_reads)
    wf.addTask(track_reads_task)
    wf.refreshTargets()  #force refresh now, will put proper dependence later
    # Load all primary ('p') and haplotig ('h') contig sequences.
    ref_seq_data = {}
    p_ctg_fa = FastaReader("./3-unzip/all_p_ctg.fa")
    ctg_types = {}
    for r in p_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "p"
    h_ctg_fa = FastaReader("./3-unzip/all_h_ctg.fa")
    for r in h_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "h"
    ctg_ids = sorted(ref_seq_data.keys())
    p_ctg_out = []
    h_ctg_out = []
    for ctg_id in ctg_ids:
        sequence = ref_seq_data[ctg_id]
        m_ctg_id = ctg_id.split("-")[0]
        wd = os.path.join(os.getcwd(), "./4-quiver/", m_ctg_id)
        mkdir(wd)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_ref.fa".format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(
                os.getcwd(), "./4-quiver/reads/"
                "{ctg_id}.sam".format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fasta.gz".format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fastq.gz".format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_quiver_done".format(ctg_id=ctg_id)))
        # Only contigs that actually have mapped reads get a quiver task.
        # NOTE(review): reconstructed indentation — the task creation below is
        # assumed to sit inside this if; confirm against upstream history.
        if os.path.exists(fn(read_sam)):
            if ctg_types[ctg_id] == "p":
                p_ctg_out.append((cns_fasta, cns_fastq))
            if ctg_types[ctg_id] == "h":
                h_ctg_out.append((cns_fasta, cns_fastq))
            if not os.path.exists(fn(ref_fasta)):
                with open(fn(ref_fasta), "w") as f:
                    print >> f, ">" + ctg_id
                    print >> f, sequence
            parameters = {
                "job_uid": "q-" + ctg_id,
                "wd": wd,
                "config": config,
                "ctg_id": ctg_id
            }
            make_quiver_task = PypeTask(
                inputs={
                    "ref_fasta": ref_fasta,
                    "read_sam": read_sam
                },
                outputs={
                    "cns_fasta": cns_fasta,
                    "cns_fastq": cns_fastq,
                    "job_done": job_done
                },
                parameters=parameters,
                TaskType=PypeThreadTaskBase,
                URL="task://localhost/q_{ctg_id}".format(ctg_id=ctg_id))
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)
    wf.refreshTargets()
    os.system("sleep 30")
    # Concatenate the gzipped per-contig consensus into combined outputs,
    # removing any stale files first.
    mkdir("./4-quiver/cns_output")
    os.system("rm ./4-quiver/cns_output/cns_p_ctg.fasta")
    os.system("rm ./4-quiver/cns_output/cns_p_ctg.fastq")
    for cns_fasta, cns_fastq in sorted(p_ctg_out):
        os.system(
            "zcat {cns_fasta} >> ./4-quiver/cns_output/cns_p_ctg.fasta".format(
                cns_fasta=fn(cns_fasta)))
        os.system(
            "zcat {cns_fastq} >> ./4-quiver/cns_output/cns_p_ctg.fastq".format(
                cns_fastq=fn(cns_fastq)))
    os.system("rm ./4-quiver/cns_output/cns_h_ctg.fasta")
    os.system("rm ./4-quiver/cns_output/cns_h_ctg.fastq")
    for cns_fasta, cns_fastq in sorted(h_ctg_out):
        os.system(
            "zcat {cns_fasta} >> ./4-quiver/cns_output/cns_h_ctg.fasta".format(
                cns_fasta=fn(cns_fasta)))
        os.system(
            "zcat {cns_fastq} >> ./4-quiver/cns_output/cns_h_ctg.fastq".format(
                cns_fastq=fn(cns_fastq)))
with open("%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id), "w") as merge_script: #print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix) for l in s_data: print >> merge_script, l print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id) print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id) merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) ) job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id) )) parameters = {"merge_script": merge_script_file, "cwd": os.path.join(wd, "m_%05d" % p_id), "job_id": p_id, "config": config} make_merge_task = PypeTask( inputs = {"input_dep": input_dep}, outputs = {"job_done": job_done}, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/m_%05d_%s" % (p_id, db_prefix) ) merge_task = make_merge_task ( run_merge_task ) merge_out["mjob_%d" % p_id] = job_done merge_tasks.append(merge_task) out_file = makePypeLocalFile(os.path.abspath( "%s/preads/out.%05d.fa" % (wd, p_id) )) out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id) )) parameters = {"cwd": os.path.join(wd, "preads" ), "job_id": p_id, "prefix": db_prefix,
with open(fn(ref_fasta), "w") as f: print >> f, ">" + ctg_id print >> f, sequence parameters = { "job_uid": "q-" + ctg_id, "wd": wd, "config": config, "ctg_id": ctg_id } make_quiver_task = PypeTask( inputs={ "ref_fasta": ref_fasta, "read_sam": read_sam }, outputs={ "cns_fasta": cns_fasta, "cns_fastq": cns_fastq, "job_done": job_done }, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/q_{ctg_id}".format(ctg_id=ctg_id)) quiver_task = make_quiver_task(task_run_quiver) wf.addTask(quiver_task) wf.refreshTargets() os.system("sleep 30") mkdir("./4-quiver/cns_output") os.system("rm ./4-quiver/cns_output/cns_p_ctg.fasta") os.system("rm ./4-quiver/cns_output/cns_p_ctg.fastq")
with open("run_jobs.sh") as f: for l in f: l = l.strip().split() if l[0] == "daligner": try: os.makedirs("./job_%05d" % job_id) except OSError: pass os.system( "cd ./job_%05d;ln -s ../.%s.bps .; ln -s ../.%s.idx .; ln -s ../%s.db ." % (job_id, prefix, prefix, prefix)) job_done = makePypeLocalFile( os.path.abspath("./job_%05d/job_%05d_done" % (job_id, job_id))) parameters = { "daligner_cmd": " ".join(l), "cwd": os.path.join(os.getcwd(), "job_%05d" % job_id), "job_id": job_id } make_daligner_task = PypeTask( inputs={"db_file": db_file}, outputs={"job_done": job_done}, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/mtask_%05d" % job_id) daligner_task = make_daligner_task(run_daligner) wf.addTask(daligner_task) job_id += 1 print job_id wf.refreshTargets(updateFreq=45) #all
def unzip_all(config):
    """Build and run the stage-3 'unzip' workflow.

    Pipeline (one PypeThreadWorkflow):
      1. track_reads   - produces ./3-unzip/reads/ctg_list from the stage-2
                         falcon assembly.
      2. per contig    - a blasr alignment task and a phasing task under
                         ./3-unzip/0-phasing/<ctg_id>/.
      3. gather        - concatenates all per-contig rid_to_phase files into
                         ./3-unzip/1-hasm/rid_to_phase.all.
      4. hasm          - the haplotype assembly task.

    :param config: dict-like configuration; reads "unzip_concurrent_jobs"
        here and passes the whole dict to each task's parameters.
    Side effects: creates directories, schedules/runs tasks, writes files
    under ./3-unzip/.  Returns None.
    """
    unzip_concurrent_jobs = config["unzip_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(unzip_concurrent_jobs, unzip_concurrent_jobs)
    wf = PypeThreadWorkflow()

    ctg_list_file = makePypeLocalFile("./3-unzip/reads/ctg_list")
    falcon_asm_done = makePypeLocalFile("./2-asm-falcon/falcon_asm_done")
    parameters = {"wd": os.path.abspath("."), "config": config}

    job_done = makePypeLocalFile(
        os.path.join(parameters["wd"], "track_reads_done"))
    make_track_reads_task = PypeTask(
        inputs={"falcon_asm_done": falcon_asm_done},
        outputs={
            "job_done": job_done,
            "ctg_list_file": ctg_list_file
        },
        parameters=parameters,
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/track_reads")
    track_reads_task = make_track_reads_task(task_track_reads)
    wf.addTask(track_reads_task)
    # Run track_reads immediately so ctg_list exists before we read it below.
    wf.refreshTargets()  # force refresh now, will put proper dependence later

    # One contig id per line of ctg_list.
    ctg_ids = []
    with open("./3-unzip/reads/ctg_list") as f:
        for row in f:
            row = row.strip()
            ctg_ids.append(row)

    aln1_outs = {}
    all_ctg_out = {}
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            "./3-unzip/reads/{ctg_id}_ref.fa".format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            "./3-unzip/reads/{ctg_id}_reads.fa".format(ctg_id=ctg_id))
        # outputs
        wd = os.path.join(
            os.getcwd(), "./3-unzip/0-phasing/{ctg_id}/".format(ctg_id=ctg_id))
        mkdir(wd)
        ctg_aln_out = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_sorted.bam".format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, "aln_{ctg_id}_done".format(ctg_id=ctg_id)))
        parameters = {
            "job_uid": "aln-" + ctg_id,
            "wd": wd,
            "config": config,
            "ctg_id": ctg_id
        }
        make_blasr_task = PypeTask(
            inputs={
                "ref_fasta": ref_fasta,
                "read_fasta": read_fasta
            },
            outputs={
                "ctg_aln_out": ctg_aln_out,
                "job_done": job_done
            },
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/aln_{ctg_id}".format(ctg_id=ctg_id))
        blasr_task = make_blasr_task(task_run_blasr)
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(blasr_task)

        # Phasing task consumes the sorted alignment produced above.
        # NOTE: job_done is rebound here; the alignment's done-file is kept
        # in aln1_outs.
        job_done = makePypeLocalFile(
            os.path.join(wd, "p_{ctg_id}_done".format(ctg_id=ctg_id)))
        rid_to_phase_out = makePypeLocalFile(
            os.path.join(wd, "rid_to_phase.{ctg_id}".format(ctg_id=ctg_id)))
        all_ctg_out["r2p.{ctg_id}".format(ctg_id=ctg_id)] = rid_to_phase_out
        parameters = {
            "job_uid": "ha-" + ctg_id,
            "wd": wd,
            "config": config,
            "ctg_id": ctg_id
        }
        make_phasing_task = PypeTask(
            inputs={
                "ref_fasta": ref_fasta,
                "aln_bam": ctg_aln_out
            },
            outputs={"job_done": job_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/p_{ctg_id}".format(ctg_id=ctg_id))
        phasing_task = make_phasing_task(task_phasing)
        wf.addTask(phasing_task)
    wf.refreshTargets()

    hasm_wd = os.path.abspath("./3-unzip/1-hasm/")
    mkdir(hasm_wd)
    rid_to_phase_all = makePypeLocalFile(
        os.path.join(hasm_wd, "rid_to_phase.all"))

    @PypeTask(inputs=all_ctg_out,
              outputs={"rid_to_phase_all": rid_to_phase_all},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/rid_to_phase_all")
    def get_rid_to_phase_all(self):
        # Concatenate every per-contig rid_to_phase file (sorted by filename
        # for a deterministic order) into one file.
        rid_to_phase_all_fn = fn(self.rid_to_phase_all)
        inputs_fn = [fn(f) for f in self.inputs.values()]
        inputs_fn.sort()
        output = []
        for fname in inputs_fn:
            # NOTE(review): extend() over a str appends one character at a
            # time; the final join reproduces the content, but a plain
            # append would be cheaper.  Read handles are never closed
            # explicitly (relies on refcounting).
            output.extend(open(fname).read())
        out = open(rid_to_phase_all_fn, "w")
        out.write("".join(output))
        out.close()

    wf.addTask(get_rid_to_phase_all)
    # Reuse the track_reads parameters dict, retargeted at the hasm workdir.
    parameters["wd"] = hasm_wd
    job_done = makePypeLocalFile(os.path.join(hasm_wd, "hasm_done"))
    make_hasm_task = PypeTask(inputs={"rid_to_phase_all": rid_to_phase_all},
                              outputs={"job_done": job_done},
                              parameters=parameters,
                              TaskType=PypeThreadTaskBase,
                              URL="task://localhost/hasm")
    hasm_task = make_hasm_task(task_hasm)
    wf.addTask(hasm_task)
    wf.refreshTargets()
config = get_config(sys.argv[1]) concurrent_jobs = config["pa_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf = PypeThreadWorkflow() if config["input_type"] == "raw": #### import sequences into daligner DB input_h5_fofn = makePypeLocalFile( os.path.abspath(config["input_fofn_fn"])) rdb_build_done = makePypeLocalFile( os.path.join(rawread_dir, "rdb_build_done")) parameters = {"work_dir": rawread_dir, "config": config} make_buid_rdb_task = PypeTask( inputs={"input_fofn": input_h5_fofn}, outputs={"rdb_build_done": rdb_build_done}, parameters=parameters, TaskType=PypeThreadTaskBase) buid_rdb_task = make_buid_rdb_task(build_rdb) wf.addTasks([buid_rdb_task]) wf.refreshTargets([rdb_build_done]) db_file = makePypeLocalFile( os.path.join(rawread_dir, "%s.db" % "raw_reads")) #### run daligner daligner_tasks, daligner_out = create_daligner_tasks( rawread_dir, "raw_reads", db_file, rdb_build_done, config) wf.addTasks(daligner_tasks)
def run(
        wf,
        config,
        input_fofn_plf,
        setNumThreadAllowed,
):
    """Drive the full FALCON pipeline on an injected workflow object.

    Stage 0 (0-rawreads): fofn normalization, raw-reads DB build, daligner,
    merge, consensus, pre-assembly report.  Stage 1 (1-preads_ovl): pread DB
    build, daligner, merge, DB2Falcon.  Stage 2 (2-asm-falcon): the final
    falcon assembly task.

    :param wf: workflow object supporting addTask(s)/refreshTargets.
    :param config: pipeline configuration dict (mutated: sge_option_da/_la
        are overwritten with the pread-stage values partway through).
    :param input_fofn_plf: PypeLocalFile of the input file-of-filenames.
    :param setNumThreadAllowed: callable used to (re)set workflow concurrency.
    :returns: the falcon_asm_done PypeLocalFile sentinel.

    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config[
        'stop_all_jobs_on_failure']  # only matter for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    setNumThreadAllowed(concurrent_jobs, concurrent_jobs)

    # Normalize the input fofn to absolute paths inside 0-rawreads.
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=MyFakePypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {
            "work_dir": rawread_dir,
            "sge_option": config["sge_option_da"],
            "config": config
        }

        length_cutoff_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "length_cutoff"))
        raw_reads_db_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={
                                           "rdb_build_done": rdb_build_done,
                                           "raw_reads_db": raw_reads_db_plf,
                                           "length_cutoff": length_cutoff_plf,
                                           "run_jobs": run_jobs,
                                       },
                                       parameters=parameters,
                                       TaskType=MyFakePypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)
        wf.addTasks([build_rdb_task])
        # Block here: run_jobs.sh and the DB must exist before we can parse
        # them to create the daligner/merge task graphs below.
        wf.refreshTargets([rdb_build_done])
        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config)

        wf.addTasks(daligner_tasks)
        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))
        parameters = {
            "nblock": raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(inputs=daligner_out,
                                        outputs={"da_done": r_da_done},
                                        parameters=parameters,
                                        TaskType=MyFakePypeThreadTaskBase,
                                        URL="task://localhost/rda_check")
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "overlapping":
            sys.exit(0)
        consensus_tasks, consensus_out = create_consensus_tasks(
            rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={
                      "cns_done": r_cns_done,
                      "pread_fofn": pread_fofn
                  },
                  TaskType=MyFakePypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            # Gather all consensus fasta outputs into the pread fofn, sorted
            # for determinism, then touch the stage sentinel.
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)

        pre_assembly_report_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "pre_assembly_stats.json")
        )  #tho technically it needs pread_fofn
        make_task = PypeTask(inputs={
            "length_cutoff_fn": length_cutoff_plf,
            "raw_reads_db": raw_reads_db_plf,
            "preads_fofn": pread_fofn,
        },
                             outputs={
                                 "pre_assembly_report": pre_assembly_report_plf,
                             },
                             parameters=config,
                             TaskType=MyFakePypeThreadTaskBase,
                             URL="task://localhost/report_pre_assembly")
        task = make_task(task_report_pre_assembly)
        wf.addTask(task)

        concurrent_jobs = config["cns_concurrent_jobs"]
        setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

    if config["target"] == "pre-assembly":
        log.info("Quitting after stage-0 for 'pre-assembly' target.")
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        # Skipping stage 0: the user-supplied fofn already lists preads.
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=MyFakePypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, "pdb_build_done"))
    parameters = {
        "work_dir": pread_dir,
        "sge_option": config["sge_option_pda"],
        "config": config
    }
    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(
        pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={
                                       "pdb_build_done": pdb_build_done,
                                       "preads_db": preads_db,
                                       "run_jobs": run_jobs,
                                   },
                                   parameters=parameters,
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    # Pread stage reuses the daligner machinery with the pread SGE options.
    config["sge_option_da"] = config["sge_option_pda"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs),
                                                         pread_dir,
                                                         "preads",
                                                         pdb_build_done,
                                                         config,
                                                         pread_aln=True)
    wf.addTasks(daligner_tasks)

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))
    parameters = {
        "nblock": preads_nblock,
    }
    make_daligner_gather = PypeTask(inputs=daligner_out,
                                    outputs={"da_done": p_da_done},
                                    parameters=parameters,
                                    TaskType=MyFakePypeThreadTaskBase,
                                    URL="task://localhost/pda_check")
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    config["sge_option_la"] = config["sge_option_pla"]
    merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir,
                                                   "preads", p_da_done, config)
    wf.addTasks(merge_tasks)

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=MyFakePypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)

    concurrent_jobs = config["ovlp_concurrent_jobs"]
    setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    db2falcon_done = makePypeLocalFile(
        os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(inputs={
        "p_merge_done": p_merge_done,
    },
                                  outputs={"db2falcon_done": db2falcon_done},
                                  parameters={
                                      "wd": pread_dir,
                                      "config": config,
                                      "sge_option": config["sge_option_fc"],
                                  },
                                  TaskType=MyFakePypeThreadTaskBase,
                                  URL="task://localhost/db2falcon")
    wf.addTask(make_run_db2falcon(task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(
        inputs={
            "db2falcon_done": db2falcon_done,
            "db_file": preads_db
        },
        outputs={"falcon_asm_done": falcon_asm_done},
        parameters={
            "wd": falcon_asm_dir,
            "config": config,
            "pread_dir": pread_dir,
            "sge_option": config["sge_option_fc"],
        },
        TaskType=MyFakePypeThreadTaskBase,
        URL="task://localhost/falcon_asm")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Entry point: run the FALCON pipeline with a self-owned workflow.

    Sets up logging and config, creates the stage directories, then wires
    and executes stage 0 (raw reads), stage 1 (pread overlap) and stage 2
    (falcon assembly) on a PypeThreadWorkflow.  May call sys.exit(0) early
    for the "overlapping" and "pre-assembly" targets.

    :param prog_name: program name (unused here beyond the signature).
    :param input_config_fn: path to the falcon cfg file.
    :param logger_config_fn: optional logging config path.
    """
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s",
                       input_config_fn)
    config = support.get_dict_from_old_falcon_cfg(
        support.parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    input_fofn_plf = makePypeLocalFile(
        os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {"work_dir": rawread_dir, "config": config}
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={
                                           "rdb_build_done": rdb_build_done,
                                           "run_jobs": run_jobs
                                       },
                                       parameters=parameters,
                                       TaskType=PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)
        wf.addTasks([build_rdb_task])
        # Block: run_jobs.sh must exist before the daligner graph is built.
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", db_file, rdb_build_done,
            config)

        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs

        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        @PypeTask(inputs=daligner_out,
                  outputs={"da_done": r_da_done},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/rda_check")
        def check_r_da_task(self):
            # Gather: touch the stage sentinel once all daligner jobs finish.
            system("touch %s" % fn(self.da_done))

        wf.addTask(check_r_da_task)
        wf.refreshTargets(
            updateFreq=wait_time
        )  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs,
                                               concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        if config["target"] == "overlapping":
            wf.refreshTargets(
                updateFreq=wait_time
            )  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            sys.exit(0)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={
                      "cns_done": r_cns_done,
                      "pread_fofn": pread_fofn
                  },
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            # Collect consensus fasta outputs (sorted for determinism) into
            # the pread fofn, then touch the stage sentinel.
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)
        wf.refreshTargets(
            updateFreq=wait_time)  # larger number better for more jobs

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir,
                         os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}
    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={
                                       "pdb_build_done": pdb_build_done,
                                       "run_jobs": run_jobs
                                   },
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads"))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    # Pread stage reuses the stage-0 machinery with pread SGE options.
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs),
                                                         pread_dir,
                                                         "preads",
                                                         db_file,
                                                         pdb_build_done,
                                                         config,
                                                         pread_aln=True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    @PypeTask(inputs=daligner_out,
              outputs={"da_done": p_da_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pda_check")
    def check_p_da_task(self):
        system("touch %s" % fn(self.da_done))

    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
        fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    #wf.refreshTargets(updateFreq = 30) #all

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq=wait_time)  #all

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(
        inputs={
            "p_merge_done": p_merge_done,
            "db_file": db_file
        },
        outputs={"falcon_asm_done": falcon_asm_done},
        parameters={
            "wd": falcon_asm_dir,
            "config": config,
            "pread_dir": pread_dir
        },
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/falcon")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets(updateFreq=wait_time)  #all
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
    """Create LAmerge tasks and the downstream consensus tasks.

    Parses the HPC.daligner-generated `run_jobs_fn` script, grouping the
    LAsort/LAmerge/mv lines by merge-block id, writes one m_%05d/m_%05d.sh
    merge script per block, and builds a PypeTask pair (merge + consensus)
    for each block.

    :param run_jobs_fn: path to the run_jobs.sh produced by the DB build.
    :param wd: stage working directory (e.g. 0-rawreads).
    :param db_prefix: DAZZ_DB prefix, e.g. "raw_reads" or "preads".
    :param input_dep: upstream PypeLocalFile every merge task depends on.
    :param config: pipeline configuration dict, forwarded to each task.
    :returns: (merge_tasks, merge_out, consensus_tasks, consensus_out);
        merge_out maps "mjob_<id>" -> done-file, consensus_out maps
        "cjob_<id>" -> done-file.
    """
    merge_tasks = []
    consensus_tasks = []
    merge_out = {}
    consensus_out = {}
    mjob_data = {}

    def _merge_block_id(las_name):
        # Extract the merge-block id from a .las filename.  Level-2 merge
        # names carry an extra "L"-prefixed field before the block id
        # (e.g. prefix.L2.<id>...), plain names are prefix.<id>...
        fields = las_name.split(".")
        if fields[1][0] == "L":
            return int(fields[2])
        return int(fields[1])

    with open(run_jobs_fn) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ("LAsort", "LAmerge", "mv"):
                continue
            if l[0] == "LAsort":
                # We now run this part w/ daligner, but we still need
                # a small script for some book-keeping.
                p_id = int(l[2].split(".")[1])
                mjob_data.setdefault(p_id, [])
            elif l[0] == "LAmerge":
                p_id = _merge_block_id(l[2])
                mjob_data.setdefault(p_id, []).append(" ".join(l))
            elif l[0] == "mv":
                p_id = _merge_block_id(l[1])
                mjob_data.setdefault(p_id, []).append(" ".join(l))

    for p_id in mjob_data:
        s_data = mjob_data[p_id]
        support.make_dirs("%s/m_%05d" % (wd, p_id))
        support.make_dirs("%s/preads" % (wd))
        support.make_dirs("%s/las_files" % (wd))

        merge_script_file = os.path.abspath("%s/m_%05d/m_%05d.sh" %
                                            (wd, p_id, p_id))
        with open(merge_script_file, "w") as merge_script:
            for l in s_data:
                print >> merge_script, l
            # Expose the merged .las both in las_files/ and the stage root.
            print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (
                p_id, db_prefix, p_id)
            print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (
                p_id, db_prefix, p_id)

        job_done = makePypeLocalFile(
            os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)))
        parameters = {
            "merge_script": merge_script_file,
            "cwd": os.path.join(wd, "m_%05d" % p_id),
            "job_id": p_id,
            "config": config
        }
        make_merge_task = PypeTask(inputs={"input_dep": input_dep},
                                   outputs={"job_done": job_done},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/m_%05d_%s" %
                                   (p_id, db_prefix))
        merge_task = make_merge_task(task_run_las_merge)
        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)

        # Consensus over this merge block's alignments.
        out_file = makePypeLocalFile(
            os.path.abspath("%s/preads/out.%05d.fasta" % (wd, p_id)))
        out_done = makePypeLocalFile(
            os.path.abspath("%s/preads/c_%05d_done" % (wd, p_id)))
        parameters = {
            "cwd": os.path.join(wd, "preads"),
            "job_id": p_id,
            "prefix": db_prefix,
            "config": config
        }
        make_c_task = PypeTask(inputs={"job_done": job_done},
                               outputs={
                                   "out_file": out_file,
                                   "out_done": out_done
                               },
                               parameters=parameters,
                               TaskType=PypeThreadTaskBase,
                               URL="task://localhost/ct_%05d" % p_id)
        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done

    return merge_tasks, merge_out, consensus_tasks, consensus_out
def main1(prog_name, input_config_fn, logger_config_fn=None):
    """Entry point (older variant): run the FALCON pipeline end to end.

    Differs from the other main1 in this file: helpers are unprefixed
    (get_config/make_dirs/build_rdb/...), create_daligner_tasks and
    create_merge_tasks are called without a run_jobs path, shell commands go
    through os.system, and the final assembly step is an inline decorated
    task (run_falcon_asm_task) that writes and submits run_falcon_asm.sh.
    May call sys.exit(0) early for "overlapping"/"pre-assembly" targets.

    :param prog_name: program name (unused here beyond the signature).
    :param input_config_fn: path to the falcon cfg file.
    :param logger_config_fn: optional logging config path.
    """
    setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s",
                       input_config_fn)
    config = get_config(parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")
    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        make_dirs(d)
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    input_fofn_plf = makePypeLocalFile(
        os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        parameters = {"work_dir": rawread_dir, "config": config}
        make_build_rdb_task = PypeTask(
            inputs={"input_fofn": rawread_fofn_plf},
            outputs={"rdb_build_done": rdb_build_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(build_rdb)
        wf.addTasks([build_rdb_task])
        # Block until the DB exists before creating daligner tasks.
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            rawread_dir, "raw_reads", db_file, rdb_build_done, config)

        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs

        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        @PypeTask(inputs=daligner_out,
                  outputs={"da_done": r_da_done},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/rda_check")
        def check_r_da_task(self):
            # Gather: touch the stage sentinel once all daligner jobs finish.
            os.system("touch %s" % fn(self.da_done))

        wf.addTask(check_r_da_task)
        wf.refreshTargets(
            updateFreq=wait_time
        )  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs,
                                               concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
            rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        if config["target"] == "overlapping":
            wf.refreshTargets(
                updateFreq=wait_time
            )  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            sys.exit(0)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={
                      "cns_done": r_cns_done,
                      "pread_fofn": pread_fofn
                  },
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            # Collect consensus fasta outputs (sorted for determinism) into
            # the pread fofn, then touch the stage sentinel.
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            os.system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)
        wf.refreshTargets(
            updateFreq=wait_time)  # larger number better for more jobs

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir,
                         os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={"pdb_build_done": pdb_build_done},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(build_pdb)
    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads"))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    # Pread stage reuses the stage-0 machinery with pread SGE options.
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(pread_dir,
                                                         "preads",
                                                         db_file,
                                                         pdb_build_done,
                                                         config,
                                                         pread_aln=True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    @PypeTask(inputs=daligner_out,
              outputs={"da_done": p_da_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pda_check")
    def check_p_da_task(self):
        os.system("touch %s" % fn(self.da_done))

    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
        pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    #wf.refreshTargets(updateFreq = 30) #all

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        os.system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq=wait_time)  #all

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))

    @PypeTask(inputs={
        "p_merge_done": p_merge_done,
        "db_file": db_file
    },
              outputs={"falcon_asm_done": falcon_asm_done},
              parameters={
                  "wd": falcon_asm_dir,
                  "config": config,
                  "pread_dir": pread_dir
              },
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/falcon")
    def run_falcon_asm_task(self):
        # Writes run_falcon_asm.sh (DB2Falcon, fc_ovlp_filter,
        # fc_ovlp_to_graph, fc_graph_to_contig), submits it via run_script,
        # then blocks on the done-file.
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join(wd)
        script_fn = os.path.join(script_dir, "run_falcon_asm.sh")

        script = []
        script.append("set -vex")
        script.append("trap 'touch %s.exit' EXIT" %
                      fn(self.falcon_asm_done))
        script.append("source {install_prefix}/bin/activate".format(
            install_prefix=install_prefix))
        script.append("cd %s" % pread_dir)
        # Write preads4falcon.fasta, in 1-preads_ovl:
        script.append("DB2Falcon -U preads")
        script.append("cd %s" % wd)
        script.append("""find %s/las_files -name "*.las" > las.fofn """ %
                      pread_dir)
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        script.append(
            """fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\
            (fn(db_file), overlap_filtering_setting, length_cutoff_pr))
        script.append("ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append(
            """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log"""
            % length_cutoff_pr)  # TODO: drop this logfile
        # Write 'p_ctg.fa' and 'a_ctg.fa':
        script.append("""fc_graph_to_contig.py""")
        script.append("""touch %s""" % fn(self.falcon_asm_done))

        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))
        job_name = self.URL.split("/")[-1]
        job_name += "-" + str(uuid.uuid4())[:8]
        job_data = {
            "job_name": job_name,
            "cwd": wd,
            "sge_option": config["sge_option_fc"],
            "script_fn": script_fn
        }
        run_script(job_data, job_type=config["job_type"])
        wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_name)

    wf.addTask(run_falcon_asm_task)
    wf.refreshTargets(updateFreq=wait_time)  #all
def phasing(args):
    """Build and run the per-contig phasing workflow.

    Chains four pypeFLOW tasks for one contig:
    het_call -> g_atable -> get_phased_blocks -> get_phased_reads,
    with all intermediate files under <base_dir>/<ctg_id>/.
    """
    ctg_id = args.ctg_id
    base_dir = args.base_dir

    # Scan the fasta for this contig's sequence. The scan deliberately
    # does not break on a hit, so if the contig name repeats, the last
    # record wins (same as the original loop).
    ref_seq = ""
    for rec in FastaReader(args.fasta):
        if rec.name.split()[0] == ctg_id:
            ref_seq = rec.sequence.upper()

    def ctg_file(leaf):
        # Helper: a PypeLocalFile rooted at <base_dir>/<ctg_id>/<leaf>.
        return makePypeLocalFile(os.path.join(base_dir, ctg_id, leaf))

    PypeThreadWorkflow.setNumThreadAllowed(1, 1)
    wf = PypeThreadWorkflow()

    bam_file = makePypeLocalFile(args.bam)
    vmap_file = ctg_file("variant_map")
    vpos_file = ctg_file("variant_pos")
    q_id_map_file = ctg_file("q_id_map")

    # Stage 1: call heterozygous variants from the alignments.
    het_call_task = PypeTask(
        inputs={"bam_file": bam_file},
        outputs={
            "vmap_file": vmap_file,
            "vpos_file": vpos_file,
            "q_id_map_file": q_id_map_file
        },
        parameters={
            "ctg_id": ctg_id,
            "ref_seq": ref_seq,
            "base_dir": base_dir
        },
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/het_call")(make_het_call)
    wf.addTasks([het_call_task])

    # Stage 2: build the variant association table.
    atable_file = ctg_file("atable")
    atable_task = PypeTask(
        inputs={"vmap_file": vmap_file},
        outputs={"atable_file": atable_file},
        parameters={"ctg_id": ctg_id, "base_dir": base_dir},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/g_atable")(generate_association_table)
    wf.addTasks([atable_task])

    # Stage 3: derive phased variant blocks.
    phased_variant_file = ctg_file("phased_variants")
    blocks_task = PypeTask(
        inputs={"vmap_file": vmap_file, "atable_file": atable_file},
        outputs={"phased_variant_file": phased_variant_file},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_phased_blocks")(get_phased_blocks)
    wf.addTasks([blocks_task])

    # Stage 4: assign reads to phases.
    phased_read_file = ctg_file("phased_reads")
    reads_task = PypeTask(
        inputs={
            "vmap_file": vmap_file,
            "q_id_map_file": q_id_map_file,
            "phased_variant_file": phased_variant_file
        },
        outputs={"phased_read_file": phased_read_file},
        parameters={"ctg_id": ctg_id},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_phased_reads")(get_phased_reads)
    wf.addTasks([reads_task])

    wf.refreshTargets()
rank += 1 phased_reads = makePypeLocalFile(os.path.join(asm_dir, "all_phased_reads")) for las_key, las_file in all_raw_las_files.items(): las_fn = fn(las_file) idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number idx = int(idx.split(".")[1]) rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx)) make_dump_rawread_to_ctg = PypeTask( inputs = { "las_file": las_file, "rawread_db": rawread_db, "read_to_contig_map": read_to_contig_map, "rawread_id_file": rawread_id_file, "pread_id_file": pread_id_file, "phased_reads" : phased_reads}, outputs = { "rawread_to_contig_file": rawread_to_contig_file }, TaskType = PypeThreadTaskBase, URL = "task://localhost/r_read_to_contigs.%s" % idx ) dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg) wf.addTask( dump_rawread_to_ctg_task ) for las_key, las_file in all_pread_las_files.items(): las_fn = fn(las_file) idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number idx = int(idx.split(".")[1]) pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx)) make_dump_pread_to_ctg = PypeTask( inputs = { "las_file": las_file, "pread_db": pread_db, "read_to_contig_map": read_to_contig_map,
except OSError: pass with open("./p_%05d/p_%05d.sh" % (p_id, p_id), "w") as p_script: print >> p_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (prefix, p_id, prefix) for l in s_data: print >> p_script, l print >> p_script, "mv %s.%d.las ../las_files" % (prefix, p_id) p_file = os.path.abspath( "./p_%05d/p_%05d.sh" % (p_id, p_id) ) job_done = makePypeLocalFile(os.path.abspath( "./p_%05d/p_%05d_done" % (p_id,p_id) )) parameters = {"p_file": p_file, "cwd": os.path.join(os.getcwd(), "p_%05d" % p_id), "job_id": p_id} make_p_task = PypeTask( inputs = {"db_file": db_file}, outputs = {"job_done": job_done}, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/ptask_%05d" % p_id ) p_task = make_p_task ( run_p_task ) wf.addTask(p_task) out_file = makePypeLocalFile(os.path.abspath( "./preads/out.%04d.fa" % p_id )) parameters = {"cwd": os.path.join(os.getcwd(), "preads" ), "job_id": p_id} make_c_task = PypeTask( inputs = {"job_done": job_done}, outputs = {"out_file": out_file }, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/ct_%05d" % p_id )
def generate_read_to_contig_map(rawread_dir=rawread_dir, pread_dir=pread_dir, asm_dir=asm_dir):
    """Build and run a workflow that maps raw reads and preads to contigs.

    Writes its outputs under <asm_dir>/read_maps/:
      - read_to_contig_map           (pread id, raw-read id, original id, contig)
      - rawread_to_contigs.<idx>     one per raw_reads.<idx>.las file
      - pread_to_contigs.<idx>       one per preads.<idx>.las file

    NOTE(review): the default arguments bind module-level rawread_dir /
    pread_dir / asm_dir at definition time — confirm those globals exist
    where this file is imported.
    """
    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)
    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
    rawread_id_file = makePypeLocalFile(
        os.path.join(rawread_dir, "raw_read_ids"))

    @PypeTask(inputs={"rawread_db": rawread_db},
              outputs={"rawread_id_file": rawread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_rawread_ids")
    def dump_rawread_ids(self):
        # Dump one raw-read name per line (DBshow header lines, '>' stripped,
        # first whitespace-separated field kept).
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" %
                  (rawread_db, rawread_id_file))

    wf.addTask(dump_rawread_ids)

    pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
    pread_id_file = makePypeLocalFile(os.path.join(pread_dir, "pread_ids"))

    @PypeTask(inputs={"pread_db": pread_db},
              outputs={"pread_id_file": pread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_pread_ids")
    def dump_pread_ids(self):
        # Same as dump_rawread_ids, but for the pread database.
        pread_db = fn(self.pread_db)
        pread_id_file = fn(self.pread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" %
                  (pread_db, pread_id_file))

    wf.addTask(dump_pread_ids)

    # Collect existing .las alignment files, keyed by their block index
    # parsed from the filename ("raw_reads.<idx>.las").
    all_raw_las_files = {}
    for las_fn in glob.glob(os.path.join(rawread_dir, "raw_reads.*.las")):
        idx = las_fn.split("/")[
            -1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_raw_las_files["r_las_%s" % idx] = las_file

    all_pread_las_files = {}
    for las_fn in glob.glob(os.path.join(pread_dir, "preads.*.las")):
        idx = las_fn.split("/")[
            -1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_pread_las_files["p_las_%s" % idx] = las_file

    # Run the id-dumping tasks before wiring up the mapping tasks below.
    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, "sg_edges_list"))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, "utg_data"))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, "ctg_paths"))
    inputs = {
        "rawread_id_file": rawread_id_file,
        "pread_id_file": pread_id_file,
        "sg_edges_list": sg_edges_list,
        "utg_data": utg_data,
        "ctg_paths": ctg_paths
    }
    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, "read_to_contig_map"))

    @PypeTask(inputs=inputs,
              outputs={"read_to_contig_map": read_to_contig_map},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/get_ctg_read_map")
    def generate_read_to_ctg_map(self):
        # Walk the assembly graph and emit, for every pread node in every
        # non-"R" contig, the tuple (pread id, raw-read id, original id, ctg).
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        # Line number == numeric id in both lookup tables.
        pread_did_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")
        asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data),
                         fn(self.ctg_paths))
        pread_to_contigs = {}
        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    pid = int(n.split(":")[0])
                    # pread name encodes the raw-read id scaled by 10
                    # (assumed "<well>/<rid*10>/..." naming — TODO confirm).
                    rid = pread_did_to_rid[pid].split("/")[1]
                    rid = int(int(rid) / 10)
                    oid = rid_to_oid[rid]
                    k = (pid, rid, oid)
                    pread_to_contigs.setdefault(k, set())
                    pread_to_contigs[k].add(ctg)
            for k in pread_to_contigs:
                pid, rid, oid = k
                for ctg in list(pread_to_contigs[k]):
                    print >> f, "%09d %09d %s %s" % (pid, rid, oid, ctg)

    wf.addTask(generate_read_to_ctg_map)

    def dump_rawread_to_ctg(self):
        # Task body (bound to a PypeTask per .las file below): stream
        # LA4Falcon overlaps for one raw-read block and record, per query
        # read, which contigs it overlaps, ranked by accumulated score.
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        #pread_id_file = fn( self.pread_id_file )
        las_file = fn(self.las_file)
        rawread_to_contig_file = fn(self.rawread_to_contig_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        rid_to_oid = open(rawread_id_file).read().split("\n")
        #pread_did_to_rid = open(pread_id_file).read().split("\n")
        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        # raw-read id -> (original id, set of contigs containing it)
        rid_to_contigs = {}
        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                rid = int(rid)
                rid_to_contigs.setdefault(rid, (oid, set()))
                rid_to_contigs[rid][1].add(ctg)
        with open(rawread_to_contig_file, "w") as f:
            # ovlp_data: ctg -> [score, count, q_id, o_id, ctg, in_ctg];
            # flushed (sorted by score) each time the query read changes.
            ovlp_data = {}
            cur_read_id = None
            for row in sp.check_output(
                    shlex.split("LA4Falcon -m %s %s " %
                                (rawread_db, las_file))).splitlines():
                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id == None:
                        cur_read_id = q_id
                    else:
                        if len(ovlp_data) == 0:
                            # No contig hits for the previous read: emit an
                            # "NA" placeholder line.
                            o_id = rid_to_oid[cur_read_id]
                            print >> f, "%09d %s %s %d %d %d %d" % (
                                cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (
                                    q_id_, o_id, ctg, count, rank, score,
                                    in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id
                if q_id in rid_to_contigs and len(
                        ovlp_data) == 0:  #if the query is in some contig....
                    # Seed with in_ctg=1; negative row[7] (assumed overlap
                    # length — TODO confirm LA4Falcon -m column) so that a
                    # plain ascending sort puts the best hit first.
                    t_o_id, ctgs = rid_to_contigs[q_id]
                    o_id = rid_to_oid[q_id]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1
                if t_id not in rid_to_contigs:
                    continue
                # Credit every contig containing the target read.
                t_o_id, ctgs = rid_to_contigs[t_id]
                o_id = rid_to_oid[q_id]
                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1
            # Flush the final read's accumulated hits.
            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (
                        q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    def dump_pread_to_ctg(self):
        # Pread analogue of dump_rawread_to_ctg: streams "LA4Falcon -mo"
        # output and additionally skips further overlaps for a query that
        # is already placed in a contig (skip_rest flag).
        pread_db = fn(self.pread_db)
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        las_file = fn(self.las_file)
        pread_to_contig_file = fn(self.pread_to_contig_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        pid_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")
        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        # pread id -> (original id, set of contigs containing it)
        pid_to_contigs = {}
        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                pid = int(pid)
                pid_to_contigs.setdefault(pid, (oid, set()))
                pid_to_contigs[pid][1].add(ctg)
        with open(pread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            skip_rest = 0
            for row in sp.check_output(
                    shlex.split("LA4Falcon -mo %s %s " %
                                (pread_db, las_file))).splitlines():
                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id == None:
                        cur_read_id = q_id
                    else:
                        if len(ovlp_data) == 0:
                            # No contig hits: map the pread back to its raw
                            # read id (name field / 10, as above) for the
                            # "NA" placeholder line.
                            rid = pid_to_rid[cur_read_id].split("/")[1]
                            rid = int(int(rid) / 10)
                            o_id = rid_to_oid[rid]
                            print >> f, "%09d %s %s %d %d %d %d" % (
                                cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (
                                    q_id_, o_id, ctg, count, rank, score,
                                    in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id
                        skip_rest = 0
                if q_id in pid_to_contigs and len(
                        ovlp_data) == 0:  #if the query is in some contig....
                    t_o_id, ctgs = pid_to_contigs[q_id]
                    rid = pid_to_rid[q_id].split("/")[1]
                    rid = int(int(rid) / 10)
                    o_id = rid_to_oid[rid]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1
                    # Query already placed: ignore the rest of its overlaps.
                    skip_rest = 1
                if skip_rest == 1:
                    continue
                if t_id not in pid_to_contigs:
                    continue
                t_o_id, ctgs = pid_to_contigs[t_id]
                rid = pid_to_rid[q_id].split("/")[1]
                rid = int(int(rid) / 10)
                o_id = rid_to_oid[rid]
                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1
            # Flush the final read's accumulated hits.
            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (
                        q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    # One mapping task per raw-read .las block.
    for las_key, las_file in all_raw_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[
            -1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        rawread_to_contig_file = makePypeLocalFile(
            os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
        make_dump_rawread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "rawread_db": rawread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file
            },
            outputs={"rawread_to_contig_file": rawread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/r_read_to_contigs.%s" % idx)
        dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(
            dump_rawread_to_ctg)
        wf.addTask(dump_rawread_to_ctg_task)

    # One mapping task per pread .las block.
    for las_key, las_file in all_pread_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[
            -1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        pread_to_contig_file = makePypeLocalFile(
            os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
        make_dump_pread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "pread_db": pread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file
            },
            outputs={"pread_to_contig_file": pread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/pread_to_contigs.%s" % idx)
        dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)
        wf.addTask(dump_pread_to_ctg_task)

    wf.refreshTargets()  # block