def create_consensus_tasks(wd, db_prefix, config, p_ids_merge_job_done):
    consensus_tasks = []
    consensus_out = {}
    # Unlike the merge tasks, consensus occurs in a single directory.
    rdir = os.path.join(wd, 'preads')
    mkdir(rdir)
    for p_id, job_done in p_ids_merge_job_done:
        out_file = makePypeLocalFile(os.path.abspath("%s/preads/out.%05d.fasta" % (wd, p_id)))
        out_done = makePypeLocalFile(os.path.abspath("%s/preads/c_%05d_done" % (wd, p_id)))
        parameters = {"cwd": rdir,
                      "job_id": p_id,
                      "prefix": db_prefix,
                      "config": config}
        make_c_task = PypeTask(inputs={"job_done": job_done},
                               outputs={"out_file": out_file, "out_done": out_done},
                               parameters=parameters,
                               TaskType=MyFakePypeThreadTaskBase,
                               URL="task://localhost/ct_%05d" % p_id)
        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done
    return consensus_tasks, consensus_out
def simpleTest2():
    wf = PypeWorkflow()

    f1 = makePypeLocalFile("test.fa")
    f2 = makePypeLocalFile("ref.fa")
    f3 = makePypeLocalFile("aln.txt", readOnly=False)
    f4 = makePypeLocalFile("aln2.txt", readOnly=False)
    os.system("touch %s" % f1.localFileName)
    os.system("touch %s" % f2.localFileName)

    @PypeTask(inputDataObjs={"fasta": f1, "ref": f2},
              outputDataObjs={"aln": f3},
              parameters={"a": 10}, **{"b": 12})
    def testTask(*argv, **kwargv):
        print("testTask is running")
        for ft, f in testTask.outputDataObjs.iteritems():
            #os.system("touch %s" % f.localFileName)
            runShellCmd(["touch", "%s" % f.localFileName])
            runShellCmd(["sleep", "5"])

    @PypeTask(inputDataObjs={"fasta": f1, "aln": f3},
              outputDataObjs={"aln2": f4},
              parameters={"a": 10}, **{"b": 12})
    def testTask2(*argv, **kwargv):
        print("testTask2 is running")
        for ft, f in testTask2.outputDataObjs.iteritems():
            #os.system("touch %s" % f.localFileName)
            runShellCmd(["touch", "%s" % f.localFileName])

    #wf.addObjects([f1, f2, f3, f4])
    wf.addObjects([testTask, testTask2])
    wf.addTasks([testTask, testTask2])

    print(wf.RDFXML)
    print(wf.graphvizDot)

    aGraph = PypeGraph(wf._RDFGraph)  # was commented out, but tSort() below needs it
    print(aGraph.tSort())

    wf.refreshTargets([f4])

    print("re-touch f1")
    os.system("sleep 1;touch %s;" % f1.localFileName)
    wf.refreshTargets([f4])

    print("re-touch f3")
    os.system("sleep 1;touch %s;" % f3.localFileName)
def create_consensus_tasks(wd, db_prefix, config, p_ids_merge_job_done):
    consensus_tasks = []
    consensus_out = {}
    fasta_plfs = []
    for p_id, job_done in p_ids_merge_job_done:
        cns_label = 'cns_%05d' % p_id
        rdir = os.path.join(wd, 'preads', cns_label)
        mkdir(rdir)
        out_done = makePypeLocalFile(os.path.abspath("%s/%s_done" % (rdir, cns_label)))
        out_file = makePypeLocalFile(os.path.abspath("%s/%s.fasta" % (rdir, cns_label)))
        fasta_plfs.append(out_file)
        parameters = {"cwd": rdir,
                      "job_id": p_id,
                      "prefix": db_prefix,
                      "sge_option": config["sge_option_cns"],
                      "config": config}
        make_c_task = PypeTask(inputs={"job_done": job_done},
                               outputs={"out_file": out_file, "out_done": out_done},
                               parameters=parameters,
                               TaskType=MyFakePypeThreadTaskBase,
                               URL="task://localhost/%s" % cns_label)
        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done

    r_cns_done_plf = makePypeLocalFile(os.path.join(wd, 'preads', "cns_done"))
    pread_fofn_plf = makePypeLocalFile(os.path.join(wd, 'preads', "input_preads.fofn"))

    @PypeTask(inputs=consensus_out,
              outputs={"cns_done": r_cns_done_plf, "pread_fofn": pread_fofn_plf},
              TaskType=MyFakePypeThreadTaskBase,
              URL="task://localhost/cns_check")
    def check_r_cns_task(self):
        with open(fn(self.pread_fofn), "w") as f:
            for fa_fn in sorted(fn(plf) for plf in fasta_plfs):
                print >> f, fa_fn
        system("touch %s" % fn(self.cns_done))

    consensus_tasks.append(check_r_cns_task)
    return consensus_tasks, pread_fofn_plf
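# Usage sketch (illustrative, not from the original source): wire the
# consensus tasks into a workflow and run them; the returned FOFN feeds
# stage 1. `wf`, `exitOnFailure`, and the `_wire_consensus` name are
# assumptions of this sketch, not part of the codebase above.
def _wire_consensus(wf, wd, db_prefix, config, p_ids_merge_job_done, exitOnFailure=False):
    consensus_tasks, pread_fofn_plf = create_consensus_tasks(
        wd, db_prefix, config, p_ids_merge_job_done)
    wf.addTasks(consensus_tasks)
    wf.refreshTargets(exitOnFailure=exitOnFailure)
    return pread_fofn_plf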
def create_merge_gather_task(wd, inputs):
    las_fofn_plf = makePypeLocalFile(os.path.join(wd, 'las.fofn'))
    las_fopfn_plf = makePypeLocalFile(os.path.join(wd, 'las.fopfn'))

    make_task = PypeTask(
        inputs=inputs,  # p_ids_merged_las
        outputs={'las_fofn': las_fofn_plf,
                 'las_fopfn': las_fopfn_plf},
        TaskType=MyFakePypeThreadTaskBase,
    )  # URL = 'task://localhost/pmerge_gather')
    task = make_task(pype_tasks.task_merge_gather)
    return task, las_fofn_plf, las_fopfn_plf
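# Usage sketch (illustrative, not from the original source): the merge-gather
# task fans the merged .las outputs into a single FOFN and FOPFN. `wf`,
# `p_ids_merged_las`, and the helper name are assumptions of this sketch.
def _wire_merge_gather(wf, wd, p_ids_merged_las):
    task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(wd, p_ids_merged_las)
    wf.addTask(task)
    return las_fofn_plf, las_fopfn_plf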
def create_daligner_tasks(wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    job_id = 0
    tasks = []
    tasks_out = {}

    nblock = 1
    new_db = True
    if os.path.exists(fn(db_file)):
        with open(fn(db_file)) as f:
            for l in f:
                l = l.strip().split()
                if l[0] == "blocks" and l[1] == "=":
                    nblock = int(l[2])
                    new_db = False
                    break

    for pid in xrange(1, nblock + 1):
        support.make_dirs("%s/m_%05d" % (wd, pid))

    with open(os.path.join(wd, "run_jobs.sh")) as f:
        for l in f:
            l = l.strip()
            job_uid = hashlib.md5(l).hexdigest()
            job_uid = job_uid[:8]
            l = l.split()
            if l[0] == "daligner":
                support.make_dirs(os.path.join(wd, "./job_%s" % job_uid))
                call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (
                    wd, job_uid, db_prefix, db_prefix, db_prefix)
                rc = os.system(call)
                if rc:
                    raise Exception("Failure in system call: %r -> %d" % (call, rc))
                job_done = makePypeLocalFile(os.path.abspath("%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)))
                if pread_aln == True:
                    l[0] = "daligner_p"
                parameters = {"daligner_cmd": " ".join(l),
                              "cwd": os.path.join(wd, "job_%s" % job_uid),
                              "job_uid": job_uid,
                              "config": config,
                              "nblock": nblock,
                              "db_prefix": db_prefix}
                make_daligner_task = PypeTask(inputs={"rdb_build_done": rdb_build_done},
                                              outputs={"job_done": job_done},
                                              parameters=parameters,
                                              TaskType=PypeThreadTaskBase,
                                              URL="task://localhost/d_%s_%s" % (job_uid, db_prefix))
                daligner_task = make_daligner_task(run_daligner)
                tasks.append(daligner_task)
                tasks_out["ajob_%s" % job_uid] = job_done
                job_id += 1
    return tasks, tasks_out
def create_daligner_tasks(run_jobs_fn, wd, db_prefix, rdb_build_done, nblock, config, pread_aln=False):
    tasks = []
    tasks_out = {}
    skip_checks = config.get('skip_checks')
    fc_run_logger.info('Skip LAcheck after daligner? {}'.format(skip_checks))
    for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, rdb_build_done, nblock, pread_aln, skip_checks):
        run_dir = "job_%s" % job_uid
        cwd = os.path.join(wd, run_dir)
        job_done_fn = os.path.abspath(os.path.join(cwd, "job_%s_done" % job_uid))
        job_done = makePypeLocalFile(job_done_fn)
        parameters = {"daligner_script": script,
                      "cwd": cwd,
                      "job_uid": job_uid,
                      "config": config,
                      "sge_option": config["sge_option_da"],
                      "db_prefix": db_prefix}
        make_daligner_task = PypeTask(inputs={"rdb_build_done": rdb_build_done},
                                      outputs={"job_done": job_done},
                                      parameters=parameters,
                                      TaskType=MyFakePypeThreadTaskBase,
                                      URL="task://localhost/d_%s_%s" % (job_uid, db_prefix))
        daligner_task = make_daligner_task(task_run_daligner)
        tasks.append(daligner_task)
        tasks_out["ajob_%s" % job_uid] = job_done
    return tasks, tasks_out
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    try:
        config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
    except Exception:
        fc_run_logger.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise
    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    #Workflow = PypeProcWatcherWorkflow
    wf = PypeProcWatcherWorkflow(job_type=config['job_type'],
                                 job_queue=config['job_queue'],
                                 sge_option=config.get('sge_option', ''),
                                 watcher_type=config['pwatcher_type'],
                                 watcher_directory=config['pwatcher_directory'])
    run(wf, config,
        os.path.abspath(input_config_fn),
        input_fofn_plf=input_fofn_plf,
        setNumThreadAllowed=PypeProcWatcherWorkflow.setNumThreadAllowed)
def create_daligner_tasks(run_jobs_fn, wd, db_prefix, rdb_build_done, config, pread_aln=False):
    tasks = []
    tasks_out = {}
    for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, rdb_build_done, pread_aln):
        run_dir = "job_%s" % job_uid
        cwd = os.path.join(wd, run_dir)
        job_done_fn = os.path.abspath(os.path.join(cwd, "job_%s_done" % job_uid))
        job_done = makePypeLocalFile(job_done_fn)
        parameters = {"daligner_script": script,
                      "cwd": cwd,
                      "job_uid": job_uid,
                      "config": config,
                      "db_prefix": db_prefix}
        make_daligner_task = PypeTask(inputs={"rdb_build_done": rdb_build_done},
                                      outputs={"job_done": job_done},
                                      parameters=parameters,
                                      TaskType=MyFakePypeThreadTaskBase,
                                      URL="task://localhost/d_%s_%s" % (job_uid, db_prefix))
        daligner_task = make_daligner_task(task_run_daligner)
        tasks.append(daligner_task)
        tasks_out["ajob_%s" % job_uid] = job_done
    return tasks, tasks_out
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
    merge_tasks = []
    merge_out = {}
    p_ids_merge_job_done = []  # for consensus

    merge_scripts = bash.scripts_merge(config, db_prefix, run_jobs_fn)
    for p_id, merge_script in merge_scripts:
        job_done = makePypeLocalFile(os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)))
        parameters = {"merge_script": merge_script,
                      "cwd": os.path.join(wd, "m_%05d" % p_id),
                      "job_id": p_id,
                      "config": config}
        make_merge_task = PypeTask(inputs={"input_dep": input_dep},
                                   outputs={"job_done": job_done},
                                   parameters=parameters,
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/m_%05d_%s" % (p_id, db_prefix))
        merge_task = make_merge_task(task_run_las_merge)
        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)
        p_ids_merge_job_done.append((p_id, job_done))
    return merge_tasks, merge_out, p_ids_merge_job_done
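# Usage sketch (illustrative, not from the original source): the merge tasks
# fan out one job per block; `p_ids_merge_job_done` is then handed to
# create_consensus_tasks, as in the main1 drivers below. `wf` and the helper
# name are assumptions of this sketch.
def _wire_merge(wf, run_jobs_fn, wd, db_prefix, r_da_done, config):
    merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(
        run_jobs_fn, wd, db_prefix, r_da_done, config)
    wf.addTasks(merge_tasks)
    return merge_out, p_ids_merge_job_done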
def create_daligner_tasks(wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    job_id = 0
    tasks = []
    tasks_out = {}
    with open(os.path.join(wd, "run_jobs.sh")) as f:
        for l in f:
            l = l.strip().split()
            if l[0] == "daligner":
                try:
                    os.makedirs(os.path.join(wd, "./job_%05d" % job_id))
                except OSError:
                    pass
                os.system("cd %s/job_%05d;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." %
                          (wd, job_id, db_prefix, db_prefix, db_prefix))
                job_done = makePypeLocalFile(os.path.abspath("%s/job_%05d/job_%05d_done" % (wd, job_id, job_id)))
                if pread_aln == True:
                    l[0] = "daligner_p"
                parameters = {"daligner_cmd": " ".join(l),
                              "cwd": os.path.join(wd, "job_%05d" % job_id),
                              "job_id": job_id,
                              "config": config}
                make_daligner_task = PypeTask(inputs = {"rdb_build_done": rdb_build_done},
                                              outputs = {"job_done": job_done},
                                              parameters = parameters,
                                              TaskType = PypeThreadTaskBase,
                                              URL = "task://localhost/d_%05d_%s" % (job_id, db_prefix))
                daligner_task = make_daligner_task(run_daligner)
                tasks.append(daligner_task)
                tasks_out["ajob_%d" % job_id] = job_done
                job_id += 1
    return tasks, tasks_out
def create_daligner_tasks(wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    import hashlib
    job_id = 0
    tasks = []
    tasks_out = {}

    nblock = 1
    new_db = True
    if os.path.exists(os.path.join(wd, "%s.db" % db_prefix)):
        with open(os.path.join(wd, "%s.db" % db_prefix)) as f:
            for l in f:
                l = l.strip().split()
                if l[0] == "blocks" and l[1] == "=":
                    nblock = int(l[2])
                    new_db = False
                    break

    for pid in xrange(1, nblock + 1):
        try:
            os.makedirs("%s/m_%05d" % (wd, pid))
        except OSError:
            pass

    with open(os.path.join(wd, "run_jobs.sh")) as f:
        for l in f:
            l = l.strip()
            job_uid = hashlib.md5(l).hexdigest()
            job_uid = job_uid[:8]
            l = l.split()
            if l[0] == "daligner":
                try:
                    os.makedirs(os.path.join(wd, "./job_%s" % job_uid))
                except OSError:
                    pass
                os.system("cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." %
                          (wd, job_uid, db_prefix, db_prefix, db_prefix))
                job_done = makePypeLocalFile(os.path.abspath("%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)))
                if pread_aln == True:
                    l[0] = "daligner_p"
                parameters = {"daligner_cmd": " ".join(l),
                              "cwd": os.path.join(wd, "job_%s" % job_uid),
                              "job_uid": job_uid,
                              "config": config,
                              "nblock": nblock,
                              "db_prefix": db_prefix}
                make_daligner_task = PypeTask(inputs = {"rdb_build_done": rdb_build_done},
                                              outputs = {"job_done": job_done},
                                              parameters = parameters,
                                              TaskType = PypeThreadTaskBase,
                                              URL = "task://localhost/d_%s_%s" % (job_uid, db_prefix))
                daligner_task = make_daligner_task(run_daligner)
                tasks.append(daligner_task)
                tasks_out["ajob_%s" % job_uid] = job_done
                job_id += 1
    return tasks, tasks_out
def main():
    lfn = 'logging-cfg.json'
    if os.path.exists(lfn):
        logging.config.dictConfig(json.load(open(lfn)))
    else:
        logging.basicConfig()
        logging.getLogger().setLevel(logging.NOTSET)
    try:
        import logging_tree
        logging_tree.printout()
    except ImportError:
        pass
    log.debug('DEBUG LOGGING ON')
    log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format(JOB_TYPE, SLEEP_S))
    exitOnFailure = False
    concurrent_jobs = 2
    #Workflow = pypeflow.controller.PypeThreadWorkflow
    Workflow = PypeProcWatcherWorkflow
    Workflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = Workflow(job_type=JOB_TYPE)
    par = dict(sleep_s=SLEEP_S)
    DIR = 'mytmp'
    makedirs(DIR)
    f0 = makePypeLocalFile('mytmp/f0')
    f1 = makePypeLocalFile('mytmp/f1')
    make_task = PypeTask(
        #inputs = {'f': f},
        outputs = {'f0': f0},
        parameters = par,
        TaskType = MyFakePypeThreadTaskBase)
    task = make_task(taskrun0)
    wf.addTasks([task])
    make_task = PypeTask(
        inputs = {'f0': f0},
        outputs = {'f1': f1},
        parameters = par,
        TaskType = MyFakePypeThreadTaskBase)
    task = make_task(taskrun1)
    wf.addTasks([task])
    wf.refreshTargets([task])
def create_consensus_gather_task(wd, inputs):
    # Happens only in stage-0.
    preads_fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))

    make_cns_gather_task = PypeTask(
        inputs=inputs,  # consensus_out
        outputs={'preads_fofn': preads_fofn_plf},
        TaskType=MyFakePypeThreadTaskBase,
        URL='task://localhost/cns_gather')
    task = make_cns_gather_task(pype_tasks.task_cns_gather)
    return task, preads_fofn_plf
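# Usage sketch (illustrative, not from the original source): gather the
# per-block consensus outputs (a `consensus_out` dict as built above) into a
# single FOFN for stage 1. `wf` and the helper name are assumptions.
def _wire_cns_gather(wf, wd, consensus_out):
    task, preads_fofn_plf = create_consensus_gather_task(wd, consensus_out)
    wf.addTask(task)
    return preads_fofn_plf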
def create_consensus_tasks(wd, db_prefix, config, p_ids_merge_job_done):
    consensus_tasks = []
    consensus_out = {}
    # Unlike the merge tasks, consensus occurs in a single directory.
    rdir = os.path.join(wd, 'preads')
    mkdir(rdir)
    for p_id, job_done in p_ids_merge_job_done:
        out_file = makePypeLocalFile(os.path.abspath("%s/preads/out.%05d.fasta" % (wd, p_id)))
        out_done = makePypeLocalFile(os.path.abspath("%s/preads/c_%05d_done" % (wd, p_id)))
        parameters = {"cwd": rdir,
                      "job_id": p_id,
                      "prefix": db_prefix,
                      "config": config}
        make_c_task = PypeTask(inputs = {"job_done": job_done},
                               outputs = {"out_file": out_file, "out_done": out_done},
                               parameters = parameters,
                               TaskType = PypeThreadTaskBase,
                               URL = "task://localhost/ct_%05d" % p_id)
        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done
    return consensus_tasks, consensus_out
def create_daligner_tasks(run_jobs_fn, wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    job_id = 0
    tasks = []
    tasks_out = {}

    nblock = get_nblock(fn(db_file))
    xform_script = get_script_xformer(pread_aln)

    line_count = 0
    job_descs = get_daligner_job_descriptions(open(run_jobs_fn), db_prefix)
    for desc, bash in job_descs.iteritems():
        #job_uid = hashlib.md5(bash).hexdigest()
        #job_uid = job_uid[:8]
        job_uid = '%08d' % line_count
        line_count += 1
        support.make_dirs(os.path.join(wd, "./job_%s" % job_uid))
        call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (
            wd, job_uid, db_prefix, db_prefix, db_prefix)
        rc = system(call)
        if rc:
            raise Exception("Failure in system call: %r -> %d" % (call, rc))
        job_done = makePypeLocalFile(os.path.abspath("%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)))
        bash = xform_script(bash)
        parameters = {"daligner_cmd": bash,
                      "cwd": os.path.join(wd, "job_%s" % job_uid),
                      "job_uid": job_uid,
                      "config": config,
                      "nblock": nblock,
                      "db_prefix": db_prefix}
        make_daligner_task = PypeTask(inputs={"rdb_build_done": rdb_build_done},
                                      outputs={"job_done": job_done},
                                      parameters=parameters,
                                      TaskType=PypeThreadTaskBase,
                                      URL="task://localhost/d_%s_%s" % (job_uid, db_prefix))
        daligner_task = make_daligner_task(task_run_daligner)
        tasks.append(daligner_task)
        tasks_out["ajob_%s" % job_uid] = job_done
        job_id += 1
    return tasks, tasks_out
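# Usage sketch (illustrative, not from the original source): fan out the
# daligner jobs, then gather their "done" files behind a single check task,
# mirroring the rda_check/pda_check pattern in the main1 drivers below. `wf`
# and the helper name are assumptions of this sketch.
def _wire_daligner(wf, run_jobs_fn, wd, db_prefix, db_file, rdb_build_done, config):
    daligner_tasks, daligner_out = create_daligner_tasks(
        run_jobs_fn, wd, db_prefix, db_file, rdb_build_done, config)
    wf.addTasks(daligner_tasks)
    return daligner_out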
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    try:
        config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
    except Exception:
        fc_run_logger.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise
    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    #Workflow = PypeProcWatcherWorkflow
    wf = PypeProcWatcherWorkflow(job_type=config['job_type'])
    run(wf, config,
        input_fofn_plf=input_fofn_plf,
        setNumThreadAllowed=PypeProcWatcherWorkflow.setNumThreadAllowed)
def create_daligner_tasks(run_jobs_fn, wd, db_prefix, rdb_build_done, config, pread_aln=False):
    tasks = []
    tasks_out = {}
    for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, rdb_build_done, pread_aln):
        run_dir = "job_%s" % job_uid
        cwd = os.path.join(wd, run_dir)
        job_done_fn = os.path.abspath(os.path.join(cwd, "job_%s_done" % job_uid))
        job_done = makePypeLocalFile(job_done_fn)
        parameters = {"daligner_script": script,
                      "cwd": cwd,
                      "job_uid": job_uid,
                      "config": config,
                      "db_prefix": db_prefix}
        make_daligner_task = PypeTask(inputs = {"rdb_build_done": rdb_build_done},
                                      outputs = {"job_done": job_done},
                                      parameters = parameters,
                                      TaskType = PypeThreadTaskBase,
                                      URL = "task://localhost/d_%s_%s" % (job_uid, db_prefix))
        daligner_task = make_daligner_task(task_run_daligner)
        tasks.append(daligner_task)
        tasks_out["ajob_%s" % job_uid] = job_done
    return tasks, tasks_out
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
    merge_tasks = []
    merge_out = {}
    p_ids_merge_job_done = []  # for consensus

    merge_scripts = bash.scripts_merge(config, db_prefix, run_jobs_fn)
    for p_id, merge_script in merge_scripts:
        job_done = makePypeLocalFile(os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)))
        parameters = {"merge_script": merge_script,
                      "cwd": os.path.join(wd, "m_%05d" % p_id),
                      "job_id": p_id,
                      "config": config}
        make_merge_task = PypeTask(inputs = {"input_dep": input_dep},
                                   outputs = {"job_done": job_done},
                                   parameters = parameters,
                                   TaskType = PypeThreadTaskBase,
                                   URL = "task://localhost/m_%05d_%s" % (p_id, db_prefix))
        merge_task = make_merge_task(task_run_las_merge)
        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)
        p_ids_merge_job_done.append((p_id, job_done))
    return merge_tasks, merge_out, p_ids_merge_job_done
def main1(prog_name, input_config_fn, logger_config_fn=None):
    setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = get_config(parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        make_dirs(d)

    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    input_fofn_plf = makePypeLocalFile(os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(os.path.join(rawread_dir, "rdb_build_done"))
        parameters = {"work_dir": rawread_dir,
                      "config": config}
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={"rdb_build_done": rdb_build_done},
                                       parameters=parameters,
                                       TaskType=PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(rawread_dir, "raw_reads", db_file, rdb_build_done, config)

        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs

        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        @PypeTask(inputs=daligner_out,
                  outputs={"da_done": r_da_done},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/rda_check")
        def check_r_da_task(self):
            os.system("touch %s" % fn(self.da_done))

        wf.addTask(check_r_da_task)
        # larger number better for more jobs; need to call to run jobs here,
        # or the # of concurrency is changed
        wf.refreshTargets(updateFreq=wait_time)

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        if config["target"] == "overlapping":
            # larger number better for more jobs; need to call to run jobs here,
            # or the # of concurrency is changed
            wf.refreshTargets(updateFreq=wait_time)
            sys.exit(0)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={"cns_done": r_cns_done, "pread_fofn": pread_fofn},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            os.system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq=wait_time)  # larger number better for more jobs

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir,
                  "config": config}
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={"pdb_build_done": pdb_build_done},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads"))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(pread_dir, "preads", db_file, pdb_build_done, config, pread_aln=True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    @PypeTask(inputs=daligner_out,
              outputs={"da_done": p_da_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pda_check")
    def check_p_da_task(self):
        os.system("touch %s" % fn(self.da_done))

    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    #wf.refreshTargets(updateFreq = 30) #all

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        os.system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq=wait_time)  #all

    falcon_asm_done = makePypeLocalFile(os.path.join(falcon_asm_dir, "falcon_asm_done"))

    @PypeTask(inputs={"p_merge_done": p_merge_done, "db_file": db_file},
              outputs={"falcon_asm_done": falcon_asm_done},
              parameters={"wd": falcon_asm_dir,
                          "config": config,
                          "pread_dir": pread_dir},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/falcon")
    def run_falcon_asm_task(self):
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join(wd)
        script_fn = os.path.join(script_dir, "run_falcon_asm.sh")

        script = []
        script.append("set -vex")
        script.append("trap 'touch %s.exit' EXIT" % fn(self.falcon_asm_done))
        script.append("source {install_prefix}/bin/activate".format(install_prefix=install_prefix))
        script.append("cd %s" % pread_dir)
        # Write preads4falcon.fasta, in 1-preads_ovl:
        script.append("DB2Falcon -U preads")
        script.append("cd %s" % wd)
        script.append("""find %s/las_files -name "*.las" > las.fofn """ % pread_dir)
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        script.append("""fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %
                      (fn(db_file), overlap_filtering_setting, length_cutoff_pr))
        script.append("ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append("""fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr)  # TODO: drop this logfile
        # Write 'p_ctg.fa' and 'a_ctg.fa':
        script.append("""fc_graph_to_contig.py""")
        script.append("""touch %s""" % fn(self.falcon_asm_done))

        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))
        job_name = self.URL.split("/")[-1]
        job_name += "-" + str(uuid.uuid4())[:8]
        job_data = {"job_name": job_name,
                    "cwd": wd,
                    "sge_option": config["sge_option_fc"],
                    "script_fn": script_fn}
        run_script(job_data, job_type=config["job_type"])
        wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_name)

    wf.addTask(run_falcon_asm_task)
    wf.refreshTargets(updateFreq=wait_time)  #all
def generate_read_to_contig_map(rawread_dir=rawread_dir, pread_dir=pread_dir, asm_dir=asm_dir):
    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)

    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
    rawread_id_file = makePypeLocalFile(os.path.join(rawread_dir, "raw_read_ids"))

    @PypeTask(inputs={"rawread_db": rawread_db},
              outputs={"rawread_id_file": rawread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_rawread_ids")
    def dump_rawread_ids(self):
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file))

    wf.addTask(dump_rawread_ids)

    pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
    pread_id_file = makePypeLocalFile(os.path.join(pread_dir, "pread_ids"))

    @PypeTask(inputs={"pread_db": pread_db},
              outputs={"pread_id_file": pread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_pread_ids")
    def dump_pread_ids(self):
        pread_db = fn(self.pread_db)
        pread_id_file = fn(self.pread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file))

    wf.addTask(dump_pread_ids)

    all_raw_las_files = {}
    for las_fn in glob.glob(os.path.join(rawread_dir, "raw_reads.*.las")):
        idx = las_fn.split("/")[-1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_raw_las_files["r_las_%s" % idx] = las_file

    all_pread_las_files = {}
    for las_fn in glob.glob(os.path.join(pread_dir, "preads.*.las")):
        idx = las_fn.split("/")[-1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_pread_las_files["p_las_%s" % idx] = las_file

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, "sg_edges_list"))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, "utg_data"))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, "ctg_paths"))

    inputs = {"rawread_id_file": rawread_id_file,
              "pread_id_file": pread_id_file,
              "sg_edges_list": sg_edges_list,
              "utg_data": utg_data,
              "ctg_paths": ctg_paths}

    read_to_contig_map = makePypeLocalFile(os.path.join(read_map_dir, "read_to_contig_map"))

    @PypeTask(inputs=inputs,
              outputs={"read_to_contig_map": read_to_contig_map},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/get_ctg_read_map")
    def generate_read_to_ctg_map(self):
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        pread_did_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        asm_G = AsmGraph(fn(self.sg_edges_list),
                         fn(self.utg_data),
                         fn(self.ctg_paths))

        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    pid = int(n.split(":")[0])
                    rid = pread_did_to_rid[pid].split("/")[1]
                    rid = int(int(rid) / 10)
                    oid = rid_to_oid[rid]
                    k = (pid, rid, oid)
                    pread_to_contigs.setdefault(k, set())
                    pread_to_contigs[k].add(ctg)

            for k in pread_to_contigs:
                pid, rid, oid = k
                for ctg in list(pread_to_contigs[k]):
                    print >> f, "%09d %09d %s %s" % (pid, rid, oid, ctg)

    wf.addTask(generate_read_to_ctg_map)

    def dump_rawread_to_ctg(self):
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        #pread_id_file = fn(self.pread_id_file)
        las_file = fn(self.las_file)
        rawread_to_contig_file = fn(self.rawread_to_contig_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        rid_to_oid = open(rawread_id_file).read().split("\n")
        #pread_did_to_rid = open(pread_id_file).read().split("\n")

        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        rid_to_contigs = {}

        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                rid = int(rid)
                rid_to_contigs.setdefault(rid, (oid, set()))
                rid_to_contigs[rid][1].add(ctg)

        with open(rawread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            for row in sp.check_output(shlex.split("LA4Falcon -m %s %s " % (rawread_db, las_file))).splitlines():
                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id == None:
                        cur_read_id = q_id
                    else:
                        if len(ovlp_data) == 0:
                            o_id = rid_to_oid[cur_read_id]
                            print >> f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id

                if q_id in rid_to_contigs and len(ovlp_data) == 0:  # if the query is in some contig....
                    t_o_id, ctgs = rid_to_contigs[q_id]
                    o_id = rid_to_oid[q_id]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1

                if t_id not in rid_to_contigs:
                    continue

                t_o_id, ctgs = rid_to_contigs[t_id]
                o_id = rid_to_oid[q_id]
                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1

            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    def dump_pread_to_ctg(self):
        pread_db = fn(self.pread_db)
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        las_file = fn(self.las_file)
        pread_to_contig_file = fn(self.pread_to_contig_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        pid_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        pid_to_contigs = {}

        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                pid = int(pid)
                pid_to_contigs.setdefault(pid, (oid, set()))
                pid_to_contigs[pid][1].add(ctg)

        with open(pread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            skip_rest = 0
            for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file))).splitlines():
                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id == None:
                        cur_read_id = q_id
                    else:
                        if len(ovlp_data) == 0:
                            rid = pid_to_rid[cur_read_id].split("/")[1]
                            rid = int(int(rid) / 10)
                            o_id = rid_to_oid[rid]
                            print >> f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id
                        skip_rest = 0

                if q_id in pid_to_contigs and len(ovlp_data) == 0:  # if the query is in some contig....
                    t_o_id, ctgs = pid_to_contigs[q_id]
                    rid = pid_to_rid[q_id].split("/")[1]
                    rid = int(int(rid) / 10)
                    o_id = rid_to_oid[rid]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1
                    skip_rest = 1

                if skip_rest == 1:
                    continue

                if t_id not in pid_to_contigs:
                    continue

                t_o_id, ctgs = pid_to_contigs[t_id]
                rid = pid_to_rid[q_id].split("/")[1]
                rid = int(int(rid) / 10)
                o_id = rid_to_oid[rid]
                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1

            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    for las_key, las_file in all_raw_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[-1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
        make_dump_rawread_to_ctg = PypeTask(inputs={"las_file": las_file,
                                                    "rawread_db": rawread_db,
                                                    "read_to_contig_map": read_to_contig_map,
                                                    "rawread_id_file": rawread_id_file,
                                                    "pread_id_file": pread_id_file},
                                            outputs={"rawread_to_contig_file": rawread_to_contig_file},
                                            TaskType=PypeThreadTaskBase,
                                            URL="task://localhost/r_read_to_contigs.%s" % idx)
        dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg)
        wf.addTask(dump_rawread_to_ctg_task)

    for las_key, las_file in all_pread_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[-1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
        make_dump_pread_to_ctg = PypeTask(inputs={"las_file": las_file,
                                                  "pread_db": pread_db,
                                                  "read_to_contig_map": read_to_contig_map,
                                                  "rawread_id_file": rawread_id_file,
                                                  "pread_id_file": pread_id_file},
                                          outputs={"pread_to_contig_file": pread_to_contig_file},
                                          TaskType=PypeThreadTaskBase,
                                          URL="task://localhost/pread_to_contigs.%s" % idx)
        dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)
        wf.addTask(dump_pread_to_ctg_task)

    wf.refreshTargets()  # block
sge_log_dir = os.path.abspath("./sge_log")

for d in (dist_map_dir, fasta_dir, pa_dir, script_dir, celera_asm_dir, sge_log_dir):
    try:
        os.makedirs(d)
    except:
        pass

config = get_config(sys.argv[1])
concurrent_jobs = config["concurrent_jobs"]
PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
wf = PypeThreadWorkflow()

#### Task to convert bas.h5 and bax.h5 to fasta files; it generates two fofn files, for the queries and the targets
input_h5_fofn = makePypeLocalFile(os.path.abspath(config["input_fofn_fn"]))
query_fa_fofn = makePypeLocalFile(os.path.join(fasta_dir, "queries.fofn"))
target_fa_fofn = makePypeLocalFile(os.path.join(fasta_dir, "targets.fofn"))
fasta_dump_done = makePypeLocalFile(os.path.abspath(os.path.join(fasta_dir, "fasta_dump_done")))
parameters = {"fasta_dir": fasta_dir,
              "min_length": config["length_cutoff"],
              "min_read_score": config["RQ_threshold"]}

@PypeTask(inputs = {"input_fofn": input_h5_fofn},
          outputs = {"fasta_dump_done": fasta_dump_done,
                     "target_fa_fofn": target_fa_fofn,
                     "query_fa_fofn": query_fa_fofn},
          parameters = parameters,
          TaskType = PypeThreadTaskBase)
def h5fofn_to_fasta(self):
    os.system("h5fofn_to_fasta.py %s %s --min_length 500 --min_seed_length %d --min_read_score %f" %\
def main(argv=sys.argv):
    global fc_run_logger
    fc_run_logger = support.setup_logger(None)

    if len(sys.argv) < 2:
        print "you need to provide a configuration file to specify the cluster running environment"
        sys.exit(1)

    config_fn = sys.argv[1]
    config = ConfigParser.ConfigParser()
    config.read(config_fn)

    job_type = "SGE"
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    sge_track_reads = " -pe smp 12 -q bigmem"
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = " -pe smp 24 -q bigmem "
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = "/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/"
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = "input_bam.fofn"
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')

    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs')

    config = {"job_type": job_type,
              "sge_quiver": sge_quiver,
              "sge_track_reads": sge_track_reads,
              "input_bam_fofn": input_bam_fofn,
              "smrt_bin": smrt_bin}

    support.job_type = "SGE"  # tmp hack until we have a configuration parser

    ctg_ids = []

    PypeThreadWorkflow.setNumThreadAllowed(quiver_concurrent_jobs, quiver_concurrent_jobs)
    wf = PypeThreadWorkflow()

    parameters = {"wd": os.path.abspath("."), "config": config}
    hasm_done = makePypeLocalFile("./3-unzip/1-hasm/hasm_done")
    job_done = makePypeLocalFile(os.path.join(parameters["wd"], "track_reads_h_done"))
    make_track_reads_task = PypeTask(inputs={"hasm_done": hasm_done},
                                     outputs={"job_done": job_done},
                                     parameters=parameters,
                                     TaskType=PypeThreadTaskBase,
                                     URL="task://localhost/track_reads_h")
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)
    wf.refreshTargets()  # force refresh now, will put proper dependence later

    ref_seq_data = {}
    p_ctg_fa = FastaReader("./3-unzip/all_p_ctg.fa")
    ctg_types = {}
    for r in p_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "p"

    h_ctg_fa = FastaReader("./3-unzip/all_h_ctg.fa")
    for r in h_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "h"

    ctg_ids = sorted(ref_seq_data.keys())
    p_ctg_out = []
    h_ctg_out = []
    for ctg_id in ctg_ids:
        sequence = ref_seq_data[ctg_id]
        m_ctg_id = ctg_id.split("-")[0]
        wd = os.path.join(os.getcwd(), "./4-quiver/", m_ctg_id)
        mkdir(wd)
        ref_fasta = makePypeLocalFile(os.path.join(wd, "{ctg_id}_ref.fa".format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(os.path.join(os.getcwd(), "./4-quiver/reads/{ctg_id}.sam".format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(os.path.join(wd, "cns-{ctg_id}.fasta.gz".format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(os.path.join(wd, "cns-{ctg_id}.fastq.gz".format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(os.path.join(wd, "{ctg_id}_quiver_done".format(ctg_id=ctg_id)))

        if os.path.exists(fn(read_sam)):
            if ctg_types[ctg_id] == "p":
                p_ctg_out.append((cns_fasta, cns_fastq))
            if ctg_types[ctg_id] == "h":
                h_ctg_out.append((cns_fasta, cns_fastq))
            if not os.path.exists(fn(ref_fasta)):
                with open(fn(ref_fasta), "w") as f:
                    print >> f, ">" + ctg_id
                    print >> f, sequence
            parameters = {"job_uid": "q-" + ctg_id,
                          "wd": wd,
                          "config": config,
                          "ctg_id": ctg_id}
            make_quiver_task = PypeTask(inputs={"ref_fasta": ref_fasta,
                                                "read_sam": read_sam},
                                        outputs={"cns_fasta": cns_fasta,
                                                 "cns_fastq": cns_fastq,
                                                 "job_done": job_done},
                                        parameters=parameters,
                                        TaskType=PypeThreadTaskBase,
                                        URL="task://localhost/q_{ctg_id}".format(ctg_id=ctg_id))
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)

    wf.refreshTargets()
    os.system("sleep 30")

    mkdir("./4-quiver/cns_output")
    os.system("rm ./4-quiver/cns_output/cns_p_ctg.fasta")
    os.system("rm ./4-quiver/cns_output/cns_p_ctg.fastq")
    for cns_fasta, cns_fastq in sorted(p_ctg_out):
        os.system("zcat {cns_fasta} >> ./4-quiver/cns_output/cns_p_ctg.fasta".format(cns_fasta=fn(cns_fasta)))
        os.system("zcat {cns_fastq} >> ./4-quiver/cns_output/cns_p_ctg.fastq".format(cns_fastq=fn(cns_fastq)))

    os.system("rm ./4-quiver/cns_output/cns_h_ctg.fasta")
    os.system("rm ./4-quiver/cns_output/cns_h_ctg.fastq")
    for cns_fasta, cns_fastq in sorted(h_ctg_out):
        os.system("zcat {cns_fasta} >> ./4-quiver/cns_output/cns_h_ctg.fasta".format(cns_fasta=fn(cns_fasta)))
        os.system("zcat {cns_fastq} >> ./4-quiver/cns_output/cns_h_ctg.fastq".format(cns_fastq=fn(cns_fastq)))
        pass
    try:
        os.makedirs("%s/las_files" % (wd))
    except OSError:
        pass

    with open("%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id), "w") as merge_script:
        #print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix)
        for l in s_data:
            print >> merge_script, l
        print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id)
        print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id)

    merge_script_file = os.path.abspath("%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id))
    job_done = makePypeLocalFile(os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)))
    parameters = {"merge_script": merge_script_file,
                  "cwd": os.path.join(wd, "m_%05d" % p_id),
                  "job_id": p_id,
                  "config": config}
    make_merge_task = PypeTask(inputs = {"input_dep": input_dep},
                               outputs = {"job_done": job_done},
                               parameters = parameters,
                               TaskType = PypeThreadTaskBase,
                               URL = "task://localhost/m_%05d_%s" % (p_id, db_prefix))
    merge_task = make_merge_task(run_merge_task)
    merge_out["mjob_%d" % p_id] = job_done
    merge_tasks.append(merge_task)
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config['stop_all_jobs_on_failure']  # only matters for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    Workflow = PypeProcWatcherWorkflow
    PypeProcWatcherWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeProcWatcherWorkflow(job_type=config['job_type'])

    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=MyFakePypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {"work_dir": rawread_dir,
                      "config": config}

        raw_reads_db = makePypeLocalFile(os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={"rdb_build_done": rdb_build_done,
                                                "raw_reads_db": raw_reads_db,
                                                "run_jobs": run_jobs},
                                       parameters=parameters,
                                       TaskType=MyFakePypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config)

        wf.addTasks(daligner_tasks)

        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))
        parameters = {"nblock": raw_reads_nblock}
        make_daligner_gather = PypeTask(inputs=daligner_out,
                                        outputs={"da_done": r_da_done},
                                        parameters=parameters,
                                        TaskType=MyFakePypeThreadTaskBase,
                                        URL="task://localhost/rda_check")
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "overlapping":
            sys.exit(0)

        consensus_tasks, consensus_out = create_consensus_tasks(rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={"cns_done": r_cns_done, "pread_fofn": pread_fofn},
                  TaskType=MyFakePypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeProcWatcherWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=MyFakePypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir,
                  "config": config}

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={"pdb_build_done": pdb_build_done,
                                            "preads_db": preads_db,
                                            "run_jobs": run_jobs},
                                   parameters=parameters,
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", pdb_build_done, config, pread_aln=True)
    wf.addTasks(daligner_tasks)

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))
    parameters = {"nblock": preads_nblock}
    make_daligner_gather = PypeTask(inputs=daligner_out,
                                    outputs={"da_done": p_da_done},
                                    parameters=parameters,
                                    TaskType=MyFakePypeThreadTaskBase,
                                    URL="task://localhost/pda_check")
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=MyFakePypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)

    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeProcWatcherWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    db2falcon_done = makePypeLocalFile(os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(inputs={"p_merge_done": p_merge_done},
                                  outputs={"db2falcon_done": db2falcon_done},
                                  parameters={"wd": pread_dir,
                                              "config": config},
                                  TaskType=MyFakePypeThreadTaskBase,
                                  URL="task://localhost/db2falcon")
    wf.addTask(make_run_db2falcon(task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile(os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(inputs={"db2falcon_done": db2falcon_done, "db_file": preads_db},
                                   outputs={"falcon_asm_done": falcon_asm_done},
                                   parameters={"wd": falcon_asm_dir,
                                               "config": config,
                                               "pread_dir": pread_dir},
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/falcon")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()
def main1(prog_name, input_config_fn, logger_config_fn=None): global fc_run_logger fc_run_logger = support.setup_logger(logger_config_fn) fc_run_logger.info("fc_run started with configuration %s", input_config_fn) config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn)) rawread_dir = os.path.abspath("./0-rawreads") pread_dir = os.path.abspath("./1-preads_ovl") falcon_asm_dir = os.path.abspath("./2-asm-falcon") script_dir = os.path.abspath("./scripts") sge_log_dir = os.path.abspath("./sge_log") for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir): support.make_dirs(d) exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs concurrent_jobs = config["pa_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf = PypeThreadWorkflow() input_fofn_plf = makePypeLocalFile(config["input_fofn"]) rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn"]))) make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf}, outputs = {"o_fofn": rawread_fofn_plf}, parameters = {}, TaskType = PypeThreadTaskBase) fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) if config["input_type"] == "raw": #### import sequences into daligner DB sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") ) rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) run_jobs = makePypeLocalFile( os.path.join( rawread_dir, "run_jobs.sh") ) parameters = {"work_dir": rawread_dir, "config": config} raw_reads_db = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" )) make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf}, outputs = {"rdb_build_done": rdb_build_done, "raw_reads.db": raw_reads_db, "run_jobs": run_jobs}, parameters = parameters, TaskType = PypeThreadTaskBase) build_rdb_task = make_build_rdb_task(task_build_rdb) wf.addTasks([build_rdb_task]) wf.refreshTargets([rdb_build_done]) raw_reads_nblock = support.get_nblock(fn(raw_reads_db)) #### run daligner daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config) wf.addTasks(daligner_tasks) r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") ) parameters = { "nblock": raw_reads_nblock, } make_daligner_gather = PypeTask( inputs = daligner_out, outputs = {"da_done":r_da_done}, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/rda_check" ) check_r_da_task = make_daligner_gather(task_daligner_gather) wf.addTask(check_r_da_task) wf.refreshTargets(exitOnFailure=exitOnFailure) merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config) wf.addTasks( merge_tasks ) wf.refreshTargets(exitOnFailure=exitOnFailure) if config["target"] == "overlapping": sys.exit(0) consensus_tasks, consensus_out = create_consensus_tasks(rawread_dir, "raw_reads", config, p_ids_merge_job_done) wf.addTasks( consensus_tasks ) r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") ) pread_fofn = makePypeLocalFile( os.path.join( pread_dir, "input_preads.fofn" ) ) @PypeTask( inputs = consensus_out, outputs = {"cns_done":r_cns_done, "pread_fofn": pread_fofn}, TaskType = PypeThreadTaskBase, URL = "task://localhost/cns_check" ) def check_r_cns_task(self): with open(fn(self.pread_fofn), "w") as f: fn_list = 
glob.glob("%s/preads/out*.fasta" % rawread_dir) fn_list.sort() for fa_fn in fn_list: print >>f, fa_fn system("touch %s" % fn(self.cns_done)) wf.addTask(check_r_cns_task) concurrent_jobs = config["cns_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf.refreshTargets(exitOnFailure=exitOnFailure) if config["target"] == "pre-assembly": sys.exit(0) # build pread database if config["input_type"] == "preads": pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn"]))) make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf}, outputs = {"o_fofn": pread_fofn}, parameters = {}, TaskType = PypeThreadTaskBase) fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") ) parameters = {"work_dir": pread_dir, "config": config} run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh')) preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db')) # Also .preads.*, of course. make_build_pdb_task = PypeTask(inputs = {"pread_fofn": pread_fofn }, outputs = {"pdb_build_done": pdb_build_done, "preads_db": preads_db, "run_jobs": run_jobs}, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/build_pdb") build_pdb_task = make_build_pdb_task(task_build_pdb) wf.addTasks([build_pdb_task]) wf.refreshTargets([pdb_build_done]) preads_nblock = support.get_nblock(fn(preads_db)) #### run daligner config["sge_option_da"] = config["sge_option_pda"] config["sge_option_la"] = config["sge_option_pla"] daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", pdb_build_done, config, pread_aln=True) wf.addTasks(daligner_tasks) p_da_done = makePypeLocalFile(os.path.join( pread_dir, "da_done")) parameters = { "nblock": preads_nblock, } make_daligner_gather = PypeTask( inputs = daligner_out, outputs = {"da_done":p_da_done}, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/pda_check" ) check_p_da_task = make_daligner_gather(task_daligner_gather) wf.addTask(check_p_da_task) wf.refreshTargets(exitOnFailure=exitOnFailure) merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config) wf.addTasks( merge_tasks ) p_merge_done = makePypeLocalFile(os.path.join( pread_dir, "p_merge_done")) @PypeTask( inputs = merge_out, outputs = {"p_merge_done": p_merge_done}, TaskType = PypeThreadTaskBase, URL = "task://localhost/pmerge_check" ) def check_p_merge_check_task(self): system("touch %s" % fn(self.p_merge_done)) wf.addTask(check_p_merge_check_task) concurrent_jobs = config["ovlp_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf.refreshTargets(exitOnFailure=exitOnFailure) db2falcon_done = makePypeLocalFile( os.path.join(pread_dir, "db2falcon_done")) make_run_db2falcon = PypeTask( inputs = {"p_merge_done": p_merge_done,}, outputs = {"db2falcon_done": db2falcon_done}, parameters = {"wd": pread_dir, "config": config, }, TaskType = PypeThreadTaskBase, URL = "task://localhost/db2falcon" ) wf.addTask(make_run_db2falcon(task_run_db2falcon)) falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") ) make_run_falcon_asm = PypeTask( inputs = {"db2falcon_done": db2falcon_done, "db_file": preads_db}, outputs = {"falcon_asm_done": falcon_asm_done}, parameters = {"wd": falcon_asm_dir, "config": config, "pread_dir": 
pread_dir}, TaskType = PypeThreadTaskBase, URL = "task://localhost/falcon" ) wf.addTask(make_run_falcon_asm(task_run_falcon_asm)) wf.refreshTargets()
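# A recurring convention in main1 above: every parallel stage touches a
# per-job sentinel file, and a small "gather" task runs once all of them
# exist. A minimal, dependency-free sketch of that convention (plain
# Python, not the pypeFLOW API; function names here are illustrative):
import os

def touch(path):
    # create or update a sentinel, like the `touch` calls in the tasks above
    with open(path, 'a'):
        os.utime(path, None)

def gather_done(done_fns, out_fn):
    # fire only when every upstream sentinel exists
    missing = [p for p in done_fns if not os.path.exists(p)]
    if missing:
        raise RuntimeError("not ready, missing: %r" % missing)
    touch(out_fn)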
def create_merge_tasks(wd, db_prefix, input_dep, config): merge_tasks = [] consensus_tasks = [] merge_out = {} consensus_out ={} mjob_data = {} with open(os.path.join(wd, "run_jobs.sh")) as f : for l in f: l = l.strip().split() if l[0] not in ( "LAsort", "LAmerge", "mv" ): continue if l[0] == "LAsort": p_id = int( l[2].split(".")[1] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) if l[0] == "LAmerge": l2 = l[2].split(".") if l2[1][0] == "L": p_id = int( l[2].split(".")[2] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) else: p_id = int( l[2].split(".")[1] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) if l[0] == "mv": l2 = l[1].split(".") if l2[1][0] == "L": p_id = int( l[1].split(".")[2] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) else: p_id = int( l[1].split(".")[1] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) for p_id in mjob_data: s_data = mjob_data[p_id] try: os.makedirs("%s/m_%05d" % (wd, p_id)) except OSError: pass try: os.makedirs("%s/preads" % (wd) ) except OSError: pass try: os.makedirs("%s/las_files" % (wd) ) except OSError: pass with open("%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id), "w") as merge_script: #print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix) for l in s_data: print >> merge_script, l print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id) print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id) merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) ) job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id) )) parameters = {"merge_script": merge_script_file, "cwd": os.path.join(wd, "m_%05d" % p_id), "job_id": p_id, "config": config} make_merge_task = PypeTask( inputs = {"input_dep": input_dep}, outputs = {"job_done": job_done}, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/m_%05d_%s" % (p_id, db_prefix) ) merge_task = make_merge_task ( run_merge_task ) merge_out["mjob_%d" % p_id] = job_done merge_tasks.append(merge_task) out_file = makePypeLocalFile(os.path.abspath( "%s/preads/out.%05d.fa" % (wd, p_id) )) out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id) )) parameters = {"cwd": os.path.join(wd, "preads" ), "job_id": p_id, "prefix": db_prefix, "config": config} make_c_task = PypeTask( inputs = {"job_done": job_done}, outputs = {"out_file": out_file, "out_done": out_done }, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/ct_%05d" % p_id ) c_task = make_c_task( run_consensus_task ) consensus_tasks.append(c_task) consensus_out["cjob_%d" % p_id] = out_done return merge_tasks, merge_out, consensus_tasks, consensus_out
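# The run_jobs.sh parse above buckets LAsort/LAmerge/mv lines by block id,
# which sits in a different dotted field when the .las name carries a level
# marker beginning with "L". The same bucketing as a pure, testable function
# (a sketch; field positions follow the code above):
def bucket_merge_jobs(run_jobs_lines):
    mjob_data = {}
    for l in run_jobs_lines:
        l = l.strip().split()
        if not l or l[0] not in ("LAsort", "LAmerge", "mv"):
            continue
        if l[0] == "LAsort":
            p_id = int(l[2].split(".")[1])
        else:
            name = l[2] if l[0] == "LAmerge" else l[1]
            parts = name.split(".")
            # field 2 when a level marker like "L2" is present, else field 1
            p_id = int(parts[2]) if parts[1][0] == "L" else int(parts[1])
        mjob_data.setdefault(p_id, []).append(" ".join(l))
    return mjob_data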
def unzip_all(config): unzip_concurrent_jobs = config["unzip_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(unzip_concurrent_jobs, unzip_concurrent_jobs) wf = PypeThreadWorkflow() ctg_list_file = makePypeLocalFile("./3-unzip/reads/ctg_list") falcon_asm_done = makePypeLocalFile("./2-asm-falcon/falcon_asm_done") parameters = {"wd": os.path.abspath("."), "config": config} job_done = makePypeLocalFile( os.path.join(parameters["wd"], "track_reads_done")) make_track_reads_task = PypeTask( inputs={"falcon_asm_done": falcon_asm_done}, outputs={ "job_done": job_done, "ctg_list_file": ctg_list_file }, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/track_reads") track_reads_task = make_track_reads_task(task_track_reads) wf.addTask(track_reads_task) wf.refreshTargets() #force refresh now, will put proper dependence later ctg_ids = [] with open("./3-unzip/reads/ctg_list") as f: for row in f: row = row.strip() ctg_ids.append(row) aln1_outs = {} all_ctg_out = {} for ctg_id in ctg_ids: # inputs ref_fasta = makePypeLocalFile( "./3-unzip/reads/{ctg_id}_ref.fa".format(ctg_id=ctg_id)) read_fasta = makePypeLocalFile( "./3-unzip/reads/{ctg_id}_reads.fa".format(ctg_id=ctg_id)) # outputs wd = os.path.join( os.getcwd(), "./3-unzip/0-phasing/{ctg_id}/".format(ctg_id=ctg_id)) mkdir(wd) ctg_aln_out = makePypeLocalFile( os.path.join(wd, "{ctg_id}_sorted.bam".format(ctg_id=ctg_id))) job_done = makePypeLocalFile( os.path.join(wd, "aln_{ctg_id}_done".format(ctg_id=ctg_id))) parameters = { "job_uid": "aln-" + ctg_id, "wd": wd, "config": config, "ctg_id": ctg_id } make_blasr_task = PypeTask( inputs={ "ref_fasta": ref_fasta, "read_fasta": read_fasta }, outputs={ "ctg_aln_out": ctg_aln_out, "job_done": job_done }, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/aln_{ctg_id}".format(ctg_id=ctg_id)) blasr_task = make_blasr_task(task_run_blasr) aln1_outs[ctg_id] = (ctg_aln_out, job_done) wf.addTask(blasr_task) job_done = makePypeLocalFile( os.path.join(wd, "p_{ctg_id}_done".format(ctg_id=ctg_id))) rid_to_phase_out = makePypeLocalFile( os.path.join(wd, "rid_to_phase.{ctg_id}".format(ctg_id=ctg_id))) all_ctg_out["r2p.{ctg_id}".format(ctg_id=ctg_id)] = rid_to_phase_out parameters = { "job_uid": "ha-" + ctg_id, "wd": wd, "config": config, "ctg_id": ctg_id } make_phasing_task = PypeTask( inputs={ "ref_fasta": ref_fasta, "aln_bam": ctg_aln_out }, outputs={"job_done": job_done}, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/p_{ctg_id}".format(ctg_id=ctg_id)) phasing_task = make_phasing_task(task_phasing) wf.addTask(phasing_task) wf.refreshTargets() hasm_wd = os.path.abspath("./3-unzip/1-hasm/") mkdir(hasm_wd) rid_to_phase_all = makePypeLocalFile( os.path.join(hasm_wd, "rid_to_phase.all")) @PypeTask(inputs=all_ctg_out, outputs={"rid_to_phase_all": rid_to_phase_all}, TaskType=PypeThreadTaskBase, URL="task://localhost/rid_to_phase_all") def get_rid_to_phase_all(self): rid_to_phase_all_fn = fn(self.rid_to_phase_all) inputs_fn = [fn(f) for f in self.inputs.values()] inputs_fn.sort() output = [] for fname in inputs_fn: output.extend(open(fname).read()) out = open(rid_to_phase_all_fn, "w") out.write("".join(output)) out.close() wf.addTask(get_rid_to_phase_all) parameters["wd"] = hasm_wd job_done = makePypeLocalFile(os.path.join(hasm_wd, "hasm_done")) make_hasm_task = PypeTask(inputs={"rid_to_phase_all": rid_to_phase_all}, outputs={"job_done": job_done}, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/hasm") hasm_task = 
make_hasm_task(task_hasm) wf.addTask(hasm_task) wf.refreshTargets()
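# get_rid_to_phase_all above concatenates the per-contig rid_to_phase files
# via list.extend(str), which works but appends one character at a time.
# An equivalent, clearer concatenation (same sorted order):
def concat_files(input_fns, out_fn):
    with open(out_fn, "w") as out:
        for fname in sorted(input_fns):
            with open(fname) as f:
                out.write(f.read())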
def main(*argv): setup_logger() if len(argv) < 2: print "you need to specify a configuration file" print "example: HGAP.py HGAP_run.cfg" sys.exit(1) rawread_dir = os.path.abspath("./0-rawreads") pread_dir = os.path.abspath("./1-preads_ovl") falcon_asm_dir = os.path.abspath("./2-asm-falcon") script_dir = os.path.abspath("./scripts") sge_log_dir = os.path.abspath("./sge_log") for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir): try: os.makedirs(d) except: pass config = get_config(argv[1]) concurrent_jobs = config["pa_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf = PypeThreadWorkflow() if config["input_type"] == "raw": #### import sequences into daligner DB input_h5_fofn = makePypeLocalFile( os.path.abspath( config["input_fofn_fn"] ) ) rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) parameters = {"work_dir": rawread_dir, "config": config} make_build_rdb_task = PypeTask(inputs = {"input_fofn": input_h5_fofn}, outputs = {"rdb_build_done": rdb_build_done}, parameters = parameters, TaskType = PypeThreadTaskBase) build_rdb_task = make_build_rdb_task(build_rdb) wf.addTasks([build_rdb_task]) wf.refreshTargets([rdb_build_done]) db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" )) #### run daligner daligner_tasks, daligner_out = create_daligner_tasks( rawread_dir, "raw_reads", db_file, rdb_build_done, config) wf.addTasks(daligner_tasks) #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") ) @PypeTask( inputs = daligner_out, outputs = {"da_done":r_da_done}, TaskType = PypeThreadTaskBase, URL = "task://localhost/rda_check" ) def check_r_da_task(self): os.system("touch %s" % fn(self.da_done)) wf.addTask(check_r_da_task) wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed concurrent_jobs = config["cns_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( rawread_dir, "raw_reads", r_da_done, config ) wf.addTasks( merge_tasks ) if config["target"] == "overlapping": wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed sys.exit(0) wf.addTasks( consensus_tasks ) r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") ) pread_fofn = makePypeLocalFile( os.path.join( pread_dir, "input_preads.fofn" ) ) @PypeTask( inputs = consensus_out, outputs = {"cns_done":r_cns_done, "pread_fofn": pread_fofn}, TaskType = PypeThreadTaskBase, URL = "task://localhost/cns_check" ) def check_r_cns_task(self): with open(fn(self.pread_fofn), "w") as f: fn_list = glob.glob("%s/preads/out*.fa" % rawread_dir) fn_list.sort() for fa_fn in fn_list: print >>f, fa_fn os.system("touch %s" % fn(self.cns_done)) wf.addTask(check_r_cns_task) wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs if config["target"] == "pre-assembly": sys.exit(0) if config["input_type"] == "preads": if not os.path.exists( "%s/input_preads.fofn" % pread_dir): os.system( "cp %s %s/input_preads.fofn" % (os.path.abspath( config["input_fofn_fn"] ), pread_dir) ) pread_fofn = makePypeLocalFile( os.path.join( pread_dir, "input_preads.fofn" ) ) rdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "rdb_build_done") ) @PypeTask(
inputs = { "pread_fofn": pread_fofn }, outputs = { "rdb_build_done": rdb_build_done }, parameters = {"config": config, "pread_dir": pread_dir}, TaskType = PypeThreadTaskBase, URL = "task://localhost/build_p_rdb") def build_p_rdb_task(self): config = self.parameters["config"] pread_dir = self.parameters["pread_dir"] fa_serial = 0 for fa_fn in open(fn(self.pread_fofn)).readlines(): fa_fn = fa_fn.strip() c = 0 fa_serial += 1 with open("%s/preads_norm_%05d.fasta" % (pread_dir, fa_serial), "w") as p_norm: f = FastaReader(fa_fn) for r in f: if len(r.sequence) < config["length_cutoff_pr"]: continue name = r.name name = name.replace("_","") ignore_read = False for cc in r.sequence: if cc not in ["A","C","G","T"]: ignore_read = True break if ignore_read: continue print >> p_norm, ">prolog_%05d/%d/%d_%d" % (fa_serial, c, 0, len(r.sequence) ) for i in range(0, len(r.sequence)/80): print >> p_norm, r.sequence[ i *80 : (i + 1) * 80] print >> p_norm, r.sequence[(i+1)*80:] c += 1 os.system("cd %s; fasta2DB preads preads_norm_%05d.fasta" % (pread_dir, fa_serial) ) os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"])) os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"])) os.system("cd %s; touch rdb_build_done" % pread_dir) wf.addTask(build_p_rdb_task) wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" )) #### run daligner concurrent_jobs = config["ovlp_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) config["sge_option_da"] = config["sge_option_pda"] config["sge_option_la"] = config["sge_option_pla"] daligner_tasks, daligner_out = create_daligner_tasks( pread_dir, "preads", db_file, rdb_build_done, config, pread_aln= True) wf.addTasks(daligner_tasks) #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") ) @PypeTask( inputs = daligner_out, outputs = {"da_done":p_da_done}, TaskType = PypeThreadTaskBase, URL = "task://localhost/pda_check" ) def check_p_da_task(self): os.system("touch %s" % fn(self.da_done)) wf.addTask(check_p_da_task) merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( pread_dir, "preads", p_da_done, config ) wf.addTasks( merge_tasks ) #wf.refreshTargets(updateFreq = 30) #all p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") ) @PypeTask( inputs = merge_out, outputs = {"p_merge_done":p_merge_done}, TaskType = PypeThreadTaskBase, URL = "task://localhost/pmerge_check" ) def check_p_merge_check_task(self): os.system("touch %s" % fn(self.p_merge_done)) wf.addTask(check_p_merge_check_task) wf.refreshTargets(updateFreq = wait_time) #all falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") ) @PypeTask( inputs = {"p_merge_done": p_merge_done}, outputs = {"falcon_asm_done":falcon_asm_done}, parameters = {"wd": falcon_asm_dir, "config": config, "pread_dir": pread_dir}, TaskType = PypeThreadTaskBase, URL = "task://localhost/falcon" ) def run_falcon_asm_task(self): wd = self.parameters["wd"] config = self.parameters["config"] install_prefix = config["install_prefix"] pread_dir = self.parameters["pread_dir"] script_dir = os.path.join( wd ) script_fn = os.path.join( script_dir ,"run_falcon_asm.sh" ) script = [] script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) ) script.append( "cd 
%s" % pread_dir ) script.append( "DB2Falcon preads") script.append( "cd %s" % wd ) script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir ) overlap_filtering_setting = config["overlap_filtering_setting"] length_cutoff_pr = config["length_cutoff_pr"] script.append( """fc_ovlp_filter.py --fofn las.fofn %s \ --n_core 24 --min_len %d > preads.ovl""" % (overlap_filtering_setting, length_cutoff_pr) ) script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir) script.append( """fc_ovlp_to_graph.py preads.ovl > fc.log""" ) script.append( """fc_graph_to_contig.py""" ) script.append( """touch %s\n""" % fn(self.falcon_asm_done)) with open(script_fn, "w") as script_file: script_file.write("\n".join(script)) job_name = self.URL.split("/")[-1] job_name += "-"+str(uuid.uuid1())[:8] job_data = {"job_name": job_name, "cwd": wd, "sge_option": config["sge_option_fc"], "script_fn": script_fn } run_script(job_data, job_type = "SGE") wait_for_file( fn(self.falcon_asm_done), task=self, job_name=job_name ) wf.addTask( run_falcon_asm_task ) wf.refreshTargets(updateFreq = wait_time) #all
def run(wf, config, input_fofn_plf, setNumThreadAllowed, ): """ Preconditions (for now): * fc_run_logger * run_support.logger """ rawread_dir = os.path.abspath("./0-rawreads") pread_dir = os.path.abspath("./1-preads_ovl") falcon_asm_dir = os.path.abspath("./2-asm-falcon") script_dir = os.path.abspath("./scripts") sge_log_dir = os.path.abspath("./sge_log") for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir): support.make_dirs(d) exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs concurrent_jobs = config["pa_concurrent_jobs"] setNumThreadAllowed(concurrent_jobs, concurrent_jobs) rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn"]))) make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf}, outputs = {"o_fofn": rawread_fofn_plf}, parameters = {}, TaskType = MyFakePypeThreadTaskBase) fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) if config["input_type"] == "raw": #### import sequences into daligner DB sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") ) rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) run_jobs = makePypeLocalFile( os.path.join( rawread_dir, "run_jobs.sh") ) parameters = {"work_dir": rawread_dir, "sge_option": config["sge_option_da"], "config": config} length_cutoff_plf = makePypeLocalFile(os.path.join(rawread_dir, "length_cutoff")) raw_reads_db_plf = makePypeLocalFile(os.path.join(rawread_dir, "%s.db" % "raw_reads")) make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf}, outputs = {"rdb_build_done": rdb_build_done, "raw_reads_db": raw_reads_db_plf, "length_cutoff": length_cutoff_plf, "run_jobs": run_jobs, }, parameters = parameters, TaskType = MyFakePypeThreadTaskBase) build_rdb_task = make_build_rdb_task(task_build_rdb) wf.addTasks([build_rdb_task]) wf.refreshTargets([rdb_build_done]) raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf)) #### run daligner daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config) wf.addTasks(daligner_tasks) r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") ) parameters = { "nblock": raw_reads_nblock, } make_daligner_gather = PypeTask( inputs = daligner_out, outputs = {"da_done":r_da_done}, parameters = parameters, TaskType = MyFakePypeThreadTaskBase, URL = "task://localhost/rda_check" ) check_r_da_task = make_daligner_gather(task_daligner_gather) wf.addTask(check_r_da_task) wf.refreshTargets(exitOnFailure=exitOnFailure) merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config) wf.addTasks( merge_tasks ) wf.refreshTargets(exitOnFailure=exitOnFailure) if config["target"] == "overlapping": sys.exit(0) consensus_tasks, consensus_out = create_consensus_tasks(rawread_dir, "raw_reads", config, p_ids_merge_job_done) wf.addTasks( consensus_tasks ) r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") ) pread_fofn = makePypeLocalFile( os.path.join( pread_dir, "input_preads.fofn" ) ) @PypeTask( inputs = consensus_out, outputs = {"cns_done":r_cns_done, "pread_fofn": pread_fofn}, TaskType = MyFakePypeThreadTaskBase, URL = "task://localhost/cns_check" ) def check_r_cns_task(self): with open(fn(self.pread_fofn), "w") as f: fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir) fn_list.sort() for fa_fn in fn_list: print >>f, fa_fn 
system("touch %s" % fn(self.cns_done)) wf.addTask(check_r_cns_task) pre_assembly_report_plf = makePypeLocalFile(os.path.join(rawread_dir, "pre_assembly_stats.json")) #tho technically it needs pread_fofn make_task = PypeTask( inputs = {"length_cutoff_fn": length_cutoff_plf, "raw_reads_db": raw_reads_db_plf, "preads_fofn": pread_fofn, }, outputs = {"pre_assembly_report": pre_assembly_report_plf, }, parameters = config, TaskType = MyFakePypeThreadTaskBase, URL = "task://localhost/report_pre_assembly") task = make_task(task_report_pre_assembly) wf.addTask(task) concurrent_jobs = config["cns_concurrent_jobs"] setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf.refreshTargets(exitOnFailure=exitOnFailure) if config["target"] == "pre-assembly": log.info("Quitting after stage-0 for 'pre-assembly' target.") sys.exit(0) # build pread database if config["input_type"] == "preads": pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn"]))) make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf}, outputs = {"o_fofn": pread_fofn}, parameters = {}, TaskType = MyFakePypeThreadTaskBase) fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") ) parameters = {"work_dir": pread_dir, "sge_option": config["sge_option_pda"], "config": config} run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh')) preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db')) # Also .preads.*, of course. make_build_pdb_task = PypeTask(inputs = {"pread_fofn": pread_fofn }, outputs = {"pdb_build_done": pdb_build_done, "preads_db": preads_db, "run_jobs": run_jobs, }, parameters = parameters, TaskType = MyFakePypeThreadTaskBase, URL = "task://localhost/build_pdb") build_pdb_task = make_build_pdb_task(task_build_pdb) wf.addTasks([build_pdb_task]) wf.refreshTargets([pdb_build_done]) preads_nblock = support.get_nblock(fn(preads_db)) #### run daligner config["sge_option_da"] = config["sge_option_pda"] daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", pdb_build_done, config, pread_aln=True) wf.addTasks(daligner_tasks) p_da_done = makePypeLocalFile(os.path.join( pread_dir, "da_done")) parameters = { "nblock": preads_nblock, } make_daligner_gather = PypeTask( inputs = daligner_out, outputs = {"da_done":p_da_done}, parameters = parameters, TaskType = MyFakePypeThreadTaskBase, URL = "task://localhost/pda_check" ) check_p_da_task = make_daligner_gather(task_daligner_gather) wf.addTask(check_p_da_task) wf.refreshTargets(exitOnFailure=exitOnFailure) config["sge_option_la"] = config["sge_option_pla"] merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config) wf.addTasks( merge_tasks ) p_merge_done = makePypeLocalFile(os.path.join( pread_dir, "p_merge_done")) @PypeTask( inputs = merge_out, outputs = {"p_merge_done": p_merge_done}, TaskType = MyFakePypeThreadTaskBase, URL = "task://localhost/pmerge_check" ) def check_p_merge_check_task(self): system("touch %s" % fn(self.p_merge_done)) wf.addTask(check_p_merge_check_task) concurrent_jobs = config["ovlp_concurrent_jobs"] setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf.refreshTargets(exitOnFailure=exitOnFailure) db2falcon_done = makePypeLocalFile( os.path.join(pread_dir, "db2falcon_done")) make_run_db2falcon = PypeTask( inputs = {"p_merge_done": p_merge_done,}, outputs = {"db2falcon_done": 
db2falcon_done}, parameters = {"wd": pread_dir, "config": config, "sge_option": config["sge_option_fc"], }, TaskType = MyFakePypeThreadTaskBase, URL = "task://localhost/db2falcon" ) wf.addTask(make_run_db2falcon(task_run_db2falcon)) falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") ) make_run_falcon_asm = PypeTask( inputs = {"db2falcon_done": db2falcon_done, "db_file": preads_db}, outputs = {"falcon_asm_done": falcon_asm_done}, parameters = {"wd": falcon_asm_dir, "config": config, "pread_dir": pread_dir, "sge_option": config["sge_option_fc"], }, TaskType = MyFakePypeThreadTaskBase, URL = "task://localhost/falcon_asm" ) wf.addTask(make_run_falcon_asm(task_run_falcon_asm)) wf.refreshTargets() return falcon_asm_done
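# Note the in-place overrides above (config["sge_option_da"] =
# config["sge_option_pda"]): every task shares the same dict, so the
# stage-0 settings are clobbered once the pread stage starts. If one were
# refactoring, a per-stage copy avoids that (a sketch; stage_config is a
# hypothetical helper, not part of this codebase):
def stage_config(config, **overrides):
    cfg = dict(config)  # shallow copy is enough for flat string settings
    cfg.update(overrides)
    return cfg

# e.g.: pread_cfg = stage_config(config, sge_option_da=config["sge_option_pda"])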
h_ctg_fa = FastaReader("./3-unzip/all_h_ctg.fa") for r in h_ctg_fa: rid = r.name.split()[0] ref_seq_data[rid] = r.sequence ctg_types[rid] = "h" ctg_ids = sorted(ref_seq_data.keys()) p_ctg_out = [] h_ctg_out = [] for ctg_id in ctg_ids: sequence = ref_seq_data[ctg_id] m_ctg_id = ctg_id.split("-")[0] wd = os.path.join(os.getcwd(), "./4-quiver/", m_ctg_id) mkdir(wd) ref_fasta = makePypeLocalFile( os.path.join(wd, "{ctg_id}_ref.fa".format(ctg_id=ctg_id))) read_sam = makePypeLocalFile( os.path.join( os.getcwd(), "./4-quiver/reads/" "{ctg_id}.sam".format(ctg_id=ctg_id))) cns_fasta = makePypeLocalFile( os.path.join(wd, "cns-{ctg_id}.fasta.gz".format(ctg_id=ctg_id))) cns_fastq = makePypeLocalFile( os.path.join(wd, "cns-{ctg_id}.fastq.gz".format(ctg_id=ctg_id))) job_done = makePypeLocalFile( os.path.join(wd, "{ctg_id}_quiver_done".format(ctg_id=ctg_id))) if os.path.exists(fn(read_sam)): if ctg_types[ctg_id] == "p": p_ctg_out.append((cns_fasta, cns_fastq)) if ctg_types[ctg_id] == "h":
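# The quiver fragment above derives each contig's working directory from
# m_ctg_id = ctg_id.split("-")[0], i.e. a haplotig id such as
# "000123F-010-01" is filed under its primary contig "000123F" (the id
# format is inferred from that split). The grouping, standalone:
def group_by_primary(ctg_ids):
    groups = {}
    for ctg_id in ctg_ids:
        groups.setdefault(ctg_id.split("-")[0], []).append(ctg_id)
    return groups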
def main1(prog_name, input_config_fn, logger_config_fn=None): global fc_run_logger fc_run_logger = support.setup_logger(logger_config_fn) fc_run_logger.info("fc_run started with configuration %s", input_config_fn) config = support.get_dict_from_old_falcon_cfg( support.parse_config(input_config_fn)) rawread_dir = os.path.abspath("./0-rawreads") pread_dir = os.path.abspath("./1-preads_ovl") falcon_asm_dir = os.path.abspath("./2-asm-falcon") script_dir = os.path.abspath("./scripts") sge_log_dir = os.path.abspath("./sge_log") for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir): support.make_dirs(d) concurrent_jobs = config["pa_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf = PypeThreadWorkflow() input_fofn_plf = makePypeLocalFile( os.path.basename(config["input_fofn_fn"])) rawread_fofn_plf = makePypeLocalFile( os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"]))) make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf}, outputs={"o_fofn": rawread_fofn_plf}, parameters={}, TaskType=PypeThreadTaskBase) fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) if config["input_type"] == "raw": #### import sequences into daligner DB sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done")) rdb_build_done = makePypeLocalFile( os.path.join(rawread_dir, "rdb_build_done")) run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh")) parameters = {"work_dir": rawread_dir, "config": config} make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf}, outputs={ "rdb_build_done": rdb_build_done, "run_jobs": run_jobs }, parameters=parameters, TaskType=PypeThreadTaskBase) build_rdb_task = make_build_rdb_task(task_build_rdb) wf.addTasks([build_rdb_task]) wf.refreshTargets([rdb_build_done]) db_file = makePypeLocalFile( os.path.join(rawread_dir, "%s.db" % "raw_reads")) #### run daligner daligner_tasks, daligner_out = create_daligner_tasks( fn(run_jobs), rawread_dir, "raw_reads", db_file, rdb_build_done, config) wf.addTasks(daligner_tasks) #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done")) @PypeTask(inputs=daligner_out, outputs={"da_done": r_da_done}, TaskType=PypeThreadTaskBase, URL="task://localhost/rda_check") def check_r_da_task(self): system("touch %s" % fn(self.da_done)) wf.addTask(check_r_da_task) wf.refreshTargets( updateFreq=wait_time ) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed concurrent_jobs = config["cns_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config) wf.addTasks(merge_tasks) if config["target"] == "overlapping": wf.refreshTargets( updateFreq=wait_time ) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed sys.exit(0) wf.addTasks(consensus_tasks) r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done")) pread_fofn = makePypeLocalFile( os.path.join(pread_dir, "input_preads.fofn")) @PypeTask(inputs=consensus_out, outputs={ "cns_done": r_cns_done, "pread_fofn": pread_fofn }, TaskType=PypeThreadTaskBase, URL="task://localhost/cns_check") def check_r_cns_task(self): with open(fn(self.pread_fofn), "w") as f: fn_list = 
glob.glob("%s/preads/out*.fasta" % rawread_dir) fn_list.sort() for fa_fn in fn_list: print >> f, fa_fn system("touch %s" % fn(self.cns_done)) wf.addTask(check_r_cns_task) wf.refreshTargets( updateFreq=wait_time) # larger number better for more jobs if config["target"] == "pre-assembly": sys.exit(0) # build pread database if config["input_type"] == "preads": pread_fofn = makePypeLocalFile( os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"]))) make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf}, outputs={"o_fofn": pread_fofn}, parameters={}, TaskType=PypeThreadTaskBase) fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) pdb_build_done = makePypeLocalFile( os.path.join(pread_dir, "pdb_build_done")) parameters = {"work_dir": pread_dir, "config": config} run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh')) make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn}, outputs={ "pdb_build_done": pdb_build_done, "run_jobs": run_jobs }, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/build_pdb") build_pdb_task = make_build_pdb_task(task_build_pdb) wf.addTasks([build_pdb_task]) wf.refreshTargets([pdb_build_done]) db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads")) #### run daligner concurrent_jobs = config["ovlp_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) config["sge_option_da"] = config["sge_option_pda"] config["sge_option_la"] = config["sge_option_pla"] daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", db_file, pdb_build_done, config, pread_aln=True) wf.addTasks(daligner_tasks) #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done")) @PypeTask(inputs=daligner_out, outputs={"da_done": p_da_done}, TaskType=PypeThreadTaskBase, URL="task://localhost/pda_check") def check_p_da_task(self): system("touch %s" % fn(self.da_done)) wf.addTask(check_p_da_task) merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( fn(run_jobs), pread_dir, "preads", p_da_done, config) wf.addTasks(merge_tasks) #wf.refreshTargets(updateFreq = 30) #all p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done")) @PypeTask(inputs=merge_out, outputs={"p_merge_done": p_merge_done}, TaskType=PypeThreadTaskBase, URL="task://localhost/pmerge_check") def check_p_merge_check_task(self): system("touch %s" % fn(self.p_merge_done)) wf.addTask(check_p_merge_check_task) wf.refreshTargets(updateFreq=wait_time) #all falcon_asm_done = makePypeLocalFile( os.path.join(falcon_asm_dir, "falcon_asm_done")) make_run_falcon_asm = PypeTask( inputs={ "p_merge_done": p_merge_done, "db_file": db_file }, outputs={"falcon_asm_done": falcon_asm_done}, parameters={ "wd": falcon_asm_dir, "config": config, "pread_dir": pread_dir }, TaskType=PypeThreadTaskBase, URL="task://localhost/falcon") wf.addTask(make_run_falcon_asm(task_run_falcon_asm)) wf.refreshTargets(updateFreq=wait_time) #all
job_name += "-"+str(uuid.uuid1())[:8] job_data = {"job_name": job_name, "cwd": cwd, "sge_option": " -pe smp 6 -q huasm ", "script_fn": script_fn } run_script(job_data, job_type = "SGE") wait_for_file( fn( self.job_done ), task=self, job_name=job_name ) if __name__ == "__main__": prefix = sys.argv[1] concurrent_jobs = 64 PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf = PypeThreadWorkflow() job_id = 0 db_file = makePypeLocalFile(os.path.abspath( "./%s.db" % prefix )) with open("run_jobs.sh") as f : for l in f : l = l.strip().split() if l[0] == "daligner": try: os.makedirs("./job_%05d" % job_id) except OSError: pass os.system("cd ./job_%05d;ln -s ../.%s.bps .; ln -s ../.%s.idx .; ln -s ../%s.db ." % (job_id, prefix, prefix, prefix) ) job_done = makePypeLocalFile(os.path.abspath( "./job_%05d/job_%05d_done" % (job_id,job_id) )) parameters = {"daligner_cmd": " ".join(l), "cwd": os.path.join(os.getcwd(), "job_%05d" % job_id), "job_id": job_id} make_daligner_task = PypeTask( inputs = {"db_file": db_file}, outputs = {"job_done": job_done},
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config): merge_tasks = [] consensus_tasks = [] merge_out = {} consensus_out = {} mjob_data = {} with open(run_jobs_fn) as f: for l in f: l = l.strip().split() if l[0] not in ("LAsort", "LAmerge", "mv"): continue if l[0] == "LAsort": # We now run this part w/ daligner, but we still need # a small script for some book-keeping. p_id = int(l[2].split(".")[1]) mjob_data.setdefault(p_id, []) # mjob_data[p_id].append( " ".join(l) ) # Already done w/ daligner! if l[0] == "LAmerge": l2 = l[2].split(".") if l2[1][0] == "L": p_id = int(l[2].split(".")[2]) mjob_data.setdefault(p_id, []) mjob_data[p_id].append(" ".join(l)) else: p_id = int(l[2].split(".")[1]) mjob_data.setdefault(p_id, []) mjob_data[p_id].append(" ".join(l)) if l[0] == "mv": l2 = l[1].split(".") if l2[1][0] == "L": p_id = int(l[1].split(".")[2]) mjob_data.setdefault(p_id, []) mjob_data[p_id].append(" ".join(l)) else: p_id = int(l[1].split(".")[1]) mjob_data.setdefault(p_id, []) mjob_data[p_id].append(" ".join(l)) for p_id in mjob_data: s_data = mjob_data[p_id] support.make_dirs("%s/m_%05d" % (wd, p_id)) support.make_dirs("%s/preads" % (wd)) support.make_dirs("%s/las_files" % (wd)) merge_script_file = os.path.abspath("%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id)) with open(merge_script_file, "w") as merge_script: # print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix) for l in s_data: print >> merge_script, l print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id) print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id) job_done = makePypeLocalFile(os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id))) parameters = { "merge_script": merge_script_file, "cwd": os.path.join(wd, "m_%05d" % p_id), "job_id": p_id, "config": config, } make_merge_task = PypeTask( inputs={"input_dep": input_dep}, outputs={"job_done": job_done}, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/m_%05d_%s" % (p_id, db_prefix), ) merge_task = make_merge_task(task_run_las_merge) merge_out["mjob_%d" % p_id] = job_done merge_tasks.append(merge_task) out_file = makePypeLocalFile(os.path.abspath("%s/preads/out.%05d.fasta" % (wd, p_id))) out_done = makePypeLocalFile(os.path.abspath("%s/preads/c_%05d_done" % (wd, p_id))) parameters = {"cwd": os.path.join(wd, "preads"), "job_id": p_id, "prefix": db_prefix, "config": config} make_c_task = PypeTask( inputs={"job_done": job_done}, outputs={"out_file": out_file, "out_done": out_done}, parameters=parameters, TaskType=PypeThreadTaskBase, URL="task://localhost/ct_%05d" % p_id, ) c_task = make_c_task(task_run_consensus) consensus_tasks.append(c_task) consensus_out["cjob_%d" % p_id] = out_done return merge_tasks, merge_out, consensus_tasks, consensus_out
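# Each merge block above becomes a tiny shell script: the bucketed
# LAmerge/mv lines plus two symlink lines that publish the merged .las.
# The rendering step, isolated (a sketch mirroring the
# print >> merge_script calls above):
def render_merge_script(s_data, db_prefix, p_id):
    lines = list(s_data)
    lines.append("ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id))
    lines.append("ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id))
    return "\n".join(lines) + "\n"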
def run_HGAP(config): global prepare_data global prepare_seed_reads global dist_map global generate_preassemble_reads global run_CA global quiver_reseq directory_for_dist_map = "dist_map" config["install_prefix"] = sys.prefix config["directory_for_dist_map"] = directory_for_dist_map input_fofn_fn = config["input_fofn_fn"] #tmpdir = config["tmpdir"] #prepare the distributed mapping directory #try: #os.makedirs("%s/ec_data" % directory_for_dist_map) #os.makedirs("/%s/ec_data" % tmpdir) #except: #pass try: os.makedirs("%s" % directory_for_dist_map) except: pass try: os.makedirs("scripts") except: pass try: os.makedirs("CA") except: pass try: os.makedirs("sge_log") except: pass input_fofn = makePypeLocalFile(input_fofn_fn) normalized_fasta = makePypeLocalFile("all_norm.fa") seed_fasta = makePypeLocalFile("seeds.fa") wf = PypeWorkflow() prepare_data_task = PypeTask(inputDataObjs={"input_fofn":input_fofn}, outputDataObjs={"normalized_fasta":normalized_fasta}, config = config ) (prepare_data) prepare_seed_reads_task = PypeTask(inputDataObjs = {"normalized_fasta":normalized_fasta}, outputDataObjs = {"seed_fasta":seed_fasta}, config = config)(prepare_seed_reads) wf.addTasks([prepare_data_task, prepare_seed_reads_task]) m4_data_done = makePypeLocalFile("%s/m4_data_done" % directory_for_dist_map) dist_map_task = PypeTask(inputDataObjs = {"normalized_fasta":normalized_fasta, "seed_fasta":seed_fasta}, outputDataObjs = {"m4_data_done":m4_data_done}, config = config) (dist_map) m4filtering_done = makePypeLocalFile("%s/m4filtering_done" % directory_for_dist_map) m4filtering_task = PypeTask(inputDataObjs = {"m4_data_done":m4_data_done}, outputDataObjs = {"m4filtering_done":m4filtering_done}, config = config) (m4_filtering) preassembly_done = makePypeLocalFile("%s/preassembly_done" % directory_for_dist_map) get_preassembled_reads_task = PypeTask( inputDataObjs = {"normalized_fasta" : normalized_fasta, "seed_fasta" : seed_fasta, "m4filtering_done" : m4filtering_done}, outputDataObjs = {"preassembly_done" : preassembly_done}, config = config ) (get_preassembled_reads) wf.addTasks([dist_map_task, m4filtering_task, get_preassembled_reads_task]) CA_done = makePypeLocalFile("CA_done") run_CA_task = PypeTask( inputDataObjs = {"preassembly_done" : preassembly_done}, outputDataObjs = {"CA_done": CA_done}, config = config )(run_CA) wf.addTasks([run_CA_task]) Quiver_done = makePypeLocalFile("Quiver_done") quiver_reseq_task = PypeTask( inputDataObjs = {"CA_done": CA_done, "input_fofn":input_fofn}, outputDataObjs = {"Quiver_done": Quiver_done}, config = config) ( quiver_reseq ) wf.addTasks([quiver_reseq_task]) if config["target"] == "all": wf.refreshTargets([Quiver_done]) elif config["target"] == "draft_assembly": wf.refreshTargets([CA_done]) elif config["target"] == "pre_assembly": wf.refreshTargets([preassembly_done])
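# The repeated try/os.makedirs/except blocks above are the Python 2 idiom
# for "mkdir -p", but a bare except also swallows unrelated errors. A safer
# equivalent (a variant of the make_dirs helpers that appear elsewhere in
# this document):
import errno, os

def make_dirs(d):
    try:
        os.makedirs(d)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise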
def testDistributed(runmode, cleanup): logger.info("test start") baseDir = "." import random random.seed(1984) #PypeThreadWorkflow.setNumThreadAllowed(20,20) #wf = PypeThreadWorkflow() PypeMPWorkflow.setNumThreadAllowed(20,20) wf = PypeMPWorkflow() allTasks = [] for layer in range(5): fN = random.randint(3,7) fin = [None] * fN fout = [None] * fN fmut = [None] * fN for w in range(fN): fin[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer, w) ) fout[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer+1, w) ) fmut[w] = makePypeLocalFile(baseDir + "/testdata/m_testfile_l%d_w%d.dat" % (layer+1, w) ) #wf.addObjects([fin[w], fout[w], fmut[w]]) for w in range(fN): inputDataObjs = {} outputDataObjs = {} mutableDataObjs = {} for i in range(5): inputDataObjs["infile%d" % i] = random.choice(fin) i = 0 for obj in random.sample(fmut,2): #mutableDataObjs["outfile%d" % i] = obj i += 1 outputDataObjs["outfile%d" % i] = fout[w] shellCmd = "sleep 1\n" + "\n".join([ "echo %d %d ... >> %s" % (layer, w, of.localFileName) for of in outputDataObjs.values() ]) + "\nsleep 10" shellCmd += "sleep 1\n" + "\n".join([ "echo %d %d ... >> %s" % (layer, w, of.localFileName) for of in mutableDataObjs.values() ]) + "\nsleep 10" shellFileName = baseDir + "/testdata/task_l%d_w%d.sh" % (layer, w) shfile = open(shellFileName, 'w') print(shellCmd, file=shfile) shfile.close() if runmode == "internal": def t1(self): runShellCmd(["sleep", "%d" % random.randint(0,20) ]) for of in self.outputDataObjs.values(): runShellCmd(["touch", of.localFileName]) task = PypeTask(inputDataObjs = inputDataObjs, outputDataObjs = outputDataObjs, mutableDataObjs = mutableDataObjs, URL="task://internal/task_l%d_w%d" % (layer, w), TaskType=PypeThreadTaskBase) ( t1 ) elif runmode == "localshell": task = PypeShellTask(inputDataObjs = inputDataObjs, outputDataObjs = outputDataObjs, mutableDataObjs = mutableDataObjs, URL="task://localshell/task_l%d_w%d" % (layer, w), TaskType=PypeThreadTaskBase) ( "%s" % shellFileName ) elif runmode == "sge": task = PypeSGETask(inputDataObjs = inputDataObjs, outputDataObjs = outputDataObjs, mutableDataObjs = mutableDataObjs, URL="task://sge/task_l%d_w%d" % (layer, w), TaskType=PypeThreadTaskBase) ( "%s" % shellFileName ) elif runmode == "mixed": #distributed = random.choice( (False, True) ) distributed = True if w % 3 == 0 else False task = PypeDistributibleTask(inputDataObjs = inputDataObjs, outputDataObjs = outputDataObjs, mutableDataObjs = mutableDataObjs, URL="task://sge/task_l%d_w%d" % (layer, w), distributed=distributed, TaskType=PypeThreadTaskBase) ( "%s" % shellFileName ) wf.addTasks([task]) allTasks.append(task) for URL in wf._pypeObjects: prereqJobURLs = [str(u) for u in wf._RDFGraph.transitive_objects(URIRef(URL), pypeNS["prereq"]) if isinstance(wf._pypeObjects[str(u)], PypeLocalFile) and str(u) != URL ] if len(prereqJobURLs) == 0: if cleanup == "1": os.system("echo start > %s" % wf._pypeObjects[URL].localFileName) pass wf.refreshTargets(allTasks) dotFile = open("test.dot","w") #print >>dotFile, wf.graphvizShortNameDot print(wf.graphvizDot, file=dotFile) dotFile.close() dotFile = open("test_short_name.dot","w") print(wf.graphvizShortNameDot, file=dotFile) dotFile.close() rdfFile = open("test.rdf","w") print(wf.RDFXML, file=rdfFile) rdfFile.close() if runmode != "internal": mkFile = open("test.mk","w") print(wf.makeFileStr, file=mkFile) mkFile.close()
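# testDistributed builds a random layered dependency graph: several layers
# of 3-7 files each, where every file in layer l+1 is produced by a task
# reading randomly chosen files from layer l. The graph shape alone,
# generated standalone (illustrative only; the real test also writes a
# shell script per task and picks a task type by runmode):
import random

def layered_dag(n_layers=5, seed=1984):
    random.seed(seed)
    prev, edges = None, []
    for layer in range(n_layers + 1):
        width = random.randint(3, 7)
        nodes = ["testfile_l%d_w%d" % (layer, w) for w in range(width)]
        if prev is not None:
            for node in nodes:
                edges.extend((random.choice(prev), node) for _ in range(5))
        prev = nodes
    return edges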
def main1(prog_name, input_config_fn, logger_config_fn=None): setup_logger(logger_config_fn) fc_run_logger.info( "fc_run started with configuration %s", input_config_fn ) config = get_config(parse_config(input_config_fn)) rawread_dir = os.path.abspath("./0-rawreads") pread_dir = os.path.abspath("./1-preads_ovl") falcon_asm_dir = os.path.abspath("./2-asm-falcon") script_dir = os.path.abspath("./scripts") sge_log_dir = os.path.abspath("./sge_log") for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir): make_dirs(d) concurrent_jobs = config["pa_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf = PypeThreadWorkflow() input_fofn_plf = makePypeLocalFile(os.path.basename(config["input_fofn_fn"])) rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"]))) make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf}, outputs = {"o_fofn": rawread_fofn_plf}, parameters = {}, TaskType = PypeThreadTaskBase) fofn_abs_task = make_fofn_abs_task(make_fofn_abs_raw) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) if config["input_type"] == "raw": #### import sequences into daligner DB sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") ) rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) parameters = {"work_dir": rawread_dir, "config": config} make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf}, outputs = {"rdb_build_done": rdb_build_done}, parameters = parameters, TaskType = PypeThreadTaskBase) build_rdb_task = make_build_rdb_task(build_rdb) wf.addTasks([build_rdb_task]) wf.refreshTargets([rdb_build_done]) db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" )) #### run daligner daligner_tasks, daligner_out = create_daligner_tasks( rawread_dir, "raw_reads", db_file, rdb_build_done, config) wf.addTasks(daligner_tasks) #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") ) @PypeTask( inputs = daligner_out, outputs = {"da_done":r_da_done}, TaskType = PypeThreadTaskBase, URL = "task://localhost/rda_check" ) def check_r_da_task(self): os.system("touch %s" % fn(self.da_done)) wf.addTask(check_r_da_task) wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed concurrent_jobs = config["cns_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( rawread_dir, "raw_reads", r_da_done, config ) wf.addTasks( merge_tasks ) if config["target"] == "overlapping": wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed sys.exit(0) wf.addTasks( consensus_tasks ) r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") ) pread_fofn = makePypeLocalFile( os.path.join( pread_dir, "input_preads.fofn" ) ) @PypeTask( inputs = consensus_out, outputs = {"cns_done":r_cns_done, "pread_fofn": pread_fofn}, TaskType = PypeThreadTaskBase, URL = "task://localhost/cns_check" ) def check_r_cns_task(self): with open(fn(self.pread_fofn), "w") as f: fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir) fn_list.sort() for fa_fn in fn_list: print >>f, fa_fn os.system("touch %s" % fn(self.cns_done)) wf.addTask(check_r_cns_task) 
wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs if config["target"] == "pre-assembly": sys.exit(0) # build pread database if config["input_type"] == "preads": pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"]))) make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf}, outputs = {"o_fofn": pread_fofn}, parameters = {}, TaskType = PypeThreadTaskBase) fofn_abs_task = make_fofn_abs_task(make_fofn_abs_preads) wf.addTasks([fofn_abs_task]) wf.refreshTargets([fofn_abs_task]) pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") ) parameters = {"work_dir": pread_dir, "config": config} make_build_pdb_task = PypeTask( inputs = { "pread_fofn": pread_fofn }, outputs = { "pdb_build_done": pdb_build_done }, parameters = parameters, TaskType = PypeThreadTaskBase, URL = "task://localhost/build_pdb") build_pdb_task = make_build_pdb_task(build_pdb) wf.addTasks([build_pdb_task]) wf.refreshTargets([pdb_build_done]) db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" )) #### run daligner concurrent_jobs = config["ovlp_concurrent_jobs"] PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) config["sge_option_da"] = config["sge_option_pda"] config["sge_option_la"] = config["sge_option_pla"] daligner_tasks, daligner_out = create_daligner_tasks( pread_dir, "preads", db_file, pdb_build_done, config, pread_aln= True) wf.addTasks(daligner_tasks) #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") ) @PypeTask( inputs = daligner_out, outputs = {"da_done":p_da_done}, TaskType = PypeThreadTaskBase, URL = "task://localhost/pda_check" ) def check_p_da_task(self): os.system("touch %s" % fn(self.da_done)) wf.addTask(check_p_da_task) merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( pread_dir, "preads", p_da_done, config ) wf.addTasks( merge_tasks ) #wf.refreshTargets(updateFreq = 30) #all p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") ) @PypeTask( inputs = merge_out, outputs = {"p_merge_done":p_merge_done}, TaskType = PypeThreadTaskBase, URL = "task://localhost/pmerge_check" ) def check_p_merge_check_task(self): os.system("touch %s" % fn(self.p_merge_done)) wf.addTask(check_p_merge_check_task) wf.refreshTargets(updateFreq = wait_time) #all falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") ) @PypeTask( inputs = {"p_merge_done": p_merge_done, "db_file":db_file}, outputs = {"falcon_asm_done":falcon_asm_done}, parameters = {"wd": falcon_asm_dir, "config": config, "pread_dir": pread_dir}, TaskType = PypeThreadTaskBase, URL = "task://localhost/falcon" ) def run_falcon_asm_task(self): wd = self.parameters["wd"] config = self.parameters["config"] install_prefix = config["install_prefix"] pread_dir = self.parameters["pread_dir"] script_dir = os.path.join( wd ) script_fn = os.path.join( script_dir ,"run_falcon_asm.sh" ) script = [] script.append( "set -vex" ) script.append( "trap 'touch %s.exit' EXIT" % fn(self.falcon_asm_done) ) script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) ) script.append( "cd %s" % pread_dir ) # Write preads4falcon.fasta, in 1-preads_ovl: script.append( "DB2Falcon -U preads") script.append( "cd %s" % wd ) script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir ) overlap_filtering_setting = 
config["overlap_filtering_setting"] length_cutoff_pr = config["length_cutoff_pr"] script.append( """fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\ (fn(db_file), overlap_filtering_setting, length_cutoff_pr) ) script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir) script.append( """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr) # TODO: drop this logfile # Write 'p_ctg.fa' and 'a_ctg.fa': script.append( """fc_graph_to_contig.py""" ) script.append( """touch %s""" % fn(self.falcon_asm_done)) with open(script_fn, "w") as script_file: script_file.write("\n".join(script)) job_data = make_job_data(self.URL, script_fn) job_data["sge_option"] = config["sge_option_fc"] run_script(job_data, job_type = config["job_type"]) wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_data['job_name']) wf.addTask( run_falcon_asm_task ) wf.refreshTargets(updateFreq = wait_time) #all
def make_dirs(d): if not os.path.isdir(d): os.makedirs(d) rawread_dir = os.path.abspath("./0-rawreads") pread_dir = os.path.abspath("./1-preads_ovl") asm_dir = os.path.abspath(os.path.join("./3-unzip/")) read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps")) make_dirs(read_map_dir) PypeMPWorkflow.setNumThreadAllowed(12, 12) wf = PypeMPWorkflow() rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db")) rawread_id_file = makePypeLocalFile(os.path.join(rawread_dir, "raw_read_ids")) @PypeTask( inputs={"rawread_db": rawread_db}, outputs={"rawread_id_file": rawread_id_file}, TaskType=PypeThreadTaskBase, URL="task://localhost/dump_rawread_ids", ) def dump_rawread_ids(self): rawread_db = fn(self.rawread_db) rawread_id_file = fn(self.rawread_id_file) os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file))
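# dump_rawread_ids above shells out to `DBshow -n | tr -d '>' | awk
# '{print $1}'`. The same header-to-id extraction in pure Python, given
# DBshow's ">name ..." header lines (a sketch; DBshow itself is a DAZZ_DB
# binary, so only the text munging is reproduced here):
def extract_ids(lines):
    ids = []
    for line in lines:
        line = line.strip()
        if line.startswith(">"):
            ids.append(line[1:].split()[0])
    return ids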
def generate_read_to_contig_map(rawread_dir=rawread_dir, pread_dir=pread_dir, asm_dir=asm_dir):
    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)

    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
    rawread_id_file = makePypeLocalFile(os.path.join(rawread_dir, "raw_read_ids"))

    @PypeTask(inputs={"rawread_db": rawread_db},
              outputs={"rawread_id_file": rawread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_rawread_ids")
    def dump_rawread_ids(self):
        # Dump the original read names, one per line, indexed by DB id.
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file))

    wf.addTask(dump_rawread_ids)

    pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
    pread_id_file = makePypeLocalFile(os.path.join(pread_dir, "pread_ids"))

    @PypeTask(inputs={"pread_db": pread_db},
              outputs={"pread_id_file": pread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_pread_ids")
    def dump_pread_ids(self):
        pread_db = fn(self.pread_db)
        pread_id_file = fn(self.pread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file))

    wf.addTask(dump_pread_ids)

    all_raw_las_files = {}
    for las_fn in glob.glob(os.path.join(rawread_dir, "raw_reads.*.las")):
        idx = las_fn.split("/")[-1]  # well, we will use a regex someday to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_raw_las_files["r_las_%s" % idx] = las_file

    all_pread_las_files = {}
    for las_fn in glob.glob(os.path.join(pread_dir, "preads.*.las")):
        idx = las_fn.split("/")[-1]  # well, we will use a regex someday to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_pread_las_files["p_las_%s" % idx] = las_file

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, "sg_edges_list"))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, "utg_data"))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, "ctg_paths"))

    inputs = {
        "rawread_id_file": rawread_id_file,
        "pread_id_file": pread_id_file,
        "sg_edges_list": sg_edges_list,
        "utg_data": utg_data,
        "ctg_paths": ctg_paths,
    }

    read_to_contig_map = makePypeLocalFile(os.path.join(read_map_dir, "read_to_contig_map"))

    @PypeTask(inputs=inputs,
              outputs={"read_to_contig_map": read_to_contig_map},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/get_ctg_read_map")
    def generate_read_to_ctg_map(self):
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        pread_did_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data), fn(self.ctg_paths))

        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    pid = int(n.split(":")[0])
                    rid = pread_did_to_rid[pid].split("/")[1]
                    rid = int(int(rid) / 10)
                    oid = rid_to_oid[rid]
                    k = (pid, rid, oid)
                    pread_to_contigs.setdefault(k, set())
                    pread_to_contigs[k].add(ctg)

            for k in pread_to_contigs:
                pid, rid, oid = k
                for ctg in list(pread_to_contigs[k]):
                    print >> f, "%09d %09d %s %s" % (pid, rid, oid, ctg)

    wf.addTask(generate_read_to_ctg_map)

    def dump_rawread_to_ctg(self):
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        # pread_id_file = fn(self.pread_id_file)
        las_file = fn(self.las_file)
        rawread_to_contig_file = fn(self.rawread_to_contig_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        rid_to_oid = open(rawread_id_file).read().split("\n")
        # pread_did_to_rid = open(pread_id_file).read().split("\n")

        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        rid_to_contigs = {}

        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                rid = int(rid)
                rid_to_contigs.setdefault(rid, (oid, set()))
                rid_to_contigs[rid][1].add(ctg)

        with open(rawread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            for row in sp.check_output(shlex.split("LA4Falcon -m %s %s " % (rawread_db, las_file))).splitlines():
                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id is None:
                        cur_read_id = q_id
                    else:
                        # flush the overlap data collected for the previous query read
                        if len(ovlp_data) == 0:
                            o_id = rid_to_oid[cur_read_id]
                            print >> f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id
                if q_id in rid_to_contigs and len(ovlp_data) == 0:
                    # if the query is in some contig....
                    t_o_id, ctgs = rid_to_contigs[q_id]
                    o_id = rid_to_oid[q_id]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1
                if t_id not in rid_to_contigs:
                    continue
                t_o_id, ctgs = rid_to_contigs[t_id]
                o_id = rid_to_oid[q_id]
                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1

            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    def dump_pread_to_ctg(self):
        pread_db = fn(self.pread_db)
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        las_file = fn(self.las_file)
        pread_to_contig_file = fn(self.pread_to_contig_file)

        pid_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        pid_to_contigs = {}

        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                pid = int(pid)
                pid_to_contigs.setdefault(pid, (oid, set()))
                pid_to_contigs[pid][1].add(ctg)

        with open(pread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            skip_rest = 0
            for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file))).splitlines():
                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id is None:
                        cur_read_id = q_id
                    else:
                        # flush the overlap data collected for the previous query read
                        if len(ovlp_data) == 0:
                            rid = pid_to_rid[cur_read_id].split("/")[1]
                            rid = int(int(rid) / 10)
                            o_id = rid_to_oid[rid]
                            print >> f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id
                        skip_rest = 0
                if q_id in pid_to_contigs and len(ovlp_data) == 0:
                    # if the query is in some contig....
                    t_o_id, ctgs = pid_to_contigs[q_id]
                    rid = pid_to_rid[q_id].split("/")[1]
                    rid = int(int(rid) / 10)
                    o_id = rid_to_oid[rid]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1
                    skip_rest = 1
                if skip_rest == 1:
                    continue
                if t_id not in pid_to_contigs:
                    continue
                t_o_id, ctgs = pid_to_contigs[t_id]
                rid = pid_to_rid[q_id].split("/")[1]
                rid = int(int(rid) / 10)
                o_id = rid_to_oid[rid]
                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1

            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    for las_key, las_file in all_raw_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[-1]  # well, we will use a regex someday to get the number
        idx = int(idx.split(".")[1])
        rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
        make_dump_rawread_to_ctg = PypeTask(inputs={
                                                "las_file": las_file,
                                                "rawread_db": rawread_db,
                                                "read_to_contig_map": read_to_contig_map,
                                                "rawread_id_file": rawread_id_file,
                                                "pread_id_file": pread_id_file,
                                            },
                                            outputs={"rawread_to_contig_file": rawread_to_contig_file},
                                            TaskType=PypeThreadTaskBase,
                                            URL="task://localhost/r_read_to_contigs.%s" % idx)
        dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg)
        wf.addTask(dump_rawread_to_ctg_task)

    for las_key, las_file in all_pread_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[-1]  # well, we will use a regex someday to get the number
        idx = int(idx.split(".")[1])
        pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
        make_dump_pread_to_ctg = PypeTask(inputs={
                                              "las_file": las_file,
                                              "pread_db": pread_db,
                                              "read_to_contig_map": read_to_contig_map,
                                              "rawread_id_file": rawread_id_file,
                                              "pread_id_file": pread_id_file,
                                          },
                                          outputs={"pread_to_contig_file": pread_to_contig_file},
                                          TaskType=PypeThreadTaskBase,
                                          URL="task://localhost/pread_to_contigs.%s" % idx)
        dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)
        wf.addTask(dump_pread_to_ctg_task)

    wf.refreshTargets()  # block
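# A minimal sketch (not part of the original pipeline) of how the
# read_to_contig_map file written above could be consumed downstream. The
# helper name is hypothetical; the column layout follows the
# "%09d %09d %s %s" format used by generate_read_to_ctg_map, i.e.
# "<pread_id> <rawread_id> <original_read_name> <contig_id>".
def load_read_to_contig_map(path):
    """Return {pread_id: (rawread_id, original_read_name, set_of_contig_ids)}."""
    mapping = {}
    with open(path) as f:
        for line in f:
            pid, rid, oid, ctg = line.strip().split()
            entry = mapping.setdefault(int(pid), (int(rid), oid, set()))
            entry[2].add(ctg)  # one pread can map to several contigs
    return mapping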
def make_dirs(d):
    if not os.path.isdir(d):
        os.makedirs(d)

rawread_dir = os.path.abspath("./0-rawreads")
pread_dir = os.path.abspath("./1-preads_ovl")
asm_dir = os.path.abspath(os.path.join("./3-unzip/"))

read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
make_dirs(read_map_dir)

PypeMPWorkflow.setNumThreadAllowed(12, 12)
wf = PypeMPWorkflow()

rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
rawread_id_file = makePypeLocalFile(os.path.join(rawread_dir, "raw_read_ids"))

@PypeTask(inputs={"rawread_db": rawread_db},
          outputs={"rawread_id_file": rawread_id_file},
          TaskType=PypeThreadTaskBase,
          URL="task://localhost/dump_rawread_ids")
def dump_rawread_ids(self):
    rawread_db = fn(self.rawread_db)
    rawread_id_file = fn(self.rawread_id_file)
    os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file))

wf.addTask(dump_rawread_ids)

pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
pread_id_file = makePypeLocalFile(os.path.join(pread_dir, "pread_ids"))
def simpleTest():
    wf = PypeWorkflow()

    # f1 and f2 are the mock input files
    f1 = makePypeLocalFile("test.fa")
    f2 = makePypeLocalFile("ref.fa")

    # f3 is the data object for the expected output of "testTask"
    f3 = makePypeLocalFile("aln.txt", readOnly=False)

    # create the mock files
    os.system("touch %s" % f1.localFileName)
    os.system("touch %s" % f2.localFileName)

    # testTask takes f1 (as "testTask.fasta") and f2 (as "testTask.ref") and generates f3 (as "testTask.aln")
    @PypeTask(inputDataObjs={"fasta": f1, "ref": f2},
              outputDataObjs={"aln": f3},
              parameters={"a": 10}, **{"b": 12})
    def testTask(*argv, **kwargv):
        print("testTask is running")
        print("fasta input filename is %s" % testTask.fasta.localFileName)
        for ft, f in testTask.outputDataObjs.iteritems():
            #os.system("touch %s" % f.localFileName)
            runShellCmd(["touch", "%s" % f.localFileName])
            runShellCmd(["sleep", "5"])

    # testTask2 takes f1 (as "testTask2.fasta") and f3 (as "testTask2.aln") and generates f4 (as "testTask2.aln2")
    f4 = makePypeLocalFile("aln2.txt", readOnly=False)

    @PypeTask(inputDataObjs={"fasta": f1, "aln": f3},
              outputDataObjs={"aln2": f4},
              parameters={"a": 10}, **{"b": 12})
    def testTask2(*argv, **kwargv):
        print("testTask2 is running")
        for ft, f in testTask2.outputDataObjs.iteritems():
            #os.system("touch %s" % f.localFileName)
            runShellCmd(["touch", "%s" % f.localFileName])

    # one can add the data objects to the workflow one by one
    #wf.addObjects([f1, f2, f3, f4])
    #wf.addObjects([testTask, testTask2])

    # or, one can add the "tasks" into the workflow; their input and output data objects are added automatically
    wf.addTasks([testTask, testTask2])

    # print the RDFXML document that represents the workflow
    print(wf.RDFXML)
    # and a graphviz dot for rendering the dependency graph, if one wants to visualize it
    print(wf.graphvizDot)

    # execute the workflow until f4 is updated
    wf.refreshTargets([f4])

    # mock the case where f1 is updated
    print("re-touch f1")
    os.system("sleep 1;touch %s;" % f1.localFileName)
    wf.refreshTargets([f4])

    # mock the case where f3 is updated
    print("re-touch f3")
    os.system("sleep 1;touch %s;" % f3.localFileName)
def run(wf, config,
        input_fofn_plf,
        setNumThreadAllowed,
        ):
    """
    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config['stop_all_jobs_on_failure']  # only matters for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    setNumThreadAllowed(concurrent_jobs, concurrent_jobs)

    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=MyFakePypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {
            "work_dir": rawread_dir,
            "sge_option": config["sge_option_da"],
            "config": config
        }

        length_cutoff_plf = makePypeLocalFile(os.path.join(rawread_dir, "length_cutoff"))
        raw_reads_db_plf = makePypeLocalFile(os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={
                                           "rdb_build_done": rdb_build_done,
                                           "raw_reads_db": raw_reads_db_plf,
                                           "length_cutoff": length_cutoff_plf,
                                           "run_jobs": run_jobs,
                                       },
                                       parameters=parameters,
                                       TaskType=MyFakePypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config)

        wf.addTasks(daligner_tasks)

        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))
        parameters = {
            "nblock": raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(inputs=daligner_out,
                                        outputs={"da_done": r_da_done},
                                        parameters=parameters,
                                        TaskType=MyFakePypeThreadTaskBase,
                                        URL="task://localhost/rda_check")
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "overlapping":
            sys.exit(0)

        consensus_tasks, consensus_out = create_consensus_tasks(
            rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={"cns_done": r_cns_done, "pread_fofn": pread_fofn},
                  TaskType=MyFakePypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)

        pre_assembly_report_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "pre_assembly_stats.json"))  # though technically it needs pread_fofn
        make_task = PypeTask(inputs={
                                 "length_cutoff_fn": length_cutoff_plf,
                                 "raw_reads_db": raw_reads_db_plf,
                                 "preads_fofn": pread_fofn,
                             },
                             outputs={
                                 "pre_assembly_report": pre_assembly_report_plf,
                             },
                             parameters=config,
                             TaskType=MyFakePypeThreadTaskBase,
                             URL="task://localhost/report_pre_assembly")
        task = make_task(task_report_pre_assembly)
        wf.addTask(task)

        concurrent_jobs = config["cns_concurrent_jobs"]
        setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "pre-assembly":
            log.info("Quitting after stage-0 for 'pre-assembly' target.")
            sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=MyFakePypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(os.path.join(pread_dir, "pdb_build_done"))
    parameters = {
        "work_dir": pread_dir,
        "sge_option": config["sge_option_pda"],
        "config": config
    }

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={
                                       "pdb_build_done": pdb_build_done,
                                       "preads_db": preads_db,
                                       "run_jobs": run_jobs,
                                   },
                                   parameters=parameters,
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    config["sge_option_da"] = config["sge_option_pda"]
    daligner_tasks, daligner_out = create_daligner_tasks(
        fn(run_jobs), pread_dir, "preads", pdb_build_done, config, pread_aln=True)
    wf.addTasks(daligner_tasks)

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))
    parameters = {
        "nblock": preads_nblock,
    }
    make_daligner_gather = PypeTask(inputs=daligner_out,
                                    outputs={"da_done": p_da_done},
                                    parameters=parameters,
                                    TaskType=MyFakePypeThreadTaskBase,
                                    URL="task://localhost/pda_check")
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    config["sge_option_la"] = config["sge_option_pla"]
    merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=MyFakePypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)

    concurrent_jobs = config["ovlp_concurrent_jobs"]
    setNumThreadAllowed(concurrent_jobs, concurrent_jobs)

    wf.refreshTargets(exitOnFailure=exitOnFailure)

    db2falcon_done = makePypeLocalFile(os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(inputs={"p_merge_done": p_merge_done, },
                                  outputs={"db2falcon_done": db2falcon_done},
                                  parameters={
                                      "wd": pread_dir,
                                      "config": config,
                                      "sge_option": config["sge_option_fc"],
                                  },
                                  TaskType=MyFakePypeThreadTaskBase,
                                  URL="task://localhost/db2falcon")
    wf.addTask(make_run_db2falcon(task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile(os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(inputs={"db2falcon_done": db2falcon_done, "db_file": preads_db},
                                   outputs={"falcon_asm_done": falcon_asm_done},
                                   parameters={
                                       "wd": falcon_asm_dir,
                                       "config": config,
                                       "pread_dir": pread_dir,
                                       "sge_option": config["sge_option_fc"],
                                   },
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/falcon_asm")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = support.get_config(support.parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    input_fofn_plf = makePypeLocalFile(os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {"work_dir": rawread_dir, "config": config}

        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={"rdb_build_done": rdb_build_done, "run_jobs": run_jobs},
                                       parameters=parameters,
                                       TaskType=PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", db_file, rdb_build_done, config)

        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq=60)  # a larger number is better for more jobs

        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        @PypeTask(inputs=daligner_out,
                  outputs={"da_done": r_da_done},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/rda_check")
        def check_r_da_task(self):
            os.system("touch %s" % fn(self.da_done))

        wf.addTask(check_r_da_task)
        # We must call refreshTargets() here to run the jobs; otherwise the
        # concurrency change below would not take effect for them.
        wf.refreshTargets(updateFreq=wait_time)  # a larger number is better for more jobs

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)

        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        if config["target"] == "overlapping":
            wf.refreshTargets(updateFreq=wait_time)  # run the merge jobs before quitting
            sys.exit(0)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={"cns_done": r_cns_done, "pread_fofn": pread_fofn},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            os.system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq=wait_time)  # a larger number is better for more jobs

        if config["target"] == "pre-assembly":
            sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={"pdb_build_done": pdb_build_done, "run_jobs": run_jobs},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads"))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(
        fn(run_jobs), pread_dir, "preads", db_file, pdb_build_done, config, pread_aln=True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq=30)  # a larger number is better for more jobs

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    @PypeTask(inputs=daligner_out,
              outputs={"da_done": p_da_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pda_check")
    def check_p_da_task(self):
        os.system("touch %s" % fn(self.da_done))

    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
        fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    #wf.refreshTargets(updateFreq=30)  # all

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        os.system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq=wait_time)  # all

    falcon_asm_done = makePypeLocalFile(os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(inputs={"p_merge_done": p_merge_done, "db_file": db_file},
                                   outputs={"falcon_asm_done": falcon_asm_done},
                                   parameters={"wd": falcon_asm_dir, "config": config, "pread_dir": pread_dir},
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/falcon")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets(updateFreq=wait_time)  # all
import os
import logging
import time
from pypeflow.task import PypeTask, PypeShellTask, PypeSGETask, PypeDistributibleTask, PypeThreadTaskBase
from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow, PypeMPWorkflow
from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn

logger = logging.getLogger()
#logger.setLevel(logging.INFO)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)

inputs = {"input": makePypeLocalFile("/tmp/test1_input")}
outputs = {"output": makePypeLocalFile("/tmp/test1_output")}
os.system("touch /tmp/test1_input")

@PypeTask(inputs=inputs, outputs=outputs, TaskType=PypeThreadTaskBase)
def f(self):
    # Poll for about a second, but exit early if the workflow signals shutdown.
    i = 0
    while 1:
        time.sleep(0.1)
        if self.shutdown_event is not None and self.shutdown_event.is_set():
            break
        if i > 10:
            break
        i += 1
    # Only produce the output if we were not asked to shut down.
    if self.shutdown_event is None or not self.shutdown_event.is_set():
        os.system("touch %s" % fn(self.output))
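# A hedged driver sketch (assumed usage, not in the original script): run the
# task above on the thread-based scheduler; refreshTargets() blocks until
# /tmp/test1_output exists or the task observes a shutdown request.
if __name__ == "__main__":
    PypeThreadWorkflow.setNumThreadAllowed(1, 1)
    wf = PypeThreadWorkflow()
    wf.addTask(f)
    wf.refreshTargets()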
"cwd": cwd, "sge_option": " -pe smp 6 -q huasm ", "script_fn": script_fn } run_script(job_data, job_type="SGE") wait_for_file(fn(self.job_done), task=self, job_name=job_name) if __name__ == "__main__": prefix = sys.argv[1] concurrent_jobs = 64 PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs) wf = PypeThreadWorkflow() job_id = 0 db_file = makePypeLocalFile(os.path.abspath("./%s.db" % prefix)) with open("run_jobs.sh") as f: for l in f: l = l.strip().split() if l[0] == "daligner": try: os.makedirs("./job_%05d" % job_id) except OSError: pass os.system( "cd ./job_%05d;ln -s ../.%s.bps .; ln -s ../.%s.idx .; ln -s ../%s.db ." % (job_id, prefix, prefix, prefix)) job_done = makePypeLocalFile( os.path.abspath("./job_%05d/job_%05d_done" % (job_id, job_id))) parameters = {
ctg_id = args.ctg_id
base_dir = args.base_dir

ref_seq = ""
for r in FastaReader(fasta_fn):
    rid = r.name.split()[0]
    if rid != ctg_id:
        continue
    ref_seq = r.sequence.upper()

PypeThreadWorkflow.setNumThreadAllowed(1, 1)
wf = PypeThreadWorkflow()

bam_file = makePypeLocalFile(bam_fn)
vmap_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, "variant_map"))
vpos_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, "variant_pos"))
q_id_map_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, "q_id_map"))

parameters = {}
parameters["ctg_id"] = ctg_id
parameters["ref_seq"] = ref_seq
parameters["base_dir"] = base_dir

make_het_call_task = PypeTask(inputs={"bam_file": bam_file},
                              outputs={"vmap_file": vmap_file,
                                       "vpos_file": vpos_file,
                                       "q_id_map_file": q_id_map_file},
                              parameters=parameters,
                              TaskType=PypeThreadTaskBase,
                              URL="task://localhost/het_call")(make_het_call)

wf.addTasks([make_het_call_task])
h_ctg_fa = FastaReader("./3-unzip/all_h_ctg.fa")
for r in h_ctg_fa:
    rid = r.name.split()[0]
    ref_seq_data[rid] = r.sequence
    ctg_types[rid] = "h"

ctg_ids = sorted(ref_seq_data.keys())
p_ctg_out = []
h_ctg_out = []
for ctg_id in ctg_ids:
    sequence = ref_seq_data[ctg_id]
    m_ctg_id = ctg_id.split("-")[0]
    wd = os.path.join(os.getcwd(), "./4-quiver/", m_ctg_id)
    mkdir(wd)
    ref_fasta = makePypeLocalFile(os.path.join(wd, "{ctg_id}_ref.fa".format(ctg_id=ctg_id)))
    read_sam = makePypeLocalFile(os.path.join(os.getcwd(), "./4-quiver/reads/"
                                              "{ctg_id}.sam".format(ctg_id=ctg_id)))
    cns_fasta = makePypeLocalFile(os.path.join(wd, "cns-{ctg_id}.fasta.gz".format(ctg_id=ctg_id)))
    cns_fastq = makePypeLocalFile(os.path.join(wd, "cns-{ctg_id}.fastq.gz".format(ctg_id=ctg_id)))
    job_done = makePypeLocalFile(os.path.join(wd, "{ctg_id}_quiver_done".format(ctg_id=ctg_id)))

    if os.path.exists(fn(read_sam)):
        if ctg_types[ctg_id] == "p":
            p_ctg_out.append((cns_fasta, cns_fastq))
        if ctg_types[ctg_id] == "h":
            h_ctg_out.append((cns_fasta, cns_fastq))
        if not os.path.exists(fn(ref_fasta)):
            with open(fn(ref_fasta), "w") as f:
                print >> f, ">" + ctg_id
                print >> f, sequence
        parameters = {"job_uid": "q-" + ctg_id,
                      "wd": wd,
                      "config": config,
                      "ctg_id": ctg_id}
sge_log_dir = os.path.abspath("./sge_log")

for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
    try:
        os.makedirs(d)
    except OSError:
        pass

config = get_config(sys.argv[1])
concurrent_jobs = config["pa_concurrent_jobs"]
PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
wf = PypeThreadWorkflow()

if config["input_type"] == "raw":
    #### import sequences into daligner DB
    input_h5_fofn = makePypeLocalFile(os.path.abspath(config["input_fofn_fn"]))
    rdb_build_done = makePypeLocalFile(os.path.join(rawread_dir, "rdb_build_done"))
    parameters = {"work_dir": rawread_dir, "config": config}
    make_build_rdb_task = PypeTask(inputs={"input_fofn": input_h5_fofn},
                                   outputs={"rdb_build_done": rdb_build_done},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase)
    build_rdb_task = make_build_rdb_task(build_rdb)

    wf.addTasks([build_rdb_task])
    wf.refreshTargets([rdb_build_done])
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
    merge_tasks = []
    consensus_tasks = []
    merge_out = {}
    consensus_out = {}
    mjob_data = {}

    with open(run_jobs_fn) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ("LAsort", "LAmerge", "mv"):
                continue
            if l[0] == "LAsort":
                # We now run this part w/ daligner, but we still need
                # a small script for some book-keeping.
                p_id = int(l[2].split(".")[1])
                mjob_data.setdefault(p_id, [])
                #mjob_data[p_id].append(" ".join(l))  # Already done w/ daligner!
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[2].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[2].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
            if l[0] == "mv":
                l2 = l[1].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[1].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[1].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))

    for p_id in mjob_data:
        s_data = mjob_data[p_id]

        support.make_dirs("%s/m_%05d" % (wd, p_id))
        support.make_dirs("%s/preads" % (wd))
        support.make_dirs("%s/las_files" % (wd))

        merge_script_file = os.path.abspath("%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id))
        with open(merge_script_file, "w") as merge_script:
            #print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix)
            for l in s_data:
                print >> merge_script, l
            print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id)
            print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id)

        job_done = makePypeLocalFile(os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)))
        parameters = {
            "merge_script": merge_script_file,
            "cwd": os.path.join(wd, "m_%05d" % p_id),
            "job_id": p_id,
            "config": config
        }
        make_merge_task = PypeTask(inputs={"input_dep": input_dep},
                                   outputs={"job_done": job_done},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/m_%05d_%s" % (p_id, db_prefix))
        merge_task = make_merge_task(task_run_las_merge)
        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)

        out_file = makePypeLocalFile(os.path.abspath("%s/preads/out.%05d.fasta" % (wd, p_id)))
        out_done = makePypeLocalFile(os.path.abspath("%s/preads/c_%05d_done" % (wd, p_id)))
        parameters = {
            "cwd": os.path.join(wd, "preads"),
            "job_id": p_id,
            "prefix": db_prefix,
            "config": config
        }
        make_c_task = PypeTask(inputs={"job_done": job_done},
                               outputs={"out_file": out_file, "out_done": out_done},
                               parameters=parameters,
                               TaskType=PypeThreadTaskBase,
                               URL="task://localhost/ct_%05d" % p_id)
        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done

    return merge_tasks, merge_out, consensus_tasks, consensus_out
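# For clarity, a self-contained sketch of the block-id rule the parser above
# applies to the dotted LAmerge/mv file names (this helper is not part of the
# original code, and the token shapes are illustrative): the block id is
# field 1 of the dotted name, unless field 1 carries an "L"-prefixed
# merge-level tag, in which case the id shifts to field 2.
def parse_block_id(dotted_name):
    parts = dotted_name.split(".")
    if parts[1][0] == "L":
        return int(parts[2])
    return int(parts[1])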
if l[0] == "LAsort": p_id = int( l[2].split(".")[1] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) if l[0] == "LAmerge": l2 = l[2].split(".") if l2[1] == "L2": p_id = int( l[2].split(".")[2] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) else: p_id = int( l[2].split(".")[1] ) mjob_data.setdefault( p_id, [] ) mjob_data[p_id].append( " ".join(l) ) db_file = makePypeLocalFile(os.path.abspath( "./%s.db" % prefix )) for p_id in mjob_data: s_data = mjob_data[p_id] try: os.makedirs("./p_%05d" % p_id) os.makedirs("./p_%05d/sge_log" % p_id) except OSError: pass try: os.makedirs("./preads") except OSError: pass try: os.makedirs("./las_files")
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)

    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
    rawread_id_file = makePypeLocalFile(os.path.join(read_map_dir, "raw_read_ids"))

    @PypeTask(inputs={"rawread_db": rawread_db},
              outputs={"rawread_id_file": rawread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_rawread_ids")
    def dump_rawread_ids(self):
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        os.system("DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" % (rawread_db, rawread_id_file))

    wf.addTask(dump_rawread_ids)

    pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
    pread_id_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_ids"))

    @PypeTask(inputs={"pread_db": pread_db},
              outputs={"pread_id_file": pread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_pread_ids")
    def dump_pread_ids(self):
        pread_db = fn(self.pread_db)
        pread_id_file = fn(self.pread_id_file)
        os.system("DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" % (pread_db, pread_id_file))

    wf.addTask(dump_pread_ids)

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, "sg_edges_list"))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, "utg_data"))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, "ctg_paths"))

    inputs = {
        "rawread_id_file": rawread_id_file,
        "pread_id_file": pread_id_file,
        "sg_edges_list": sg_edges_list,
        "utg_data": utg_data,
        "ctg_paths": ctg_paths
    }

    read_to_contig_map = makePypeLocalFile(os.path.join(read_map_dir, "read_to_contig_map"))

    @PypeTask(inputs=inputs,
              outputs={"read_to_contig_map": read_to_contig_map},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/get_ctg_read_map")
    def generate_read_to_ctg_map(self):
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        pread_did_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data), fn(self.ctg_paths))

        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    pid = int(n.split(":")[0])
                    rid = pread_did_to_rid[pid].split("/")[1]
                    rid = int(int(rid) / 10)
                    oid = rid_to_oid[rid]
                    k = (pid, rid, oid)
                    pread_to_contigs.setdefault(k, set())
                    pread_to_contigs[k].add(ctg)

            for k in pread_to_contigs:
                pid, rid, oid = k
                for ctg in list(pread_to_contigs[k]):
                    print >> f, "%09d %09d %s %s" % (pid, rid, oid, ctg)

    wf.addTask(generate_read_to_ctg_map)

    wf.refreshTargets()  # block
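# Assumed entry-point usage for the function above (a sketch only; the
# directory names follow the stage layout used elsewhere in this codebase):
if __name__ == "__main__":
    get_read_ctg_map(rawread_dir=os.path.abspath("./0-rawreads"),
                     pread_dir=os.path.abspath("./1-preads_ovl"),
                     asm_dir=os.path.abspath("./2-asm-falcon"))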