Example #1
def create_consensus_tasks(wd, db_prefix, config, p_ids_merge_job_done):
    consensus_tasks = []
    consensus_out = {}
    # Unlike the merge tasks, consensus occurs in a single directory.
    rdir = os.path.join(wd, 'preads')
    mkdir(rdir)
    for p_id, job_done in p_ids_merge_job_done:
        out_file = makePypeLocalFile(
            os.path.abspath("%s/preads/out.%05d.fasta" % (wd, p_id)))
        out_done = makePypeLocalFile(
            os.path.abspath("%s/preads/c_%05d_done" % (wd, p_id)))
        parameters = {
            "cwd": rdir,
            "job_id": p_id,
            "prefix": db_prefix,
            "config": config
        }
        make_c_task = PypeTask(inputs={"job_done": job_done},
                               outputs={
                                   "out_file": out_file,
                                   "out_done": out_done
                               },
                               parameters=parameters,
                               TaskType=MyFakePypeThreadTaskBase,
                               URL="task://localhost/ct_%05d" % p_id)
        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done
    return consensus_tasks, consensus_out
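Usage sketch (not part of the original snippet): the returned tasks are registered with a workflow and run, following the pattern in Example #22 further down; `wf`, `wd`, `config`, and `p_ids_merge_job_done` are assumed to come from earlier pipeline stages.

# Hypothetical wiring of the consensus stage into an existing workflow `wf`.
consensus_tasks, consensus_out = create_consensus_tasks(
    wd, "raw_reads", config, p_ids_merge_job_done)
wf.addTasks(consensus_tasks)   # one consensus job per merged block
wf.refreshTargets()            # run; each job touches its c_%05d_done file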
Example #2
def simpleTest2():

    wf = PypeWorkflow()

    f1 = makePypeLocalFile("test.fa")
    f2 = makePypeLocalFile("ref.fa")
    f3 = makePypeLocalFile("aln.txt", readOnly=False)
    f4 = makePypeLocalFile("aln2.txt", readOnly=False)

    os.system("touch %s" % f1.localFileName)
    os.system("touch %s" % f2.localFileName)

    @PypeTask(inputDataObjs={
        "fasta": f1,
        "ref": f2
    },
              outputDataObjs={"aln": f3},
              parameters={"a": 10},
              **{"b": 12})
    def testTask(*argv, **kwargv):
        print("testTask is running")
        for ft, f in testTask.outputDataObjs.iteritems():
            #os.system("touch %s" % f.localFileName)
            runShellCmd(["touch", "%s" % f.localFileName])
            runShellCmd(["sleep", "5"])

    @PypeTask(inputDataObjs={
        "fasta": f1,
        "aln": f3
    },
              outputDataObjs={"aln2": f4},
              parameters={"a": 10},
              **{"b": 12})
    def testTask2(*argv, **kwargv):
        print("testTask2 is running")
        for ft, f in testTask2.outputDataObjs.iteritems():
            #os.system("touch %s" % f.localFileName)
            runShellCmd(["touch", "%s" % f.localFileName])

    #wf.addObjects([f1,f2,f3,f4]) wf.addObjects([testTask, testTask2])

    wf.addTasks([testTask, testTask2])

    print(wf.RDFXML)
    print(wf.graphvizDot)

    #aGraph = PypeGraph(wf._RDFGraph) print(aGraph.tSort())

    wf.refreshTargets([f4])

    print("re-touch f1")
    os.system("sleep 1;touch %s;" % f1.localFileName)
    wf.refreshTargets([f4])

    print("re-touch f3")
    os.system("sleep 1;touch %s;" % f3.localFileName)
Example #3
def create_consensus_tasks(wd, db_prefix, config, p_ids_merge_job_done):
    consensus_tasks = []
    consensus_out = {}
    fasta_plfs = []
    for p_id, job_done in p_ids_merge_job_done:
        cns_label = 'cns_%05d' % p_id
        rdir = os.path.join(wd, 'preads', cns_label)
        mkdir(rdir)
        out_done = makePypeLocalFile(
            os.path.abspath("%s/%s_done" % (rdir, cns_label)))
        out_file = makePypeLocalFile(
            os.path.abspath("%s/%s.fasta" % (rdir, cns_label)))
        fasta_plfs.append(out_file)
        parameters = {
            "cwd": rdir,
            "job_id": p_id,
            "prefix": db_prefix,
            "sge_option": config["sge_option_cns"],
            "config": config
        }
        make_c_task = PypeTask(inputs={"job_done": job_done},
                               outputs={
                                   "out_file": out_file,
                                   "out_done": out_done
                               },
                               parameters=parameters,
                               TaskType=MyFakePypeThreadTaskBase,
                               URL="task://localhost/%s" % cns_label)
        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done

    r_cns_done_plf = makePypeLocalFile(os.path.join(wd, 'preads', "cns_done"))
    pread_fofn_plf = makePypeLocalFile(
        os.path.join(wd, 'preads', "input_preads.fofn"))

    @PypeTask(inputs=consensus_out,
              outputs={
                  "cns_done": r_cns_done_plf,
                  "pread_fofn": pread_fofn_plf
              },
              TaskType=MyFakePypeThreadTaskBase,
              URL="task://localhost/cns_check")
    def check_r_cns_task(self):
        with open(fn(self.pread_fofn), "w") as f:
            for fa_fn in sorted(fn(plf) for plf in fasta_plfs):
                print >> f, fa_fn
        system("touch %s" % fn(self.cns_done))

    consensus_tasks.append(check_r_cns_task)
    return consensus_tasks, pread_fofn_plf
Example #4
def create_merge_gather_task(wd, inputs):
    las_fofn_plf = makePypeLocalFile(os.path.join(wd, 'las.fofn'))
    las_fopfn_plf = makePypeLocalFile(os.path.join(wd, 'las.fopfn'))

    make_task = PypeTask(
        inputs=inputs,  # p_ids_merged_las
        outputs={
            'las_fofn': las_fofn_plf,
            'las_fopfn': las_fopfn_plf,
        },
        TaskType=MyFakePypeThreadTaskBase,
    )
    #                     URL = 'task://localhost/pmerge_gather')
    task = make_task(pype_tasks.task_merge_gather)
    return task, las_fofn_plf, las_fopfn_plf
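Usage sketch (assumed, not from the original source): the gather task takes the dict of per-block merge outputs (`p_ids_merged_las`, per the inline comment) and is registered like any other task.

# Hypothetical wiring; `wf`, `wd`, and `p_ids_merged_las` come from earlier steps.
task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(wd, p_ids_merged_las)
wf.addTasks([task])
wf.refreshTargets()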
Example #6
def create_daligner_tasks(wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    job_id = 0
    tasks = []
    tasks_out = {}

    nblock = 1
    new_db = True
    if os.path.exists(fn(db_file)):
        with open(fn(db_file)) as f:
            for l in f:
                l = l.strip().split()
                if l[0] == "blocks" and l[1] == "=":
                    nblock = int(l[2])
                    new_db = False
                    break

    for pid in xrange(1, nblock + 1):
        support.make_dirs("%s/m_%05d" % (wd, pid))

    with open(os.path.join(wd, "run_jobs.sh")) as f:
        for l in f:
            l = l.strip()
            job_uid = hashlib.md5(l).hexdigest()
            job_uid = job_uid[:8]
            l = l.split()
            if l[0] == "daligner":
                support.make_dirs(os.path.join(wd, "./job_%s" % job_uid))
                call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (
                    wd,
                    job_uid,
                    db_prefix,
                    db_prefix,
                    db_prefix,
                )
                rc = os.system(call)
                if rc:
                    raise Exception("Failure in system call: %r -> %d" % (call, rc))
                job_done = makePypeLocalFile(os.path.abspath("%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)))
                if pread_aln == True:
                    l[0] = "daligner_p"
                parameters = {
                    "daligner_cmd": " ".join(l),
                    "cwd": os.path.join(wd, "job_%s" % job_uid),
                    "job_uid": job_uid,
                    "config": config,
                    "nblock": nblock,
                    "db_prefix": db_prefix,
                }
                make_daligner_task = PypeTask(
                    inputs={"rdb_build_done": rdb_build_done},
                    outputs={"job_done": job_done},
                    parameters=parameters,
                    TaskType=PypeThreadTaskBase,
                    URL="task://localhost/d_%s_%s" % (job_uid, db_prefix),
                )
                daligner_task = make_daligner_task(run_daligner)
                tasks.append(daligner_task)
                tasks_out["ajob_%s" % job_uid] = job_done
                job_id += 1
    return tasks, tasks_out
Example #7
def create_daligner_tasks(run_jobs_fn, wd, db_prefix, rdb_build_done, nblock, config, pread_aln=False):
    tasks = []
    tasks_out = {}
    skip_checks = config.get('skip_checks')
    fc_run_logger.info('Skip LAcheck after daligner? {}'.format(skip_checks))
    for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, rdb_build_done, nblock, pread_aln, skip_checks):
        run_dir = "job_%s" %job_uid
        cwd = os.path.join(wd, run_dir)
        job_done_fn = os.path.abspath(os.path.join(cwd, "job_%s_done" %job_uid))
        job_done = makePypeLocalFile(job_done_fn)
        parameters =  {"daligner_script": script,
                       "cwd": cwd,
                       "job_uid": job_uid,
                       "config": config,
                       "sge_option": config["sge_option_da"],
                       "db_prefix": db_prefix}
        make_daligner_task = PypeTask(inputs = {"rdb_build_done": rdb_build_done},
                                      outputs = {"job_done": job_done},
                                      parameters = parameters,
                                      TaskType = MyFakePypeThreadTaskBase,
                                      URL = "task://localhost/d_%s_%s" %(job_uid, db_prefix))
        daligner_task = make_daligner_task(task_run_daligner)
        tasks.append(daligner_task)
        tasks_out[ "ajob_%s" % job_uid ] = job_done
    return tasks, tasks_out
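Sketch of the usual follow-up (an assumption, modeled on the `check_r_da_task` pattern in Example #22): a single gating task that depends on every per-job "done" file before merging starts. `wf`, `run_jobs_fn`, `wd`, `db_prefix`, `rdb_build_done`, `nblock`, `config`, and the helpers `fn`/`system` are assumed, as in the snippets above.

tasks, tasks_out = create_daligner_tasks(
    run_jobs_fn, wd, db_prefix, rdb_build_done, nblock, config)
r_da_done = makePypeLocalFile(os.path.join(wd, "da_done"))

@PypeTask(inputs=tasks_out,
          outputs={"da_done": r_da_done},
          TaskType=MyFakePypeThreadTaskBase,
          URL="task://localhost/da_check")
def check_da_task(self):
    system("touch %s" % fn(self.da_done))

wf.addTasks(tasks + [check_da_task])
wf.refreshTargets()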
Example #8
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    try:
        config = support.get_dict_from_old_falcon_cfg(
            support.parse_config(input_config_fn))
    except Exception:
        fc_run_logger.exception(
            'Failed to parse config "{}".'.format(input_config_fn))
        raise
    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    #Workflow = PypeProcWatcherWorkflow
    wf = PypeProcWatcherWorkflow(
        job_type=config['job_type'],
        job_queue=config['job_queue'],
        sge_option=config.get('sge_option', ''),
        watcher_type=config['pwatcher_type'],
        watcher_directory=config['pwatcher_directory'])
    run(wf,
        config,
        os.path.abspath(input_config_fn),
        input_fofn_plf=input_fofn_plf,
        setNumThreadAllowed=PypeProcWatcherWorkflow.setNumThreadAllowed)
Example #9
def create_daligner_tasks(run_jobs_fn,
                          wd,
                          db_prefix,
                          rdb_build_done,
                          config,
                          pread_aln=False):
    tasks = []
    tasks_out = {}
    for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix,
                                                 rdb_build_done, pread_aln):
        run_dir = "job_%s" % job_uid
        cwd = os.path.join(wd, run_dir)
        job_done_fn = os.path.abspath(
            os.path.join(cwd, "job_%s_done" % job_uid))
        job_done = makePypeLocalFile(job_done_fn)
        parameters = {
            "daligner_script": script,
            "cwd": cwd,
            "job_uid": job_uid,
            "config": config,
            "db_prefix": db_prefix
        }
        make_daligner_task = PypeTask(
            inputs={"rdb_build_done": rdb_build_done},
            outputs={"job_done": job_done},
            parameters=parameters,
            TaskType=MyFakePypeThreadTaskBase,
            URL="task://localhost/d_%s_%s" % (job_uid, db_prefix))
        daligner_task = make_daligner_task(task_run_daligner)
        tasks.append(daligner_task)
        tasks_out["ajob_%s" % job_uid] = job_done
    return tasks, tasks_out
Example #10
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
    merge_tasks = []
    merge_out = {}
    p_ids_merge_job_done = []  # for consensus

    merge_scripts = bash.scripts_merge(config, db_prefix, run_jobs_fn)
    for p_id, merge_script in merge_scripts:
        job_done = makePypeLocalFile(
            os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)))
        parameters = {
            "merge_script": merge_script,
            "cwd": os.path.join(wd, "m_%05d" % p_id),
            "job_id": p_id,
            "config": config
        }
        make_merge_task = PypeTask(inputs={"input_dep": input_dep},
                                   outputs={"job_done": job_done},
                                   parameters=parameters,
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/m_%05d_%s" %
                                   (p_id, db_prefix))
        merge_task = make_merge_task(task_run_las_merge)
        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)
        p_ids_merge_job_done.append((p_id, job_done))
    return merge_tasks, merge_out, p_ids_merge_job_done
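The third return value is consumed by create_consensus_tasks (Example #1); a chaining sketch, assuming `wf`, `run_jobs_fn`, `wd`, `db_prefix`, `input_dep`, and `config` come from earlier stages:

merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(
    run_jobs_fn, wd, db_prefix, input_dep, config)
wf.addTasks(merge_tasks)
consensus_tasks, consensus_out = create_consensus_tasks(
    wd, db_prefix, config, p_ids_merge_job_done)
wf.addTasks(consensus_tasks)
wf.refreshTargets()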
Example #11
def create_daligner_tasks(wd, db_prefix, db_file, rdb_build_done, config, pread_aln = False):

    job_id = 0
    tasks = []
    tasks_out = {}
    with open(os.path.join(wd,  "run_jobs.sh")) as f :
        for l in f :
            l = l.strip().split()
            if l[0] == "daligner":
                try:
                    os.makedirs(os.path.join( wd, "./job_%05d" % job_id))
                except OSError:
                    pass
                os.system("cd %s/job_%05d;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (wd, job_id, db_prefix, db_prefix, db_prefix) )
                job_done = makePypeLocalFile(os.path.abspath( "%s/job_%05d/job_%05d_done" % (wd, job_id, job_id)  ))
                if pread_aln == True:
                    l[0] = "daligner_p"
                parameters =  {"daligner_cmd": " ".join(l),
                               "cwd": os.path.join(wd, "job_%05d" % job_id),
                               "job_id": job_id,
                               "config": config}
                make_daligner_task = PypeTask( inputs = {"rdb_build_done": rdb_build_done},
                                               outputs = {"job_done": job_done},
                                               parameters = parameters,
                                               TaskType = PypeThreadTaskBase,
                                               URL = "task://localhost/d_%05d_%s" % (job_id, db_prefix) )
                daligner_task = make_daligner_task ( run_daligner )
                tasks.append( daligner_task )
                tasks_out[ "ajob_%d" % job_id ] = job_done
                job_id += 1
    return tasks, tasks_out
Example #12

def create_daligner_tasks(wd, db_prefix, db_file, rdb_build_done, config, pread_aln = False):

    import hashlib
    job_id = 0
    tasks = []
    tasks_out = {}

    nblock = 1
    new_db = True
    if os.path.exists( os.path.join(wd, "%s.db" % db_prefix) ):
        with open(  os.path.join(wd, "%s.db" % db_prefix) ) as f:
            for l in f:
                l = l.strip().split()
                if l[0] == "blocks" and l[1] == "=":
                    nblock = int(l[2])
                    new_db = False
                    break

    for pid in xrange(1, nblock + 1):
        try:
            os.makedirs("%s/m_%05d" % (wd, pid))
        except OSError:
            pass


    with open(os.path.join(wd,  "run_jobs.sh")) as f :
        for l in f :
            l = l.strip()
            job_uid = hashlib.md5(l).hexdigest()
            job_uid = job_uid[:8]
            l = l.split()
            if l[0] == "daligner":
                try:
                    os.makedirs(os.path.join( wd, "./job_%s" % job_uid))
                except OSError:
                    pass
                os.system("cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (wd, job_uid, db_prefix, db_pr
efix, db_prefix) )
                job_done = makePypeLocalFile(os.path.abspath( "%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)  ))
                if pread_aln == True:
                    l[0] = "daligner_p"
                parameters =  {"daligner_cmd": " ".join(l),
                               "cwd": os.path.join(wd, "job_%s" % job_uid),
                               "job_uid": job_uid,
                               "config": config,
                               "nblock": nblock,
                               "db_prefix": db_prefix}
                make_daligner_task = PypeTask( inputs = {"rdb_build_done": rdb_build_done},
                                               outputs = {"job_done": job_done},
                                               parameters = parameters,
                                               TaskType = PypeThreadTaskBase,
                                               URL = "task://localhost/d_%s_%s" % (job_uid, db_prefix) )
                daligner_task = make_daligner_task ( run_daligner )
                tasks.append( daligner_task )
                tasks_out[ "ajob_%s" % job_uid ] = job_done
                job_id += 1
    return tasks, tasks_out
Example #13
def main():
    lfn = 'logging-cfg.json'
    if os.path.exists(lfn):
        logging.config.dictConfig(json.load(open(lfn)))
    else:
        logging.basicConfig()
        logging.getLogger().setLevel(logging.NOTSET)
        try:
            import logging_tree
            logging_tree.printout()
        except ImportError:
            pass
    log.debug('DEBUG LOGGING ON')
    log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format(
        JOB_TYPE, SLEEP_S))
    exitOnFailure=False
    concurrent_jobs=2
    #Workflow = pypeflow.controller.PypeThreadWorkflow
    Workflow = PypeProcWatcherWorkflow
    Workflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = Workflow(job_type=JOB_TYPE)

    par = dict(sleep_s=SLEEP_S)
    DIR ='mytmp'
    makedirs(DIR)
    f0 = makePypeLocalFile('mytmp/f0')
    f1 = makePypeLocalFile('mytmp/f1')
    make_task = PypeTask(
            #inputs = {'f': f},
            outputs = {'f0': f0},
            parameters = par,
            TaskType = MyFakePypeThreadTaskBase)
    task = make_task(taskrun0)
    wf.addTasks([task])
    make_task = PypeTask(
            inputs = {'f0': f0},
            outputs = {'f1': f1},
            parameters = par,
            TaskType = MyFakePypeThreadTaskBase)
    task = make_task(taskrun1)
    wf.addTasks([task])
    wf.refreshTargets([task])
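`taskrun0` and `taskrun1` are not shown in this snippet; a plausible minimal definition (an assumption, following the self.parameters / fn(self.<output>) access pattern used elsewhere on this page) would be:

import time

def taskrun0(self):
    # hypothetical body: wait, then create the declared output file
    time.sleep(self.parameters['sleep_s'])
    open(fn(self.f0), 'w').close()

def taskrun1(self):
    time.sleep(self.parameters['sleep_s'])
    open(fn(self.f1), 'w').close()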
Example #14
def create_consensus_gather_task(wd, inputs):
    # Happens only in stage-0.
    preads_fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))

    make_cns_gather_task = PypeTask(
        inputs=inputs,  # consensus_out
        outputs={'preads_fofn': preads_fofn_plf},
        TaskType=MyFakePypeThreadTaskBase,
        URL='task://localhost/cns_gather')
    task = make_cns_gather_task(pype_tasks.task_cns_gather)
    return task, preads_fofn_plf
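Chaining sketch (assumed): the `consensus_out` dict from Example #1 is the sort of input this gather task expects.

# `wf`, `wd`, and `consensus_out` are assumed from the consensus stage.
cns_gather_task, preads_fofn_plf = create_consensus_gather_task(wd, consensus_out)
wf.addTasks([cns_gather_task])
wf.refreshTargets()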
Example #15
def main():
    lfn = 'logging-cfg.json'
    if os.path.exists(lfn):
        logging.config.dictConfig(json.load(open(lfn)))
    else:
        logging.basicConfig()
        logging.getLogger().setLevel(logging.NOTSET)
        try:
            import logging_tree
            logging_tree.printout()
        except ImportError:
            pass
    log.debug('DEBUG LOGGING ON')
    log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format(
        JOB_TYPE, SLEEP_S))
    exitOnFailure = False
    concurrent_jobs = 2
    #Workflow = pypeflow.controller.PypeThreadWorkflow
    Workflow = PypeProcWatcherWorkflow
    Workflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = Workflow(job_type=JOB_TYPE)

    par = dict(sleep_s=SLEEP_S)
    DIR = 'mytmp'
    makedirs(DIR)
    f0 = makePypeLocalFile('mytmp/f0')
    f1 = makePypeLocalFile('mytmp/f1')
    make_task = PypeTask(
        #inputs = {'f': f},
        outputs={'f0': f0},
        parameters=par,
        TaskType=MyFakePypeThreadTaskBase)
    task = make_task(taskrun0)
    wf.addTasks([task])
    make_task = PypeTask(inputs={'f0': f0},
                         outputs={'f1': f1},
                         parameters=par,
                         TaskType=MyFakePypeThreadTaskBase)
    task = make_task(taskrun1)
    wf.addTasks([task])
    wf.refreshTargets([task])
Example #16
def create_consensus_tasks(wd, db_prefix, config, p_ids_merge_job_done):
    consensus_tasks = []
    consensus_out ={}
    # Unlike the merge tasks, consensus occurs in a single directory.
    rdir = os.path.join(wd, 'preads')
    mkdir(rdir)
    for p_id, job_done in p_ids_merge_job_done:
        out_file = makePypeLocalFile(os.path.abspath("%s/preads/out.%05d.fasta" % (wd, p_id)))
        out_done = makePypeLocalFile(os.path.abspath("%s/preads/c_%05d_done" % (wd, p_id)))
        parameters =  {"cwd": rdir,
                       "job_id": p_id, 
                       "prefix": db_prefix,
                       "config": config}
        make_c_task = PypeTask(inputs = {"job_done": job_done},
                               outputs = {"out_file": out_file, "out_done": out_done},
                               parameters = parameters,
                               TaskType = PypeThreadTaskBase,
                               URL = "task://localhost/ct_%05d" % p_id)
        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done 
    return consensus_tasks, consensus_out
Example #17
def create_daligner_tasks(run_jobs_fn,
                          wd,
                          db_prefix,
                          db_file,
                          rdb_build_done,
                          config,
                          pread_aln=False):
    job_id = 0
    tasks = []
    tasks_out = {}

    nblock = get_nblock(fn(db_file))

    xform_script = get_script_xformer(pread_aln)

    line_count = 0
    job_descs = get_daligner_job_descriptions(open(run_jobs_fn), db_prefix)
    for desc, bash in job_descs.iteritems():
        #job_uid = hashlib.md5(bash).hexdigest()
        #job_uid = job_uid[:8]
        job_uid = '%08d' % line_count
        line_count += 1

        support.make_dirs(os.path.join(wd, "./job_%s" % job_uid))
        call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (
            wd, job_uid, db_prefix, db_prefix, db_prefix)
        rc = system(call)
        if rc:
            raise Exception("Failure in system call: %r -> %d" % (call, rc))
        job_done = makePypeLocalFile(
            os.path.abspath("%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)))
        bash = xform_script(bash)
        parameters = {
            "daligner_cmd": bash,
            "cwd": os.path.join(wd, "job_%s" % job_uid),
            "job_uid": job_uid,
            "config": config,
            "nblock": nblock,
            "db_prefix": db_prefix
        }
        make_daligner_task = PypeTask(
            inputs={"rdb_build_done": rdb_build_done},
            outputs={"job_done": job_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/d_%s_%s" % (job_uid, db_prefix))
        daligner_task = make_daligner_task(task_run_daligner)
        tasks.append(daligner_task)
        tasks_out["ajob_%s" % job_uid] = job_done
        job_id += 1
    return tasks, tasks_out
Example #18
def create_daligner_tasks(run_jobs_fn, wd, db_prefix, db_file, rdb_build_done, config, pread_aln=False):
    job_id = 0
    tasks = []
    tasks_out = {}

    nblock = get_nblock(fn(db_file))

    xform_script = get_script_xformer(pread_aln)

    line_count = 0
    job_descs = get_daligner_job_descriptions(open(run_jobs_fn), db_prefix)
    for desc, bash in job_descs.iteritems():
        # job_uid = hashlib.md5(bash).hexdigest()
        # job_uid = job_uid[:8]
        job_uid = "%08d" % line_count
        line_count += 1

        support.make_dirs(os.path.join(wd, "./job_%s" % job_uid))
        call = "cd %s/job_%s;ln -sf ../.%s.bps .; ln -sf ../.%s.idx .; ln -sf ../%s.db ." % (
            wd,
            job_uid,
            db_prefix,
            db_prefix,
            db_prefix,
        )
        rc = system(call)
        if rc:
            raise Exception("Failure in system call: %r -> %d" % (call, rc))
        job_done = makePypeLocalFile(os.path.abspath("%s/job_%s/job_%s_done" % (wd, job_uid, job_uid)))
        bash = xform_script(bash)
        parameters = {
            "daligner_cmd": bash,
            "cwd": os.path.join(wd, "job_%s" % job_uid),
            "job_uid": job_uid,
            "config": config,
            "nblock": nblock,
            "db_prefix": db_prefix,
        }
        make_daligner_task = PypeTask(
            inputs={"rdb_build_done": rdb_build_done},
            outputs={"job_done": job_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/d_%s_%s" % (job_uid, db_prefix),
        )
        daligner_task = make_daligner_task(task_run_daligner)
        tasks.append(daligner_task)
        tasks_out["ajob_%s" % job_uid] = job_done
        job_id += 1
    return tasks, tasks_out
Example #19
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    try:
        config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
    except Exception:
        fc_run_logger.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise
    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    #Workflow = PypeProcWatcherWorkflow
    wf = PypeProcWatcherWorkflow(job_type=config['job_type'])
    run(wf, config,
            input_fofn_plf=input_fofn_plf,
            setNumThreadAllowed=PypeProcWatcherWorkflow.setNumThreadAllowed)
Example #20
def create_daligner_tasks(run_jobs_fn, wd, db_prefix, rdb_build_done, config, pread_aln=False):
    tasks = []
    tasks_out = {}
    for job_uid, script in bash.scripts_daligner(run_jobs_fn, db_prefix, rdb_build_done, pread_aln):
        run_dir = "job_%s" %job_uid
        cwd = os.path.join(wd, run_dir)
        job_done_fn = os.path.abspath(os.path.join(cwd, "job_%s_done" %job_uid))
        job_done = makePypeLocalFile(job_done_fn)
        parameters =  {"daligner_script": script,
                       "cwd": cwd,
                       "job_uid": job_uid,
                       "config": config,
                       "db_prefix": db_prefix}
        make_daligner_task = PypeTask(inputs = {"rdb_build_done": rdb_build_done},
                                      outputs = {"job_done": job_done},
                                      parameters = parameters,
                                      TaskType = PypeThreadTaskBase,
                                      URL = "task://localhost/d_%s_%s" %(job_uid, db_prefix))
        daligner_task = make_daligner_task(task_run_daligner)
        tasks.append(daligner_task)
        tasks_out[ "ajob_%s" % job_uid ] = job_done
    return tasks, tasks_out
Example #21
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
    merge_tasks = []
    merge_out = {}
    p_ids_merge_job_done = [] # for consensus

    merge_scripts = bash.scripts_merge(config, db_prefix, run_jobs_fn)
    for p_id, merge_script in merge_scripts:
        job_done = makePypeLocalFile(os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)))
        parameters =  {"merge_script": merge_script,
                       "cwd": os.path.join(wd, "m_%05d" % p_id),
                       "job_id": p_id,
                       "config": config}
        make_merge_task = PypeTask(inputs = {"input_dep": input_dep},
                                   outputs = {"job_done": job_done},
                                   parameters = parameters,
                                   TaskType = PypeThreadTaskBase,
                                   URL = "task://localhost/m_%05d_%s" % (p_id, db_prefix))
        merge_task = make_merge_task(task_run_las_merge)
        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)
        p_ids_merge_job_done.append((p_id, job_done))
    return merge_tasks, merge_out, p_ids_merge_job_done
Example #22
def main1(prog_name, input_config_fn, logger_config_fn=None):
    setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = get_config(parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        make_dirs(d)

    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    input_fofn_plf = makePypeLocalFile(
        os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        parameters = {"work_dir": rawread_dir, "config": config}

        make_build_rdb_task = PypeTask(
            inputs={"input_fofn": rawread_fofn_plf},
            outputs={"rdb_build_done": rdb_build_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase)

        build_rdb_task = make_build_rdb_task(build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            rawread_dir, "raw_reads", db_file, rdb_build_done, config)

        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs

        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        @PypeTask(inputs=daligner_out,
                  outputs={"da_done": r_da_done},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/rda_check")
        def check_r_da_task(self):
            os.system("touch %s" % fn(self.da_done))

        wf.addTask(check_r_da_task)
        wf.refreshTargets(
            updateFreq=wait_time
        )  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs,
                                               concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
            rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        if config["target"] == "overlapping":
            wf.refreshTargets(
                updateFreq=wait_time
            )  # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            sys.exit(0)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={
                      "cns_done": r_cns_done,
                      "pread_fofn": pread_fofn
                  },
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            os.system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)
        wf.refreshTargets(
            updateFreq=wait_time)  # larger number better for more jobs

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}

    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={"pdb_build_done": pdb_build_done},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads"))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(pread_dir,
                                                         "preads",
                                                         db_file,
                                                         pdb_build_done,
                                                         config,
                                                         pread_aln=True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    @PypeTask(inputs=daligner_out,
              outputs={"da_done": p_da_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pda_check")
    def check_p_da_task(self):
        os.system("touch %s" % fn(self.da_done))

    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
        pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    #wf.refreshTargets(updateFreq = 30) #all

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        os.system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq=wait_time)  #all

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))

    @PypeTask(inputs={
        "p_merge_done": p_merge_done,
        "db_file": db_file
    },
              outputs={"falcon_asm_done": falcon_asm_done},
              parameters={
                  "wd": falcon_asm_dir,
                  "config": config,
                  "pread_dir": pread_dir
              },
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/falcon")
    def run_falcon_asm_task(self):
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join(wd)
        script_fn = os.path.join(script_dir, "run_falcon_asm.sh")

        script = []
        script.append("set -vex")
        script.append("trap 'touch %s.exit' EXIT" % fn(self.falcon_asm_done))
        script.append("source {install_prefix}/bin/activate".format(
            install_prefix=install_prefix))
        script.append("cd %s" % pread_dir)
        # Write preads4falcon.fasta, in 1-preads_ovl:
        script.append("DB2Falcon -U preads")
        script.append("cd %s" % wd)
        script.append("""find %s/las_files -name "*.las" > las.fofn """ %
                      pread_dir)
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        script.append( """fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\
                (fn(db_file), overlap_filtering_setting, length_cutoff_pr) )
        script.append("ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append(
            """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log"""
            % length_cutoff_pr)  # TODO: drop this logfile
        # Write 'p_ctg.fa' and 'a_ctg.fa':
        script.append("""fc_graph_to_contig.py""")
        script.append("""touch %s""" % fn(self.falcon_asm_done))

        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))

        job_name = self.URL.split("/")[-1]
        job_name += "-" + str(uuid.uuid4())[:8]
        job_data = {
            "job_name": job_name,
            "cwd": wd,
            "sge_option": config["sge_option_fc"],
            "script_fn": script_fn
        }
        run_script(job_data, job_type=config["job_type"])
        wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_name)

    wf.addTask(run_falcon_asm_task)
    wf.refreshTargets(updateFreq=wait_time)  #all
Example #23
def generate_read_to_contig_map(rawread_dir=rawread_dir,
                                pread_dir=pread_dir,
                                asm_dir=asm_dir):

    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)

    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
    rawread_id_file = makePypeLocalFile(
        os.path.join(rawread_dir, "raw_read_ids"))

    @PypeTask(inputs={"rawread_db": rawread_db},
              outputs={"rawread_id_file": rawread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_rawread_ids")
    def dump_rawread_ids(self):
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" %
                  (rawread_db, rawread_id_file))

    wf.addTask(dump_rawread_ids)

    pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
    pread_id_file = makePypeLocalFile(os.path.join(pread_dir, "pread_ids"))

    @PypeTask(inputs={"pread_db": pread_db},
              outputs={"pread_id_file": pread_id_file},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/dump_pread_ids")
    def dump_pread_ids(self):
        pread_db = fn(self.pread_db)
        pread_id_file = fn(self.pread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" %
                  (pread_db, pread_id_file))

    wf.addTask(dump_pread_ids)

    all_raw_las_files = {}
    for las_fn in glob.glob(os.path.join(rawread_dir, "raw_reads.*.las")):
        idx = las_fn.split("/")[
            -1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_raw_las_files["r_las_%s" % idx] = las_file

    all_pread_las_files = {}
    for las_fn in glob.glob(os.path.join(pread_dir, "preads.*.las")):
        idx = las_fn.split("/")[
            -1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_pread_las_files["p_las_%s" % idx] = las_file

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, "sg_edges_list"))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, "utg_data"))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, "ctg_paths"))

    inputs = {
        "rawread_id_file": rawread_id_file,
        "pread_id_file": pread_id_file,
        "sg_edges_list": sg_edges_list,
        "utg_data": utg_data,
        "ctg_paths": ctg_paths
    }

    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, "read_to_contig_map"))

    @PypeTask(inputs=inputs,
              outputs={"read_to_contig_map": read_to_contig_map},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/get_ctg_read_map")
    def generate_read_to_ctg_map(self):
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        pread_did_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data),
                         fn(self.ctg_paths))

        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    pid = int(n.split(":")[0])

                    rid = pread_did_to_rid[pid].split("/")[1]
                    rid = int(int(rid) / 10)
                    oid = rid_to_oid[rid]
                    k = (pid, rid, oid)
                    pread_to_contigs.setdefault(k, set())
                    pread_to_contigs[k].add(ctg)

            for k in pread_to_contigs:
                pid, rid, oid = k
                for ctg in list(pread_to_contigs[k]):
                    print >> f, "%09d %09d %s %s" % (pid, rid, oid, ctg)

    wf.addTask(generate_read_to_ctg_map)

    def dump_rawread_to_ctg(self):
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        #pread_id_file = fn( self.pread_id_file )
        las_file = fn(self.las_file)
        rawread_to_contig_file = fn(self.rawread_to_contig_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        rid_to_oid = open(rawread_id_file).read().split("\n")
        #pread_did_to_rid = open(pread_id_file).read().split("\n")

        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        rid_to_contigs = {}

        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                rid = int(rid)
                rid_to_contigs.setdefault(rid, (oid, set()))
                rid_to_contigs[rid][1].add(ctg)

        with open(rawread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            for row in sp.check_output(
                    shlex.split("LA4Falcon -m %s %s " %
                                (rawread_db, las_file))).splitlines():

                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id == None:
                        cur_read_id = q_id
                    else:
                        if len(ovlp_data) == 0:
                            o_id = rid_to_oid[cur_read_id]
                            print >> f, "%09d %s %s %d %d %d %d" % (
                                cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (
                                    q_id_, o_id, ctg, count, rank, score,
                                    in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id

                if q_id in rid_to_contigs and len(
                        ovlp_data) == 0:  #if the query is in some contig....
                    t_o_id, ctgs = rid_to_contigs[q_id]
                    o_id = rid_to_oid[q_id]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1

                if t_id not in rid_to_contigs:
                    continue

                t_o_id, ctgs = rid_to_contigs[t_id]
                o_id = rid_to_oid[q_id]

                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1

            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (
                        q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    def dump_pread_to_ctg(self):
        pread_db = fn(self.pread_db)
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        las_file = fn(self.las_file)
        pread_to_contig_file = fn(self.pread_to_contig_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        pid_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        pid_to_contigs = {}

        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                pid = int(pid)
                pid_to_contigs.setdefault(pid, (oid, set()))
                pid_to_contigs[pid][1].add(ctg)

        with open(pread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            skip_rest = 0
            for row in sp.check_output(
                    shlex.split("LA4Falcon -mo %s %s " %
                                (pread_db, las_file))).splitlines():

                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id == None:
                        cur_read_id = q_id
                    else:
                        if len(ovlp_data) == 0:
                            rid = pid_to_rid[cur_read_id].split("/")[1]
                            rid = int(int(rid) / 10)
                            o_id = rid_to_oid[rid]
                            print >> f, "%09d %s %s %d %d %d %d" % (
                                cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (
                                    q_id_, o_id, ctg, count, rank, score,
                                    in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id
                        skip_rest = 0

                if q_id in pid_to_contigs and len(
                        ovlp_data) == 0:  #if the query is in some contig....
                    t_o_id, ctgs = pid_to_contigs[q_id]
                    rid = pid_to_rid[q_id].split("/")[1]
                    rid = int(int(rid) / 10)
                    o_id = rid_to_oid[rid]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1
                    skip_rest = 1

                if skip_rest == 1:
                    continue

                if t_id not in pid_to_contigs:
                    continue

                t_o_id, ctgs = pid_to_contigs[t_id]
                rid = pid_to_rid[q_id].split("/")[1]
                rid = int(int(rid) / 10)
                o_id = rid_to_oid[rid]

                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1

            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (
                        q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    for las_key, las_file in all_raw_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[
            -1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        rawread_to_contig_file = makePypeLocalFile(
            os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
        make_dump_rawread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "rawread_db": rawread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file
            },
            outputs={"rawread_to_contig_file": rawread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/r_read_to_contigs.%s" % idx)
        dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(
            dump_rawread_to_ctg)
        wf.addTask(dump_rawread_to_ctg_task)

    for las_key, las_file in all_pread_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[
            -1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        pread_to_contig_file = makePypeLocalFile(
            os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
        make_dump_pread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "pread_db": pread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file
            },
            outputs={"pread_to_contig_file": pread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/pread_to_contigs.%s" % idx)
        dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)
        wf.addTask(dump_pread_to_ctg_task)

    wf.refreshTargets()  # block
Example #24
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (dist_map_dir, fasta_dir, pa_dir, script_dir, celera_asm_dir,  sge_log_dir):
        try:
            os.makedirs(d)
        except:
            pass

    config = get_config(sys.argv[1])
    concurrent_jobs = config["concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()


    #### Task to convert bas.h5 and bax.h5 to fasta files, it will generates two fofn files for the queries and targets
    input_h5_fofn = makePypeLocalFile(os.path.abspath( config["input_fofn_fn"] ))
    query_fa_fofn = makePypeLocalFile( os.path.join( fasta_dir, "queries.fofn" ) )
    target_fa_fofn = makePypeLocalFile( os.path.join( fasta_dir, "targets.fofn" ) )
    fasta_dump_done = makePypeLocalFile(os.path.abspath( os.path.join( fasta_dir, "fasta_dump_done") ) )
    parameters = {"fasta_dir": fasta_dir,
                  "min_length": config["length_cutoff"],
                  "min_read_score": config["RQ_threshold"]}

    @PypeTask(inputs = {"input_fofn": input_h5_fofn},
              outputs = {"fasta_dump_done": fasta_dump_done, 
                         "target_fa_fofn": target_fa_fofn,
                         "query_fa_fofn":  query_fa_fofn},
              parameters = parameters,
              TaskType = PypeThreadTaskBase)
    def h5fofn_to_fasta(self):
        os.system("h5fofn_to_fasta.py %s %s --min_length 500 --min_seed_length %d --min_read_score %f" %\
Example #25
def main(argv=sys.argv):

    global fc_run_logger
    fc_run_logger = support.setup_logger(None)

    if len(sys.argv) < 2:
        print "you need to provide a configuration file to specific a couple cluster running environment"
        sys.exit(1)

    config_fn = sys.argv[1]

    config = ConfigParser.ConfigParser()
    config.read(config_fn)

    job_type = "SGE"
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    sge_track_reads = " -pe smp 12 -q bigmem"
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = " -pe smp 24 -q bigmem "
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = "/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/"
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = "input_bam.fofn"
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')

    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip',
                                               'quiver_concurrent_jobs')

    config = {
        "job_type": job_type,
        "sge_quiver": sge_quiver,
        "sge_track_reads": sge_track_reads,
        "input_bam_fofn": input_bam_fofn,
        "smrt_bin": smrt_bin
    }

    support.job_type = "SGE"  #tmp hack until we have a configuration parser

    ctg_ids = []

    PypeThreadWorkflow.setNumThreadAllowed(quiver_concurrent_jobs,
                                           quiver_concurrent_jobs)
    wf = PypeThreadWorkflow()

    parameters = {"wd": os.path.abspath("."), "config": config}
    hasm_done = makePypeLocalFile("./3-unzip/1-hasm/hasm_done")
    job_done = makePypeLocalFile(
        os.path.join(parameters["wd"], "track_reads_h_done"))
    make_track_reads_task = PypeTask(inputs={"hasm_done": hasm_done},
                                     outputs={"job_done": job_done},
                                     parameters=parameters,
                                     TaskType=PypeThreadTaskBase,
                                     URL="task://localhost/track_reads_h")
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)
    wf.refreshTargets()  # force refresh now; proper dependencies will be added later

    ref_seq_data = {}
    p_ctg_fa = FastaReader("./3-unzip/all_p_ctg.fa")
    ctg_types = {}
    for r in p_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "p"

    h_ctg_fa = FastaReader("./3-unzip/all_h_ctg.fa")
    for r in h_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "h"

    ctg_ids = sorted(ref_seq_data.keys())
    p_ctg_out = []
    h_ctg_out = []
    for ctg_id in ctg_ids:
        sequence = ref_seq_data[ctg_id]
        m_ctg_id = ctg_id.split("-")[0]
        wd = os.path.join(os.getcwd(), "./4-quiver/", m_ctg_id)
        mkdir(wd)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_ref.fa".format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(
                os.getcwd(), "./4-quiver/reads/"
                "{ctg_id}.sam".format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fasta.gz".format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fastq.gz".format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_quiver_done".format(ctg_id=ctg_id)))

        if os.path.exists(fn(read_sam)):
            if ctg_types[ctg_id] == "p":
                p_ctg_out.append((cns_fasta, cns_fastq))
            if ctg_types[ctg_id] == "h":
                h_ctg_out.append((cns_fasta, cns_fastq))
            if not os.path.exists(fn(ref_fasta)):
                with open(fn(ref_fasta), "w") as f:
                    print >> f, ">" + ctg_id
                    print >> f, sequence
            parameters = {
                "job_uid": "q-" + ctg_id,
                "wd": wd,
                "config": config,
                "ctg_id": ctg_id
            }
            make_quiver_task = PypeTask(
                inputs={
                    "ref_fasta": ref_fasta,
                    "read_sam": read_sam
                },
                outputs={
                    "cns_fasta": cns_fasta,
                    "cns_fastq": cns_fastq,
                    "job_done": job_done
                },
                parameters=parameters,
                TaskType=PypeThreadTaskBase,
                URL="task://localhost/q_{ctg_id}".format(ctg_id=ctg_id))
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)

    wf.refreshTargets()
    os.system("sleep 30")

    mkdir("./4-quiver/cns_output")
    os.system("rm ./4-quiver/cns_output/cns_p_ctg.fasta")
    os.system("rm ./4-quiver/cns_output/cns_p_ctg.fastq")
    for cns_fasta, cns_fastq in sorted(p_ctg_out):
        os.system(
            "zcat {cns_fasta} >> ./4-quiver/cns_output/cns_p_ctg.fasta".format(
                cns_fasta=fn(cns_fasta)))
        os.system(
            "zcat {cns_fastq} >> ./4-quiver/cns_output/cns_p_ctg.fastq".format(
                cns_fastq=fn(cns_fastq)))

    os.system("rm ./4-quiver/cns_output/cns_h_ctg.fasta")
    os.system("rm ./4-quiver/cns_output/cns_h_ctg.fastq")
    for cns_fasta, cns_fastq in sorted(h_ctg_out):
        os.system(
            "zcat {cns_fasta} >> ./4-quiver/cns_output/cns_h_ctg.fasta".format(
                cns_fasta=fn(cns_fasta)))
        os.system(
            "zcat {cns_fastq} >> ./4-quiver/cns_output/cns_h_ctg.fastq".format(
                cns_fastq=fn(cns_fastq)))
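Side note (not part of the original example): the zcat/os.system concatenation above shells out once per contig. A pure-Python sketch of the same append step, using only the standard gzip and shutil modules (the helper name append_gz is hypothetical), could be:

import gzip
import shutil

def append_gz(src_gz, dst_path):
    # Decompress one cns-*.gz file and append its contents to the combined output.
    with gzip.open(src_gz, "rb") as src, open(dst_path, "ab") as dst:
        shutil.copyfileobj(src, dst)

# e.g. append_gz(fn(cns_fasta), "./4-quiver/cns_output/cns_p_ctg.fasta")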
Beispiel #26
0
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (dist_map_dir, fasta_dir, pa_dir, script_dir, celera_asm_dir,
              sge_log_dir):
        try:
            os.makedirs(d)
        except OSError:
            pass

    config = get_config(sys.argv[1])
    concurrent_jobs = config["concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    #### Task to convert bas.h5 and bax.h5 to fasta files; it generates two fofn files, one for the queries and one for the targets
    input_h5_fofn = makePypeLocalFile(os.path.abspath(config["input_fofn_fn"]))
    query_fa_fofn = makePypeLocalFile(os.path.join(fasta_dir, "queries.fofn"))
    target_fa_fofn = makePypeLocalFile(os.path.join(fasta_dir, "targets.fofn"))
    fasta_dump_done = makePypeLocalFile(
        os.path.abspath(os.path.join(fasta_dir, "fasta_dump_done")))
    parameters = {
        "fasta_dir": fasta_dir,
        "min_length": config["length_cutoff"],
        "min_read_score": config["RQ_threshold"]
    }

    @PypeTask(inputs={"input_fofn": input_h5_fofn},
              outputs={
                  "fasta_dump_done": fasta_dump_done,
                  "target_fa_fofn": target_fa_fofn,
                  "query_fa_fofn": query_fa_fofn
            pass
        try:
            os.makedirs("%s/las_files" % (wd) )
        except OSError:
            pass

        with open("%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id), "w") as merge_script:
            #print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix)
            for l in s_data:
                print >> merge_script, l
            print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id) 
            print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id) 
            
        merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) )
        job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)  ))
        parameters =  {"merge_script": merge_script_file, 
                       "cwd": os.path.join(wd, "m_%05d" % p_id),
                       "job_id": p_id,
                       "config": config}

        make_merge_task = PypeTask( inputs = {"input_dep": input_dep},
                                       outputs = {"job_done": job_done},
                                       parameters = parameters,
                                       TaskType = PypeThreadTaskBase,
                                       URL = "task://localhost/m_%05d_%s" % (p_id, db_prefix) )
        merge_task = make_merge_task ( run_merge_task )

        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)
Beispiel #28
0
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = support.get_dict_from_old_falcon_cfg(
        support.parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config[
        'stop_all_jobs_on_failure']  # only matters for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    Workflow = PypeProcWatcherWorkflow
    PypeProcWatcherWorkflow.setNumThreadAllowed(concurrent_jobs,
                                                concurrent_jobs)
    wf = PypeProcWatcherWorkflow(job_type=config['job_type'])

    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=MyFakePypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {"work_dir": rawread_dir, "config": config}

        raw_reads_db = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={
                                           "rdb_build_done": rdb_build_done,
                                           "raw_reads_db": raw_reads_db,
                                           "run_jobs": run_jobs,
                                       },
                                       parameters=parameters,
                                       TaskType=MyFakePypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config)

        wf.addTasks(daligner_tasks)
        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        parameters = {
            "nblock": raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(inputs=daligner_out,
                                        outputs={"da_done": r_da_done},
                                        parameters=parameters,
                                        TaskType=MyFakePypeThreadTaskBase,
                                        URL="task://localhost/rda_check")
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "overlapping":
            sys.exit(0)
        consensus_tasks, consensus_out = create_consensus_tasks(
            rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={
                      "cns_done": r_cns_done,
                      "pread_fofn": pread_fofn
                  },
                  TaskType=MyFakePypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeProcWatcherWorkflow.setNumThreadAllowed(concurrent_jobs,
                                                    concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=MyFakePypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(
        pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={
                                       "pdb_build_done": pdb_build_done,
                                       "preads_db": preads_db,
                                       "run_jobs": run_jobs
                                   },
                                   parameters=parameters,
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs),
                                                         pread_dir,
                                                         "preads",
                                                         pdb_build_done,
                                                         config,
                                                         pread_aln=True)
    wf.addTasks(daligner_tasks)

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))
    parameters = {
        "nblock": preads_nblock,
    }
    make_daligner_gather = PypeTask(inputs=daligner_out,
                                    outputs={"da_done": p_da_done},
                                    parameters=parameters,
                                    TaskType=MyFakePypeThreadTaskBase,
                                    URL="task://localhost/pda_check")
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir,
                                                   "preads", p_da_done, config)
    wf.addTasks(merge_tasks)

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=MyFakePypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)

    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeProcWatcherWorkflow.setNumThreadAllowed(concurrent_jobs,
                                                concurrent_jobs)

    wf.refreshTargets(exitOnFailure=exitOnFailure)

    db2falcon_done = makePypeLocalFile(
        os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(inputs={
        "p_merge_done": p_merge_done,
    },
                                  outputs={"db2falcon_done": db2falcon_done},
                                  parameters={
                                      "wd": pread_dir,
                                      "config": config,
                                  },
                                  TaskType=MyFakePypeThreadTaskBase,
                                  URL="task://localhost/db2falcon")
    wf.addTask(make_run_db2falcon(task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(
        inputs={
            "db2falcon_done": db2falcon_done,
            "db_file": preads_db
        },
        outputs={"falcon_asm_done": falcon_asm_done},
        parameters={
            "wd": falcon_asm_dir,
            "config": config,
            "pread_dir": pread_dir
        },
        TaskType=MyFakePypeThreadTaskBase,
        URL="task://localhost/falcon")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()
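For completeness, main1 above is normally invoked through a thin CLI wrapper; a minimal sketch (an assumption mirroring the argv convention of the main() functions elsewhere in this listing, not code from the example above) would be:

import sys

def main(argv=sys.argv):
    # Pass the program name and the config file path through to main1.
    if len(argv) < 2:
        sys.stderr.write("usage: %s <falcon_config.cfg>\n" % argv[0])
        sys.exit(2)
    main1(argv[0], argv[1])

if __name__ == "__main__":
    main()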
Beispiel #29
0
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir  = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure=config['stop_all_jobs_on_failure'] # only matters for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    input_fofn_plf = makePypeLocalFile(config["input_fofn"])
    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf},
                                  outputs = {"o_fofn": rawread_fofn_plf},
                                  parameters = {},
                                  TaskType = PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) 
        run_jobs = makePypeLocalFile( os.path.join( rawread_dir, "run_jobs.sh") ) 
        parameters = {"work_dir": rawread_dir,
                      "config": config}

        raw_reads_db = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
        make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf},
                                      outputs = {"rdb_build_done": rdb_build_done,
                                                 "raw_reads.db": raw_reads_db,
                                                 "run_jobs": run_jobs}, 
                                      parameters = parameters,
                                      TaskType = PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done]) 

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config) 

        wf.addTasks(daligner_tasks)
        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )

        parameters =  {
                "nblock": raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
                   inputs = daligner_out, 
                   outputs =  {"da_done":r_da_done},
                   parameters = parameters,
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/rda_check" )
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)
        
        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks( merge_tasks )
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "overlapping":
            sys.exit(0)
        consensus_tasks, consensus_out = create_consensus_tasks(rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks( consensus_tasks )

        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

        @PypeTask( inputs = consensus_out, 
                   outputs =  {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/cns_check" )
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn),  "w") as f:
                fn_list =  glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf},
                                     outputs = {"o_fofn": pread_fofn},
                                     parameters = {},
                                     TaskType = PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") ) 
    parameters = {"work_dir": pread_dir,
                  "config": config}

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db')) # Also .preads.*, of course.
    make_build_pdb_task  = PypeTask(inputs = {"pread_fofn": pread_fofn },
                                    outputs = {"pdb_build_done": pdb_build_done,
                                               "preads_db": preads_db,
                                               "run_jobs": run_jobs},
                                    parameters = parameters,
                                    TaskType = PypeThreadTaskBase,
                                    URL = "task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done]) 


    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", pdb_build_done, config, pread_aln=True)
    wf.addTasks(daligner_tasks)

    p_da_done = makePypeLocalFile(os.path.join( pread_dir, "da_done"))
    parameters =  {
            "nblock": preads_nblock,
    }
    make_daligner_gather = PypeTask(
                inputs = daligner_out, 
                outputs =  {"da_done":p_da_done},
                parameters = parameters,
                TaskType = PypeThreadTaskBase,
                URL = "task://localhost/pda_check" )
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks( merge_tasks )

    p_merge_done = makePypeLocalFile(os.path.join( pread_dir, "p_merge_done"))

    @PypeTask( inputs = merge_out, 
               outputs =  {"p_merge_done": p_merge_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pmerge_check" )
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)

    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)

    wf.refreshTargets(exitOnFailure=exitOnFailure)

    
    db2falcon_done = makePypeLocalFile( os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(
               inputs = {"p_merge_done": p_merge_done,},
               outputs =  {"db2falcon_done": db2falcon_done},
               parameters = {"wd": pread_dir,
                             "config": config,
                            },
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/db2falcon" )
    wf.addTask(make_run_db2falcon(task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )
    make_run_falcon_asm = PypeTask(
               inputs = {"db2falcon_done": db2falcon_done, "db_file": preads_db},
               outputs =  {"falcon_asm_done": falcon_asm_done},
               parameters = {"wd": falcon_asm_dir,
                             "config": config,
                             "pread_dir": pread_dir},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/falcon" )
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()
Beispiel #30
0
def create_merge_tasks(wd, db_prefix, input_dep, config):
    merge_tasks = []
    consensus_tasks = []
    merge_out = {}
    consensus_out = {}
    mjob_data = {}

    with open(os.path.join(wd, "run_jobs.sh")) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ( "LAsort", "LAmerge", "mv" ):
                continue
            if l[0] == "LAsort":
                p_id = int( l[2].split(".")[1] )
                mjob_data.setdefault( p_id, [] )
                mjob_data[p_id].append(  " ".join(l) )
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1][0] == "L":
                    p_id = int(  l[2].split(".")[2] )
                    mjob_data.setdefault( p_id, [] )
                    mjob_data[p_id].append(  " ".join(l) )
                else:
                    p_id = int( l[2].split(".")[1] )
                    mjob_data.setdefault( p_id, [] )
                    mjob_data[p_id].append(  " ".join(l) )
            if l[0] == "mv":
                l2 = l[1].split(".")
                if l2[1][0] == "L":
                    p_id = int(  l[1].split(".")[2] )
                    mjob_data.setdefault( p_id, [] )
                    mjob_data[p_id].append(  " ".join(l) )
                else:
                    p_id = int( l[1].split(".")[1] )
                    mjob_data.setdefault( p_id, [] )
                    mjob_data[p_id].append(  " ".join(l) )

    for p_id in mjob_data:
        s_data = mjob_data[p_id]

        try:
            os.makedirs("%s/m_%05d" % (wd, p_id))
        except OSError:
            pass
        try:
            os.makedirs("%s/preads" % (wd) )
        except OSError:
            pass
        try:
            os.makedirs("%s/las_files" % (wd) )
        except OSError:
            pass

        with open("%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id), "w") as merge_script:
            #print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix)
            for l in s_data:
                print >> merge_script, l
            print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id) 
            print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id) 
            
        merge_script_file = os.path.abspath( "%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id) )
        job_done = makePypeLocalFile(os.path.abspath( "%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)  ))
        parameters =  {"merge_script": merge_script_file, 
                       "cwd": os.path.join(wd, "m_%05d" % p_id),
                       "job_id": p_id,
                       "config": config}

        make_merge_task = PypeTask( inputs = {"input_dep": input_dep},
                                       outputs = {"job_done": job_done},
                                       parameters = parameters,
                                       TaskType = PypeThreadTaskBase,
                                       URL = "task://localhost/m_%05d_%s" % (p_id, db_prefix) )
        merge_task = make_merge_task ( run_merge_task )

        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)


        out_file = makePypeLocalFile(os.path.abspath( "%s/preads/out.%05d.fa" % (wd, p_id)  ))
        out_done = makePypeLocalFile(os.path.abspath( "%s/preads/c_%05d_done" % (wd, p_id)  ))
        parameters =  {"cwd": os.path.join(wd, "preads" ),
                       "job_id": p_id, 
                       "prefix": db_prefix,
                       "config": config}
        make_c_task = PypeTask( inputs = {"job_done": job_done},
                                outputs = {"out_file": out_file, "out_done": out_done },
                                parameters = parameters,
                                TaskType = PypeThreadTaskBase,
                                URL = "task://localhost/ct_%05d" % p_id )
        
        c_task = make_c_task( run_consensus_task )
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done 

    return merge_tasks, merge_out, consensus_tasks, consensus_out
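The HGAP driver in Beispiel #32 below consumes this four-tuple; a condensed usage sketch (variable names as in that example) is:

merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
    rawread_dir, "raw_reads", r_da_done, config)
wf.addTasks(merge_tasks)       # LAsort/LAmerge jobs, gated on the daligner "da_done" file
wf.addTasks(consensus_tasks)   # per-block consensus jobs, each gated on its merge job's done-file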
Beispiel #31
0
def unzip_all(config):
    unzip_concurrent_jobs = config["unzip_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(unzip_concurrent_jobs,
                                           unzip_concurrent_jobs)
    wf = PypeThreadWorkflow()

    ctg_list_file = makePypeLocalFile("./3-unzip/reads/ctg_list")
    falcon_asm_done = makePypeLocalFile("./2-asm-falcon/falcon_asm_done")
    parameters = {"wd": os.path.abspath("."), "config": config}

    job_done = makePypeLocalFile(
        os.path.join(parameters["wd"], "track_reads_done"))
    make_track_reads_task = PypeTask(
        inputs={"falcon_asm_done": falcon_asm_done},
        outputs={
            "job_done": job_done,
            "ctg_list_file": ctg_list_file
        },
        parameters=parameters,
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/track_reads")
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)
    wf.refreshTargets()  # force refresh now; proper dependencies will be added later

    ctg_ids = []
    with open("./3-unzip/reads/ctg_list") as f:
        for row in f:
            row = row.strip()
            ctg_ids.append(row)

    aln1_outs = {}

    all_ctg_out = {}

    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            "./3-unzip/reads/{ctg_id}_ref.fa".format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            "./3-unzip/reads/{ctg_id}_reads.fa".format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(), "./3-unzip/0-phasing/{ctg_id}/".format(ctg_id=ctg_id))
        mkdir(wd)
        ctg_aln_out = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_sorted.bam".format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, "aln_{ctg_id}_done".format(ctg_id=ctg_id)))

        parameters = {
            "job_uid": "aln-" + ctg_id,
            "wd": wd,
            "config": config,
            "ctg_id": ctg_id
        }
        make_blasr_task = PypeTask(
            inputs={
                "ref_fasta": ref_fasta,
                "read_fasta": read_fasta
            },
            outputs={
                "ctg_aln_out": ctg_aln_out,
                "job_done": job_done
            },
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/aln_{ctg_id}".format(ctg_id=ctg_id))
        blasr_task = make_blasr_task(task_run_blasr)
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(blasr_task)

        job_done = makePypeLocalFile(
            os.path.join(wd, "p_{ctg_id}_done".format(ctg_id=ctg_id)))
        rid_to_phase_out = makePypeLocalFile(
            os.path.join(wd, "rid_to_phase.{ctg_id}".format(ctg_id=ctg_id)))
        all_ctg_out["r2p.{ctg_id}".format(ctg_id=ctg_id)] = rid_to_phase_out

        parameters = {
            "job_uid": "ha-" + ctg_id,
            "wd": wd,
            "config": config,
            "ctg_id": ctg_id
        }
        make_phasing_task = PypeTask(
            inputs={
                "ref_fasta": ref_fasta,
                "aln_bam": ctg_aln_out
            },
            outputs={"job_done": job_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/p_{ctg_id}".format(ctg_id=ctg_id))
        phasing_task = make_phasing_task(task_phasing)
        wf.addTask(phasing_task)

    wf.refreshTargets()

    hasm_wd = os.path.abspath("./3-unzip/1-hasm/")
    mkdir(hasm_wd)
    rid_to_phase_all = makePypeLocalFile(
        os.path.join(hasm_wd, "rid_to_phase.all"))

    @PypeTask(inputs=all_ctg_out,
              outputs={"rid_to_phase_all": rid_to_phase_all},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/rid_to_phase_all")
    def get_rid_to_phase_all(self):
        rid_to_phase_all_fn = fn(self.rid_to_phase_all)
        inputs_fn = [fn(f) for f in self.inputs.values()]
        inputs_fn.sort()
        output = []
        for fname in inputs_fn:
            with open(fname) as in_f:
                output.append(in_f.read())

        with open(rid_to_phase_all_fn, "w") as out:
            out.write("".join(output))

    wf.addTask(get_rid_to_phase_all)

    parameters["wd"] = hasm_wd
    job_done = makePypeLocalFile(os.path.join(hasm_wd, "hasm_done"))
    make_hasm_task = PypeTask(inputs={"rid_to_phase_all": rid_to_phase_all},
                              outputs={"job_done": job_done},
                              parameters=parameters,
                              TaskType=PypeThreadTaskBase,
                              URL="task://localhost/hasm")
    hasm_task = make_hasm_task(task_hasm)

    wf.addTask(hasm_task)

    wf.refreshTargets()
Beispiel #32
0
def main(*argv):
    setup_logger()
    if len(argv) < 2:
        print "you need to specify a configuration file"
        print "example: HGAP.py HGAP_run.cfg"
        sys.exit(1)
    
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir  = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        try:
            os.makedirs(d)
        except OSError:
            pass

    config = get_config(argv[1])
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        input_h5_fofn = makePypeLocalFile( os.path.abspath( config["input_fofn_fn"] ) )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) 
        parameters = {"work_dir": rawread_dir,
                      "config": config}

        make_build_rdb_task = PypeTask(inputs = {"input_fofn": input_h5_fofn},
                                       outputs = {"rdb_build_done": rdb_build_done},
                                       parameters = parameters,
                                       TaskType = PypeThreadTaskBase)

        build_rdb_task = make_build_rdb_task(build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done]) 
        

        db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks( rawread_dir, "raw_reads", db_file, rdb_build_done, config) 

        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs

        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )

        @PypeTask( inputs = daligner_out, 
                   outputs =  {"da_done":r_da_done},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/rda_check" )
        def check_r_da_task(self):
            os.system("touch %s" % fn(self.da_done))
        
        wf.addTask(check_r_da_task)
        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
        
        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( rawread_dir, "raw_reads", r_da_done, config )
        wf.addTasks( merge_tasks )
        if config["target"] == "overlapping":
            wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs, need to call to run jobs here or the # of concurrency is changed
            exit(0)
        wf.addTasks( consensus_tasks )

        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

        @PypeTask( inputs = consensus_out, 
                   outputs =  {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/cns_check" )
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn),  "w") as f:
                fn_list =  glob.glob("%s/preads/out*.fa" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            os.system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs

    if config["target"] == "pre-assembly":
        exit(0)
    
    if config["input_type"] == "preads":
        if not os.path.exists( "%s/input_preads.fofn" % pread_dir):
            os.system( "cp %s %s/input_preads.fofn" % (os.path.abspath( config["input_fofn_fn"] ), pread_dir) )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

    rdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "rdb_build_done") ) 
    @PypeTask( inputs = { "pread_fofn": pread_fofn },
               outputs = { "rdb_build_done": rdb_build_done },
               parameters = {"config": config, "pread_dir": pread_dir},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/build_p_rdb")
    def build_p_rdb_task(self):
        config = self.parameters["config"]
        pread_dir = self.parameters["pread_dir"]
        fa_serial = 0
        for fa_fn in open(fn(self.pread_fofn)).readlines():
            fa_fn = fa_fn.strip()
            c = 0
            fa_serial += 1
            with open("%s/preads_norm_%05d.fasta" % (pread_dir, fa_serial), "w") as p_norm:
                f = FastaReader(fa_fn)
                for r in f:
                    if len(r.sequence) < config["length_cutoff_pr"]:
                        continue
                    name = r.name
                    name = name.replace("_","")
                    ignore_read = False
                    for  cc in r.sequence:
                        if cc not in ["A","C","G","T"]:
                            ignore_read = True
                            break
                    if ignore_read:
                        continue
                    print >> p_norm, ">prolog_%05d/%d/%d_%d" % (fa_serial, c, 0, len(r.sequence) )
                    for i in range(0, len(r.sequence), 80):
                        print >> p_norm, r.sequence[i:i + 80]
                    c += 1
            os.system("cd %s; fasta2DB preads preads_norm_%05d.fasta" % (pread_dir, fa_serial) )

        os.system("cd %s; DBsplit %s preads" % (pread_dir, config["ovlp_DBsplit_option"]))
        os.system("cd %s; HPCdaligner %s preads > run_jobs.sh" % (pread_dir, config["ovlp_HPCdaligner_option"]))
        os.system("cd %s; touch rdb_build_done" % pread_dir)

    wf.addTask(build_p_rdb_task)
    wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs

    db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" ))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks( pread_dir, "preads", db_file, rdb_build_done, config, pread_aln= True) 
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs

    p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") )

    @PypeTask( inputs = daligner_out, 
               outputs =  {"da_done":p_da_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pda_check" )
    def check_p_da_task(self):
        os.system("touch %s" % fn(self.da_done))
    
    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( pread_dir, "preads", p_da_done, config )
    wf.addTasks( merge_tasks )
    #wf.refreshTargets(updateFreq = 30) #all            

    p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") )

    @PypeTask( inputs = merge_out, 
               outputs =  {"p_merge_done":p_merge_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pmerge_check" )
    def check_p_merge_check_task(self):
        os.system("touch %s" % fn(self.p_merge_done))
    
    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq = wait_time) #all            

    
    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )
    @PypeTask( inputs = {"p_merge_done": p_merge_done}, 
               outputs =  {"falcon_asm_done":falcon_asm_done},
               parameters = {"wd": falcon_asm_dir,
                             "config": config,
                             "pread_dir": pread_dir},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/falcon" )
    def run_falcon_asm_task(self):
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join( wd )
        script_fn =  os.path.join( script_dir ,"run_falcon_asm.sh" )
        
        script = []
        script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
        script.append( "cd %s" % pread_dir )
        script.append( "DB2Falcon preads")
        script.append( "cd %s" % wd )
        script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        script.append( """fc_ovlp_filter.py --fofn las.fofn %s \
                                 --n_core 24 --min_len %d > preads.ovl""" % (overlap_filtering_setting, length_cutoff_pr) )

        script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append( """fc_ovlp_to_graph.py preads.ovl > fc.log""" )
        script.append( """fc_graph_to_contig.py""" )
        script.append( """touch %s\n""" % fn(self.falcon_asm_done))

        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))

        job_name = self.URL.split("/")[-1]
        job_name += "-"+str(uuid.uuid1())[:8]
        job_data = {"job_name": job_name,
                    "cwd": wd,
                    "sge_option": config["sge_option_fc"],
                    "script_fn": script_fn }
        run_script(job_data, job_type = "SGE")
        wait_for_file( fn(self.falcon_asm_done), task=self, job_name=job_name )
    
    wf.addTask( run_falcon_asm_task )
    wf.refreshTargets(updateFreq = wait_time) #all            
Beispiel #33
0
def run(wf, config,
        input_fofn_plf,
        setNumThreadAllowed,
        ):
    """
    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir  = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure=config['stop_all_jobs_on_failure'] # only matters for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    setNumThreadAllowed(concurrent_jobs, concurrent_jobs)

    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf},
                                  outputs = {"o_fofn": rawread_fofn_plf},
                                  parameters = {},
                                  TaskType = MyFakePypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") )
        run_jobs = makePypeLocalFile( os.path.join( rawread_dir, "run_jobs.sh") )
        parameters = {"work_dir": rawread_dir,
                      "sge_option": config["sge_option_da"],
                      "config": config}

        length_cutoff_plf = makePypeLocalFile(os.path.join(rawread_dir, "length_cutoff"))
        raw_reads_db_plf = makePypeLocalFile(os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf},
                                      outputs = {"rdb_build_done": rdb_build_done,
                                                 "raw_reads_db": raw_reads_db_plf,
                                                 "length_cutoff": length_cutoff_plf,
                                                 "run_jobs": run_jobs,
                                      },
                                      parameters = parameters,
                                      TaskType = MyFakePypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config)

        wf.addTasks(daligner_tasks)
        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )

        parameters =  {
                "nblock": raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
                   inputs = daligner_out,
                   outputs =  {"da_done":r_da_done},
                   parameters = parameters,
                   TaskType = MyFakePypeThreadTaskBase,
                   URL = "task://localhost/rda_check" )
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks( merge_tasks )
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "overlapping":
            sys.exit(0)
        consensus_tasks, consensus_out = create_consensus_tasks(rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks( consensus_tasks )

        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

        @PypeTask( inputs = consensus_out,
                   outputs =  {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
                   TaskType = MyFakePypeThreadTaskBase,
                   URL = "task://localhost/cns_check" )
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn),  "w") as f:
                fn_list =  glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            system("touch %s" % fn(self.cns_done))
        wf.addTask(check_r_cns_task)

        pre_assembly_report_plf = makePypeLocalFile(os.path.join(rawread_dir, "pre_assembly_stats.json")) # though technically it needs pread_fofn
        make_task = PypeTask(
                inputs = {"length_cutoff_fn": length_cutoff_plf,
                          "raw_reads_db": raw_reads_db_plf,
                          "preads_fofn": pread_fofn, },
                outputs = {"pre_assembly_report": pre_assembly_report_plf, },
                parameters = config,
                TaskType = MyFakePypeThreadTaskBase,
                URL = "task://localhost/report_pre_assembly")
        task = make_task(task_report_pre_assembly)
        wf.addTask(task)

        concurrent_jobs = config["cns_concurrent_jobs"]
        setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)


    if config["target"] == "pre-assembly":
        log.info("Quitting after stage-0 for 'pre-assembly' target.")
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf},
                                     outputs = {"o_fofn": pread_fofn},
                                     parameters = {},
                                     TaskType = MyFakePypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") )
    parameters = {"work_dir": pread_dir,
                  "sge_option": config["sge_option_pda"],
                  "config": config}

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db')) # Also .preads.*, of course.
    make_build_pdb_task  = PypeTask(inputs = {"pread_fofn": pread_fofn },
                                    outputs = {"pdb_build_done": pdb_build_done,
                                               "preads_db": preads_db,
                                               "run_jobs": run_jobs,
                                    },
                                    parameters = parameters,
                                    TaskType = MyFakePypeThreadTaskBase,
                                    URL = "task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])


    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    config["sge_option_da"] = config["sge_option_pda"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", pdb_build_done, config, pread_aln=True)
    wf.addTasks(daligner_tasks)

    p_da_done = makePypeLocalFile(os.path.join( pread_dir, "da_done"))
    parameters =  {
            "nblock": preads_nblock,
    }
    make_daligner_gather = PypeTask(
                inputs = daligner_out,
                outputs =  {"da_done":p_da_done},
                parameters = parameters,
                TaskType = MyFakePypeThreadTaskBase,
                URL = "task://localhost/pda_check" )
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    config["sge_option_la"] = config["sge_option_pla"]
    merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks( merge_tasks )

    p_merge_done = makePypeLocalFile(os.path.join( pread_dir, "p_merge_done"))

    @PypeTask( inputs = merge_out,
               outputs =  {"p_merge_done": p_merge_done},
               TaskType = MyFakePypeThreadTaskBase,
               URL = "task://localhost/pmerge_check" )
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))
    wf.addTask(check_p_merge_check_task)

    concurrent_jobs = config["ovlp_concurrent_jobs"]
    setNumThreadAllowed(concurrent_jobs, concurrent_jobs)

    wf.refreshTargets(exitOnFailure=exitOnFailure)


    db2falcon_done = makePypeLocalFile( os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(
               inputs = {"p_merge_done": p_merge_done,},
               outputs =  {"db2falcon_done": db2falcon_done},
               parameters = {"wd": pread_dir,
                             "config": config,
                             "sge_option": config["sge_option_fc"],
                            },
               TaskType = MyFakePypeThreadTaskBase,
               URL = "task://localhost/db2falcon" )
    wf.addTask(make_run_db2falcon(task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )
    make_run_falcon_asm = PypeTask(
               inputs = {"db2falcon_done": db2falcon_done, "db_file": preads_db},
               outputs =  {"falcon_asm_done": falcon_asm_done},
               parameters = {"wd": falcon_asm_dir,
                             "config": config,
                             "pread_dir": pread_dir,
                             "sge_option": config["sge_option_fc"],
               },
               TaskType = MyFakePypeThreadTaskBase,
               URL = "task://localhost/falcon_asm" )
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
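A minimal driver sketch for run() (an assumption, not part of the example: the workflow class and config loading mirror the main1 variants earlier in this listing):

config = support.get_dict_from_old_falcon_cfg(support.parse_config(input_config_fn))
wf = PypeProcWatcherWorkflow(job_type=config['job_type'])
input_fofn_plf = makePypeLocalFile(config["input_fofn"])
falcon_asm_done = run(wf, config,
                      input_fofn_plf,
                      setNumThreadAllowed=PypeProcWatcherWorkflow.setNumThreadAllowed)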
Beispiel #34
0
    h_ctg_fa = FastaReader("./3-unzip/all_h_ctg.fa")
    for r in h_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "h"

    ctg_ids = sorted(ref_seq_data.keys())
    p_ctg_out = []
    h_ctg_out = []
    for ctg_id in ctg_ids:
        sequence = ref_seq_data[ctg_id]
        m_ctg_id = ctg_id.split("-")[0]
        wd = os.path.join(os.getcwd(), "./4-quiver/", m_ctg_id)
        mkdir(wd)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_ref.fa".format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(
                os.getcwd(), "./4-quiver/reads/"
                "{ctg_id}.sam".format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fasta.gz".format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, "cns-{ctg_id}.fastq.gz".format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, "{ctg_id}_quiver_done".format(ctg_id=ctg_id)))

        if os.path.exists(fn(read_sam)):
            if ctg_types[ctg_id] == "p":
                p_ctg_out.append((cns_fasta, cns_fastq))
            if ctg_types[ctg_id] == "h":
Beispiel #35
0
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info("fc_run started with configuration %s", input_config_fn)
    config = support.get_dict_from_old_falcon_cfg(
        support.parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    input_fofn_plf = makePypeLocalFile(
        os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {"work_dir": rawread_dir, "config": config}

        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={
                                           "rdb_build_done": rdb_build_done,
                                           "run_jobs": run_jobs
                                       },
                                       parameters=parameters,
                                       TaskType=PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        db_file = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", db_file, rdb_build_done,
            config)

        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs

        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        @PypeTask(inputs=daligner_out,
                  outputs={"da_done": r_da_done},
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/rda_check")
        def check_r_da_task(self):
            system("touch %s" % fn(self.da_done))

        wf.addTask(check_r_da_task)
        wf.refreshTargets(
            updateFreq=wait_time
        )  # a larger updateFreq suits runs with more jobs; refreshTargets must run here so the jobs finish before the concurrency limit is changed below

        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs,
                                               concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        if config["target"] == "overlapping":
            wf.refreshTargets(
                updateFreq=wait_time
            )  # a larger updateFreq suits runs with more jobs; refreshTargets must be called here to actually run the queued merge jobs before exiting
            sys.exit(0)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={
                      "cns_done": r_cns_done,
                      "pread_fofn": pread_fofn
                  },
                  TaskType=PypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)
        wf.refreshTargets(
            updateFreq=wait_time)  # larger number better for more jobs

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, "pdb_build_done"))
    parameters = {"work_dir": pread_dir, "config": config}

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={
                                       "pdb_build_done": pdb_build_done,
                                       "run_jobs": run_jobs
                                   },
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    db_file = makePypeLocalFile(os.path.join(pread_dir, "%s.db" % "preads"))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs),
                                                         pread_dir,
                                                         "preads",
                                                         db_file,
                                                         pdb_build_done,
                                                         config,
                                                         pread_aln=True)
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))

    @PypeTask(inputs=daligner_out,
              outputs={"da_done": p_da_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pda_check")
    def check_p_da_task(self):
        system("touch %s" % fn(self.da_done))

    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(
        fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks(merge_tasks)
    #wf.refreshTargets(updateFreq = 30) #all

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=PypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq=wait_time)  #all

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(
        inputs={
            "p_merge_done": p_merge_done,
            "db_file": db_file
        },
        outputs={"falcon_asm_done": falcon_asm_done},
        parameters={
            "wd": falcon_asm_dir,
            "config": config,
            "pread_dir": pread_dir
        },
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/falcon")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets(updateFreq=wait_time)  #all
Beispiel #36
0
    job_name += "-"+str(uuid.uuid1())[:8]
    job_data = {"job_name": job_name,
                "cwd": cwd,
                "sge_option": " -pe smp 6 -q huasm ",
                "script_fn": script_fn }
    run_script(job_data, job_type = "SGE")
    wait_for_file( fn( self.job_done ), task=self, job_name=job_name )

if __name__ == "__main__":
    prefix = sys.argv[1]
    concurrent_jobs = 64
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    job_id = 0
    db_file = makePypeLocalFile(os.path.abspath( "./%s.db" % prefix ))
    with open("run_jobs.sh") as f :
        for l in f :
            l = l.strip().split()
            if l[0] == "daligner":
                try:
                    os.makedirs("./job_%05d" % job_id)
                except OSError:
                    pass
                os.system("cd ./job_%05d;ln -s ../.%s.bps .; ln -s ../.%s.idx .; ln -s ../%s.db ." % (job_id, prefix, prefix, prefix) )
                job_done = makePypeLocalFile(os.path.abspath( "./job_%05d/job_%05d_done" % (job_id,job_id)  ))
                parameters =  {"daligner_cmd": " ".join(l),
                               "cwd": os.path.join(os.getcwd(), "job_%05d" % job_id),
                               "job_id": job_id}
                make_daligner_task = PypeTask( inputs = {"db_file": db_file},
                                               outputs = {"job_done": job_done},
Beispiel #37
0
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
    merge_tasks = []
    consensus_tasks = []
    merge_out = {}
    consensus_out = {}
    mjob_data = {}

    with open(run_jobs_fn) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ("LAsort", "LAmerge", "mv"):
                continue
            if l[0] == "LAsort":
                # We now run this part w/ daligner, but we still need
                # a small script for some book-keeping.
                p_id = int(l[2].split(".")[1])
                mjob_data.setdefault(p_id, [])
                # mjob_data[p_id].append(  " ".join(l) ) # Already done w/ daligner!
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[2].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[2].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
            if l[0] == "mv":
                l2 = l[1].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[1].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[1].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))

    for p_id in mjob_data:
        s_data = mjob_data[p_id]

        support.make_dirs("%s/m_%05d" % (wd, p_id))
        support.make_dirs("%s/preads" % (wd))
        support.make_dirs("%s/las_files" % (wd))

        merge_script_file = os.path.abspath("%s/m_%05d/m_%05d.sh" % (wd, p_id, p_id))
        with open(merge_script_file, "w") as merge_script:
            # print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix)
            for l in s_data:
                print >> merge_script, l
            print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (p_id, db_prefix, p_id)
            print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (p_id, db_prefix, p_id)

        job_done = makePypeLocalFile(os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)))
        parameters = {
            "merge_script": merge_script_file,
            "cwd": os.path.join(wd, "m_%05d" % p_id),
            "job_id": p_id,
            "config": config,
        }

        make_merge_task = PypeTask(
            inputs={"input_dep": input_dep},
            outputs={"job_done": job_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/m_%05d_%s" % (p_id, db_prefix),
        )
        merge_task = make_merge_task(task_run_las_merge)

        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)

        out_file = makePypeLocalFile(os.path.abspath("%s/preads/out.%05d.fasta" % (wd, p_id)))
        out_done = makePypeLocalFile(os.path.abspath("%s/preads/c_%05d_done" % (wd, p_id)))
        parameters = {"cwd": os.path.join(wd, "preads"), "job_id": p_id, "prefix": db_prefix, "config": config}
        make_c_task = PypeTask(
            inputs={"job_done": job_done},
            outputs={"out_file": out_file, "out_done": out_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/ct_%05d" % p_id,
        )

        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done

    return merge_tasks, merge_out, consensus_tasks, consensus_out
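
The first half of create_merge_tasks() is essentially a parser for DALIGNER's run_jobs.sh: LAmerge and mv lines are grouped by merge-block id, while LAsort lines only register the id (daligner has already done the sorting, as the comment notes). A standalone sketch of just that grouping step, written for illustration rather than taken from the source:

def group_merge_commands(run_jobs_lines):
    """Group LAmerge/mv shell lines from a DALIGNER run_jobs.sh by block id,
    mirroring the parsing loop in create_merge_tasks() above."""
    mjob_data = {}
    for line in run_jobs_lines:
        l = line.strip().split()
        if not l or l[0] not in ("LAsort", "LAmerge", "mv"):
            continue
        if l[0] == "LAsort":
            # already handled by daligner; just make sure the block id is registered
            mjob_data.setdefault(int(l[2].split(".")[1]), [])
            continue
        name = l[2] if l[0] == "LAmerge" else l[1]
        parts = name.split(".")
        # "<db>.L2.<id>"-style names carry the block id in field 2, "<db>.<id>" in field 1
        p_id = int(parts[2]) if parts[1].startswith("L") else int(parts[1])
        mjob_data.setdefault(p_id, []).append(" ".join(l))
    return mjob_data
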
Beispiel #38
0
def run_HGAP(config):

    global prepare_data
    global prepare_seed_reads
    global dist_map
    global generate_preassemble_reads
    global run_CA
    global quiver_reseq
    
    directory_for_dist_map = "dist_map"

    config["install_prefix"] = sys.prefix
    config["directory_for_dist_map"] = directory_for_dist_map

    input_fofn_fn = config["input_fofn_fn"]
    #tmpdir = config["tmpdir"]

    # prepare the distributed-mapping directory
    #try:
        #os.makedirs("%s/ec_data" % directory_for_dist_map)
        #os.makedirs("/%s/ec_data" % tmpdir)
    #except:
        #pass

    try:
        os.makedirs("%s" % directory_for_dist_map)
    except:
        pass
    try:
        os.makedirs("scripts")
    except:
        pass
    try:
        os.makedirs("CA")
    except:
        pass
    try:
        os.makedirs("sge_log")
    except:
        pass

    input_fofn = makePypeLocalFile(input_fofn_fn)
    normalized_fasta = makePypeLocalFile("all_norm.fa")
    seed_fasta = makePypeLocalFile("seeds.fa")

    wf = PypeWorkflow()
    prepare_data_task = PypeTask(inputDataObjs={"input_fofn":input_fofn},
                                 outputDataObjs={"normalized_fasta":normalized_fasta},
                                 config = config ) (prepare_data)

    prepare_seed_reads_task = PypeTask(inputDataObjs = {"normalized_fasta":normalized_fasta},
                                       outputDataObjs = {"seed_fasta":seed_fasta},
                                       config = config)(prepare_seed_reads)
                
    wf.addTasks([prepare_data_task, prepare_seed_reads_task])


    m4_data_done = makePypeLocalFile("%s/m4_data_done" % directory_for_dist_map)
    dist_map_task = PypeTask(inputDataObjs = {"normalized_fasta":normalized_fasta, "seed_fasta":seed_fasta},
                    outputDataObjs = {"m4_data_done":m4_data_done},
                    config = config) (dist_map)

    m4filtering_done = makePypeLocalFile("%s/m4filtering_done" % directory_for_dist_map)
    m4filtering_task = PypeTask(inputDataObjs = {"m4_data_done":m4_data_done},
                       outputDataObjs = {"m4filtering_done":m4filtering_done},
                       config = config) (m4_filtering)

    preassembly_done = makePypeLocalFile("%s/preassembly_done" % directory_for_dist_map)
    get_preassembled_reads_task = PypeTask( inputDataObjs = {"normalized_fasta" : normalized_fasta, 
                                                            "seed_fasta" : seed_fasta, 
                                                            "m4filtering_done" : m4filtering_done},
                                            outputDataObjs = {"preassembly_done" : preassembly_done},
                                            config = config ) (get_preassembled_reads)

    wf.addTasks([dist_map_task, m4filtering_task, get_preassembled_reads_task])


    CA_done = makePypeLocalFile("CA_done")
    run_CA_task = PypeTask( inputDataObjs = {"preassembly_done" : preassembly_done},
                  outputDataObjs = {"CA_done": CA_done},
                  config = config )(run_CA)

    wf.addTasks([run_CA_task])
        
    Quiver_done = makePypeLocalFile("Quiver_done")
    quiver_reseq_task = PypeTask( inputDataObjs = {"CA_done": CA_done, "input_fofn":input_fofn},
                                  outputDataObjs = {"Quiver_done": Quiver_done},
                                  config = config) ( quiver_reseq )

    wf.addTasks([quiver_reseq_task])
    if config["target"] == "all":
        wf.refreshTargets([Quiver_done])
    elif config["target"] == "draft_assembly":
        wf.refreshTargets([CA_done])
    elif config["target"] == "pre_assembly":
        wf.refreshTargets([preassembly_done])
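
In this excerpt run_HGAP() reads only two configuration keys directly, "input_fofn_fn" and "target" (one of "all", "draft_assembly" or "pre_assembly"), and fills in "install_prefix" and "directory_for_dist_map" itself; the task functions it wires up (prepare_data, dist_map, run_CA and so on) receive the same dict and will need further keys not visible here. A minimal, purely illustrative call could therefore look like:

config = {
    "input_fofn_fn": "input.fofn",   # placeholder path: file-of-filenames, one input file per line
    "target": "draft_assembly",      # or "all" / "pre_assembly"
    # ...plus whatever keys prepare_data, dist_map, run_CA etc. expect
}
run_HGAP(config)
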
Beispiel #39
0
def testDistributed(runmode, cleanup):
    logger.info("test start")
    baseDir = "."
    import random
    random.seed(1984)
    #PypeThreadWorkflow.setNumThreadAllowed(20,20)
    #wf = PypeThreadWorkflow()
    PypeMPWorkflow.setNumThreadAllowed(20,20)
    wf = PypeMPWorkflow()
    allTasks = []
    for layer in range(5):
        fN = random.randint(3,7)
        fin = [None] * fN
        fout = [None] * fN
        fmut = [None] * fN
        for w in range(fN):
            fin[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer, w) )
            fout[w] = makePypeLocalFile(baseDir + "/testdata/testfile_l%d_w%d.dat" % (layer+1, w) )
            fmut[w] = makePypeLocalFile(baseDir + "/testdata/m_testfile_l%d_w%d.dat" % (layer+1, w) )
            #wf.addObjects([fin[w], fout[w], fmut[w]])

        for w in range(fN):
            inputDataObjs = {}
            outputDataObjs = {}
            mutableDataObjs = {}
            for i in range(5):
                inputDataObjs["infile%d" % i] = random.choice(fin)

            i = 0
            for obj in random.sample(fmut,2):
                #mutableDataObjs["outfile%d" % i] = obj
                i += 1
            outputDataObjs["outfile%d" % i] = fout[w]

            shellCmd = "sleep 1\n" + "\n".join([ "echo %d %d ...  >> %s" % (layer, w, of.localFileName) for of in outputDataObjs.values() ]) + "\nsleep 10"
            shellCmd += "\nsleep 1\n" + "\n".join([ "echo %d %d ...  >> %s" % (layer, w, of.localFileName) for of in mutableDataObjs.values() ]) + "\nsleep 10"
            shellFileName = baseDir + "/testdata/task_l%d_w%d.sh" % (layer, w)
            shfile = open(shellFileName, 'w')
            print(shellCmd, file=shfile)
            shfile.close()

            if runmode == "internal":
                def t1(self):
                    runShellCmd(["sleep", "%d" % random.randint(0,20) ])

                    for of in self.outputDataObjs.values():
                        runShellCmd(["touch", of.localFileName])

                task = PypeTask(inputDataObjs = inputDataObjs,
                                outputDataObjs = outputDataObjs, 
                                mutableDataObjs = mutableDataObjs,
                                URL="task://internal/task_l%d_w%d" % (layer, w), 
                                TaskType=PypeThreadTaskBase) ( t1 )

            elif runmode == "localshell":
                task = PypeShellTask(inputDataObjs = inputDataObjs,
                                     outputDataObjs = outputDataObjs, 
                                     mutableDataObjs = mutableDataObjs,
                                     URL="task://localshell/task_l%d_w%d" % (layer, w), 
                                     TaskType=PypeThreadTaskBase) ( "%s" % shellFileName )

            elif runmode == "sge": 
                task = PypeSGETask(inputDataObjs = inputDataObjs,
                                   outputDataObjs = outputDataObjs, 
                                   mutableDataObjs = mutableDataObjs,
                                   URL="task://sge/task_l%d_w%d" % (layer, w), 
                                   TaskType=PypeThreadTaskBase) ( "%s" % shellFileName )

            elif runmode == "mixed":
                #distributed = random.choice( (False, True) )
                distributed = True if w % 3 == 0 else False
                task = PypeDistributibleTask(inputDataObjs = inputDataObjs,
                                   outputDataObjs = outputDataObjs,
                                   mutableDataObjs = mutableDataObjs,
                                   URL="task://sge/task_l%d_w%d" % (layer, w), 
                                   distributed=distributed,
                                   TaskType=PypeThreadTaskBase) ( "%s" % shellFileName )

            wf.addTasks([task])
            allTasks.append(task)

    for URL in wf._pypeObjects:
        prereqJobURLs = [str(u) for u in wf._RDFGraph.transitive_objects(URIRef(URL), pypeNS["prereq"])
                                        if isinstance(wf._pypeObjects[str(u)], PypeLocalFile) and str(u) != URL ]
        if len(prereqJobURLs) == 0:
            if cleanup == "1":
                os.system("echo start > %s" % wf._pypeObjects[URL].localFileName)
            pass
    wf.refreshTargets(allTasks)
    dotFile = open("test.dot","w")
    #print >>dotFile, wf.graphvizShortNameDot
    print(wf.graphvizDot, file=dotFile)
    dotFile.close()
    dotFile = open("test_short_name.dot","w")
    print(wf.graphvizShortNameDot, file=dotFile)
    dotFile.close()
    rdfFile = open("test.rdf","w")
    print(wf.RDFXML, file=rdfFile)
    rdfFile.close()
    if runmode != "internal":
        mkFile = open("test.mk","w")
        print(wf.makeFileStr, file=mkFile)
        mkFile.close()
Beispiel #40
0
def main1(prog_name, input_config_fn, logger_config_fn=None):
    setup_logger(logger_config_fn)

    fc_run_logger.info( "fc_run started with configuration %s", input_config_fn ) 
    config = get_config(parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir  = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        make_dirs(d)

    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    input_fofn_plf = makePypeLocalFile(os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf},
                                  outputs = {"o_fofn": rawread_fofn_plf},
                                  parameters = {},
                                  TaskType = PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(make_fofn_abs_raw)
    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) 
        parameters = {"work_dir": rawread_dir,
                      "config": config}

        make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf},
                                      outputs = {"rdb_build_done": rdb_build_done}, 
                                      parameters = parameters,
                                      TaskType = PypeThreadTaskBase)

        build_rdb_task = make_build_rdb_task(build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done]) 
        

        db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks( rawread_dir, "raw_reads", db_file, rdb_build_done, config) 

        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs

        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )

        @PypeTask( inputs = daligner_out, 
                   outputs =  {"da_done":r_da_done},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/rda_check" )
        def check_r_da_task(self):
            os.system("touch %s" % fn(self.da_done))
        
        wf.addTask(check_r_da_task)
        wf.refreshTargets(updateFreq = wait_time) # a larger updateFreq suits runs with more jobs; refreshTargets must run here so the jobs finish before the concurrency limit is changed below
        
        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( rawread_dir, "raw_reads", r_da_done, config )
        wf.addTasks( merge_tasks )
        if config["target"] == "overlapping":
            wf.refreshTargets(updateFreq = wait_time) # a larger updateFreq suits runs with more jobs; refreshTargets must be called here to actually run the queued merge jobs before exiting
            sys.exit(0)
        wf.addTasks( consensus_tasks )

        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

        @PypeTask( inputs = consensus_out, 
                   outputs =  {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/cns_check" )
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn),  "w") as f:
                fn_list =  glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            os.system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq = wait_time) # larger number better for more jobs

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf},
                                     outputs = {"o_fofn": pread_fofn},
                                     parameters = {},
                                     TaskType = PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") ) 
    parameters = {"work_dir": pread_dir,
                  "config": config}

    make_build_pdb_task  = PypeTask( inputs = { "pread_fofn": pread_fofn },
                                    outputs = { "pdb_build_done": pdb_build_done },
                                    parameters = parameters,
                                    TaskType = PypeThreadTaskBase,
                                    URL = "task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done]) 



    db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" ))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks( pread_dir, "preads", db_file, pdb_build_done, config, pread_aln= True) 
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs

    p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") )

    @PypeTask( inputs = daligner_out, 
               outputs =  {"da_done":p_da_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pda_check" )
    def check_p_da_task(self):
        os.system("touch %s" % fn(self.da_done))
    
    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks( pread_dir, "preads", p_da_done, config )
    wf.addTasks( merge_tasks )
    #wf.refreshTargets(updateFreq = 30) #all            

    p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") )

    @PypeTask( inputs = merge_out, 
               outputs =  {"p_merge_done":p_merge_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pmerge_check" )
    def check_p_merge_check_task(self):
        os.system("touch %s" % fn(self.p_merge_done))
    
    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq = wait_time) #all            

    
    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )
    @PypeTask( inputs = {"p_merge_done": p_merge_done, "db_file":db_file}, 
               outputs =  {"falcon_asm_done":falcon_asm_done},
               parameters = {"wd": falcon_asm_dir,
                             "config": config,
                             "pread_dir": pread_dir},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/falcon" )

    def run_falcon_asm_task(self):
        wd = self.parameters["wd"]
        config = self.parameters["config"]
        install_prefix = config["install_prefix"]
        pread_dir = self.parameters["pread_dir"]
        script_dir = os.path.join( wd )
        script_fn =  os.path.join( script_dir ,"run_falcon_asm.sh" )
        
        script = []
        script.append( "set -vex" )
        script.append( "trap 'touch %s.exit' EXIT" % fn(self.falcon_asm_done) )
        script.append( "source {install_prefix}/bin/activate".format(install_prefix = install_prefix) )
        script.append( "cd %s" % pread_dir )
        # Write preads4falcon.fasta, in 1-preads_ovl:
        script.append( "DB2Falcon -U preads")
        script.append( "cd %s" % wd )
        script.append( """find %s/las_files -name "*.las" > las.fofn """ % pread_dir )
        overlap_filtering_setting = config["overlap_filtering_setting"]
        length_cutoff_pr = config["length_cutoff_pr"]
        script.append( """fc_ovlp_filter.py --db %s --fofn las.fofn %s --min_len %d > preads.ovl""" %\
                (fn(db_file), overlap_filtering_setting, length_cutoff_pr) )
        script.append( "ln -sf %s/preads4falcon.fasta ." % pread_dir)
        script.append( """fc_ovlp_to_graph.py preads.ovl --min_len %d > fc_ovlp_to_graph.log""" % length_cutoff_pr) # TODO: drop this logfile
        # Write 'p_ctg.fa' and 'a_ctg.fa':
        script.append( """fc_graph_to_contig.py""" )
        script.append( """touch %s""" % fn(self.falcon_asm_done))

        with open(script_fn, "w") as script_file:
            script_file.write("\n".join(script))

        job_data = make_job_data(self.URL, script_fn)
        job_data["sge_option"] = config["sge_option_fc"]
        run_script(job_data, job_type = config["job_type"])
        wait_for_file(fn(self.falcon_asm_done), task=self, job_name=job_data['job_name'])
    
    wf.addTask( run_falcon_asm_task )
    wf.refreshTargets(updateFreq = wait_time) #all            
Beispiel #41
0
def make_dirs(d):
    if not os.path.isdir(d):
        os.makedirs(d)


rawread_dir = os.path.abspath("./0-rawreads")
pread_dir = os.path.abspath("./1-preads_ovl")
asm_dir = os.path.abspath(os.path.join("./3-unzip/"))

read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
make_dirs(read_map_dir)

PypeMPWorkflow.setNumThreadAllowed(12, 12)
wf = PypeMPWorkflow()

rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
rawread_id_file = makePypeLocalFile(os.path.join(rawread_dir, "raw_read_ids"))


@PypeTask(
    inputs={"rawread_db": rawread_db},
    outputs={"rawread_id_file": rawread_id_file},
    TaskType=PypeThreadTaskBase,
    URL="task://localhost/dump_rawread_ids",
)
def dump_rawread_ids(self):
    rawread_db = fn(self.rawread_db)
    rawread_id_file = fn(self.rawread_id_file)
    os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file))

Beispiel #42
0
def generate_read_to_contig_map(rawread_dir=rawread_dir, pread_dir=pread_dir, asm_dir=asm_dir):

    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)

    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, "raw_reads.db"))
    rawread_id_file = makePypeLocalFile(os.path.join(rawread_dir, "raw_read_ids"))

    @PypeTask(
        inputs={"rawread_db": rawread_db},
        outputs={"rawread_id_file": rawread_id_file},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/dump_rawread_ids",
    )
    def dump_rawread_ids(self):
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file))

    wf.addTask(dump_rawread_ids)

    pread_db = makePypeLocalFile(os.path.join(pread_dir, "preads.db"))
    pread_id_file = makePypeLocalFile(os.path.join(pread_dir, "pread_ids"))

    @PypeTask(
        inputs={"pread_db": pread_db},
        outputs={"pread_id_file": pread_id_file},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/dump_pread_ids",
    )
    def dump_pread_ids(self):
        pread_db = fn(self.pread_db)
        pread_id_file = fn(self.pread_id_file)
        os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (pread_db, pread_id_file))

    wf.addTask(dump_pread_ids)

    all_raw_las_files = {}
    for las_fn in glob.glob(os.path.join(rawread_dir, "raw_reads.*.las")):
        idx = las_fn.split("/")[-1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_raw_las_files["r_las_%s" % idx] = las_file

    all_pread_las_files = {}
    for las_fn in glob.glob(os.path.join(pread_dir, "preads.*.las")):
        idx = las_fn.split("/")[-1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        las_file = makePypeLocalFile(las_fn)
        all_pread_las_files["p_las_%s" % idx] = las_file

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, "sg_edges_list"))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, "utg_data"))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, "ctg_paths"))

    inputs = {
        "rawread_id_file": rawread_id_file,
        "pread_id_file": pread_id_file,
        "sg_edges_list": sg_edges_list,
        "utg_data": utg_data,
        "ctg_paths": ctg_paths,
    }

    read_to_contig_map = makePypeLocalFile(os.path.join(read_map_dir, "read_to_contig_map"))

    @PypeTask(
        inputs=inputs,
        outputs={"read_to_contig_map": read_to_contig_map},
        TaskType=PypeThreadTaskBase,
        URL="task://localhost/get_ctg_read_map",
    )
    def generate_read_to_ctg_map(self):
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        pread_did_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        asm_G = AsmGraph(fn(self.sg_edges_list), fn(self.utg_data), fn(self.ctg_paths))

        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    pid = int(n.split(":")[0])

                    rid = pread_did_to_rid[pid].split("/")[1]
                    rid = int(int(rid) / 10)
                    oid = rid_to_oid[rid]
                    k = (pid, rid, oid)
                    pread_to_contigs.setdefault(k, set())
                    pread_to_contigs[k].add(ctg)

            for k in pread_to_contigs:
                pid, rid, oid = k
                for ctg in list(pread_to_contigs[k]):
                    print >> f, "%09d %09d %s %s" % (pid, rid, oid, ctg)

    wf.addTask(generate_read_to_ctg_map)

    def dump_rawread_to_ctg(self):
        rawread_db = fn(self.rawread_db)
        rawread_id_file = fn(self.rawread_id_file)
        # pread_id_file = fn( self.pread_id_file )
        las_file = fn(self.las_file)
        rawread_to_contig_file = fn(self.rawread_to_contig_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        rid_to_oid = open(rawread_id_file).read().split("\n")
        # pread_did_to_rid = open(pread_id_file).read().split("\n")

        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        rid_to_contigs = {}

        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                rid = int(rid)
                rid_to_contigs.setdefault(rid, (oid, set()))
                rid_to_contigs[rid][1].add(ctg)

        with open(rawread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            for row in sp.check_output(shlex.split("LA4Falcon -m %s %s " % (rawread_db, las_file))).splitlines():

                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id == None:
                        cur_read_id = q_id
                    else:
                        if len(ovlp_data) == 0:
                            o_id = rid_to_oid[cur_read_id]
                            print >> f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id

                if q_id in rid_to_contigs and len(ovlp_data) == 0:  # if the query is in some contig....
                    t_o_id, ctgs = rid_to_contigs[q_id]
                    o_id = rid_to_oid[q_id]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1

                if t_id not in rid_to_contigs:
                    continue

                t_o_id, ctgs = rid_to_contigs[t_id]
                o_id = rid_to_oid[q_id]

                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1

            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    def dump_pread_to_ctg(self):
        pread_db = fn(self.pread_db)
        rawread_id_file = fn(self.rawread_id_file)
        pread_id_file = fn(self.pread_id_file)
        read_to_contig_map = fn(self.read_to_contig_map)
        las_file = fn(self.las_file)
        pread_to_contig_file = fn(self.pread_to_contig_file)
        read_to_contig_map = fn(self.read_to_contig_map)

        pid_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        ovlp_data = []
        ovlp_count = 0
        longest_ovlp = 0
        a_id = None
        pid_to_contigs = {}

        with open(read_to_contig_map) as f:
            for row in f:
                row = row.strip().split()
                pid, rid, oid, ctg = row
                pid = int(pid)
                pid_to_contigs.setdefault(pid, (oid, set()))
                pid_to_contigs[pid][1].add(ctg)

        with open(pread_to_contig_file, "w") as f:
            ovlp_data = {}
            cur_read_id = None
            skip_rest = 0
            for row in sp.check_output(shlex.split("LA4Falcon -mo %s %s " % (pread_db, las_file))).splitlines():

                row = row.strip().split()
                t_id = int(row[1])
                q_id = int(row[0])
                if q_id != cur_read_id:
                    if cur_read_id == None:
                        cur_read_id = q_id
                    else:
                        if len(ovlp_data) == 0:
                            rid = pid_to_rid[cur_read_id].split("/")[1]
                            rid = int(int(rid) / 10)
                            o_id = rid_to_oid[rid]
                            print >> f, "%09d %s %s %d %d %d %d" % (cur_read_id, o_id, "NA", 0, 0, 0, 0)
                        else:
                            ovlp_v = ovlp_data.values()
                            ovlp_v.sort()
                            rank = 0
                            for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                                print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                                rank += 1
                        ovlp_data = {}
                        cur_read_id = q_id
                        skip_rest = 0

                if q_id in pid_to_contigs and len(ovlp_data) == 0:  # if the query is in some contig....
                    t_o_id, ctgs = pid_to_contigs[q_id]
                    rid = pid_to_rid[q_id].split("/")[1]
                    rid = int(int(rid) / 10)
                    o_id = rid_to_oid[rid]
                    for ctg in list(ctgs):
                        ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 1])
                        ovlp_data[ctg][0] = -int(row[7])
                        ovlp_data[ctg][1] += 1
                    skip_rest = 1

                if skip_rest == 1:
                    continue

                if t_id not in pid_to_contigs:
                    continue

                t_o_id, ctgs = pid_to_contigs[t_id]
                rid = pid_to_rid[q_id].split("/")[1]
                rid = int(int(rid) / 10)
                o_id = rid_to_oid[rid]

                for ctg in list(ctgs):
                    ovlp_data.setdefault(ctg, [0, 0, q_id, o_id, ctg, 0])
                    ovlp_data[ctg][0] += int(row[2])
                    ovlp_data[ctg][1] += 1

            if len(ovlp_data) != 0:
                ovlp_v = ovlp_data.values()
                ovlp_v.sort()
                rank = 0
                for score, count, q_id_, o_id, ctg, in_ctg in ovlp_v:
                    print >> f, "%09d %s %s %d %d %d %d" % (q_id_, o_id, ctg, count, rank, score, in_ctg)
                    rank += 1

    for las_key, las_file in all_raw_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[-1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
        make_dump_rawread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "rawread_db": rawread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file,
            },
            outputs={"rawread_to_contig_file": rawread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/r_read_to_contigs.%s" % idx,
        )
        dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg)
        wf.addTask(dump_rawread_to_ctg_task)

    for las_key, las_file in all_pread_las_files.items():
        las_fn = fn(las_file)
        idx = las_fn.split("/")[-1]  # well, we will use regex someday to parse to get the number
        idx = int(idx.split(".")[1])
        pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
        make_dump_pread_to_ctg = PypeTask(
            inputs={
                "las_file": las_file,
                "pread_db": pread_db,
                "read_to_contig_map": read_to_contig_map,
                "rawread_id_file": rawread_id_file,
                "pread_id_file": pread_id_file,
            },
            outputs={"pread_to_contig_file": pread_to_contig_file},
            TaskType=PypeThreadTaskBase,
            URL="task://localhost/pread_to_contigs.%s" % idx,
        )
        dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)
        wf.addTask(dump_pread_to_ctg_task)

    wf.refreshTargets()  # block
Beispiel #43
0
def make_dirs(d):
    if not os.path.isdir(d):
        os.makedirs(d)

rawread_dir = os.path.abspath( "./0-rawreads" )
pread_dir = os.path.abspath( "./1-preads_ovl" )
asm_dir = os.path.abspath( os.path.join("./3-unzip/") )

read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
make_dirs(read_map_dir)

PypeMPWorkflow.setNumThreadAllowed(12, 12)
wf = PypeMPWorkflow()

rawread_db = makePypeLocalFile( os.path.join( rawread_dir, "raw_reads.db" ) )
rawread_id_file = makePypeLocalFile( os.path.join( rawread_dir, "raw_read_ids" ) )

@PypeTask( inputs = {"rawread_db": rawread_db}, 
           outputs =  {"rawread_id_file": rawread_id_file},
           TaskType = PypeThreadTaskBase,
           URL = "task://localhost/dump_rawread_ids" )
def dump_rawread_ids(self):
    rawread_db = fn( self.rawread_db )
    rawread_id_file = fn( self.rawread_id_file )
    os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file) )

wf.addTask( dump_rawread_ids )

pread_db = makePypeLocalFile( os.path.join( pread_dir, "preads.db" ) )
pread_id_file = makePypeLocalFile( os.path.join( pread_dir, "pread_ids" ) )
Beispiel #44
0
def simpleTest():

    wf = PypeWorkflow() 
    
    # f1 and f2 are the mock input files
    f1 = makePypeLocalFile("test.fa")
    f2 = makePypeLocalFile("ref.fa")
    
    # f3 is the object of the expected output of the "testTask"
    f3 = makePypeLocalFile("aln.txt", readOnly=False)

    # create the mock files
    os.system("touch %s" % f1.localFileName)
    os.system("touch %s" % f2.localFileName)
   
    # the testTask will take f1 (as "testTask.fasta") and f2 (as "testTask.ref") and generate f3 (as "testTask.aln")
    @PypeTask(inputDataObjs={"fasta":f1, "ref":f2},
              outputDataObjs={"aln":f3},
              parameters={"a":10}, **{"b":12})
    def testTask(*argv, **kwargv):
        print("testTask is running")
        print("fasta input filename is %s" %  testTask.fasta.localFileName)
        for ft, f in testTask.outputDataObjs.iteritems():
            #os.system("touch %s" % f.localFileName)
            runShellCmd(["touch", "%s" % f.localFileName])
            runShellCmd(["sleep", "5" ])

    # the testTask will take f1 (as "testTask.fasta") and f3 (as "testTask.aln") and generate f4 (as "testTask.aln2")
    f4 = makePypeLocalFile("aln2.txt", readOnly=False)
    @PypeTask(inputDataObjs={"fasta":f1, "aln":f3},
              outputDataObjs={"aln2":f4},
              parameters={"a":10}, **{"b":12})
    def testTask2(*argv, **kwargv):
        print("testTask2 is running")
        for ft, f in testTask2.outputDataObjs.iteritems():
            #os.system("touch %s" % f.localFileName)
            runShellCmd(["touch", "%s" % f.localFileName])
    
    # one can add objects one by one to the workflow
    #wf.addObjects([f1,f2,f3,f4]) 
    #wf.addObjects([testTask, testTask2])
   
    # or, one can add the "tasks" into the workflow, the input and output data objects will be added automatically
    wf.addTasks([testTask, testTask2])

    #print out the RDFXML file that represents the workflow
    print (wf.RDFXML)
    #a graphviz dot for rendering the dependency graph if one
    print (wf.graphvizDot)

    # execute the workflow until f4 is updated
    wf.refreshTargets([f4])

    # mock the case that f1 is updated
    print("re-touch f1")
    os.system("sleep 1;touch %s;" % f1.localFileName)
    wf.refreshTargets([f4])

    # mock the case that f3 is updated
    print("re-touch f3")
    os.system("sleep 1;touch %s;" % f3.localFileName)
Beispiel #45
0
def run(
    wf,
    config,
    input_fofn_plf,
    setNumThreadAllowed,
):
    """
    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config[
        'stop_all_jobs_on_failure']  # only matter for parallel jobs
    concurrent_jobs = config["pa_concurrent_jobs"]
    setNumThreadAllowed(concurrent_jobs, concurrent_jobs)

    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, os.path.basename(config["input_fofn"])))
    make_fofn_abs_task = PypeTask(inputs={"i_fofn": input_fofn_plf},
                                  outputs={"o_fofn": rawread_fofn_plf},
                                  parameters={},
                                  TaskType=MyFakePypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, "sleep_done"))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, "run_jobs.sh"))
        parameters = {
            "work_dir": rawread_dir,
            "sge_option": config["sge_option_da"],
            "config": config
        }

        length_cutoff_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "length_cutoff"))
        raw_reads_db_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "%s.db" % "raw_reads"))
        make_build_rdb_task = PypeTask(inputs={"input_fofn": rawread_fofn_plf},
                                       outputs={
                                           "rdb_build_done": rdb_build_done,
                                           "raw_reads_db": raw_reads_db_plf,
                                           "length_cutoff": length_cutoff_plf,
                                           "run_jobs": run_jobs,
                                       },
                                       parameters=parameters,
                                       TaskType=MyFakePypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", rdb_build_done, config)

        wf.addTasks(daligner_tasks)
        r_da_done = makePypeLocalFile(os.path.join(rawread_dir, "da_done"))

        parameters = {
            "nblock": raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(inputs=daligner_out,
                                        outputs={"da_done": r_da_done},
                                        parameters=parameters,
                                        TaskType=MyFakePypeThreadTaskBase,
                                        URL="task://localhost/rda_check")
        check_r_da_task = make_daligner_gather(task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, merge_out, p_ids_merge_job_done = create_merge_tasks(
            fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks(merge_tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config["target"] == "overlapping":
            sys.exit(0)
        consensus_tasks, consensus_out = create_consensus_tasks(
            rawread_dir, "raw_reads", config, p_ids_merge_job_done)
        wf.addTasks(consensus_tasks)

        r_cns_done = makePypeLocalFile(os.path.join(rawread_dir, "cns_done"))
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, "input_preads.fofn"))

        @PypeTask(inputs=consensus_out,
                  outputs={
                      "cns_done": r_cns_done,
                      "pread_fofn": pread_fofn
                  },
                  TaskType=MyFakePypeThreadTaskBase,
                  URL="task://localhost/cns_check")
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn), "w") as f:
                fn_list = glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >> f, fa_fn
            system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)

        pre_assembly_report_plf = makePypeLocalFile(
            os.path.join(rawread_dir, "pre_assembly_stats.json")
        )  # though technically it needs pread_fofn
        make_task = PypeTask(inputs={
            "length_cutoff_fn": length_cutoff_plf,
            "raw_reads_db": raw_reads_db_plf,
            "preads_fofn": pread_fofn,
        },
                             outputs={
                                 "pre_assembly_report":
                                 pre_assembly_report_plf,
                             },
                             parameters=config,
                             TaskType=MyFakePypeThreadTaskBase,
                             URL="task://localhost/report_pre_assembly")
        task = make_task(task_report_pre_assembly)
        wf.addTask(task)

        concurrent_jobs = config["cns_concurrent_jobs"]
        setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

    if config["target"] == "pre-assembly":
        log.info("Quitting after stage-0 for 'pre-assembly' target.")
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(
            os.path.join(pread_dir, os.path.basename(config["input_fofn"])))
        make_fofn_abs_task = PypeTask(inputs={"i_fofn": rawread_fofn_plf},
                                      outputs={"o_fofn": pread_fofn},
                                      parameters={},
                                      TaskType=MyFakePypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, "pdb_build_done"))
    parameters = {
        "work_dir": pread_dir,
        "sge_option": config["sge_option_pda"],
        "config": config
    }

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(
        pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(inputs={"pread_fofn": pread_fofn},
                                   outputs={
                                       "pdb_build_done": pdb_build_done,
                                       "preads_db": preads_db,
                                       "run_jobs": run_jobs,
                                   },
                                   parameters=parameters,
                                   TaskType=MyFakePypeThreadTaskBase,
                                   URL="task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    config["sge_option_da"] = config["sge_option_pda"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs),
                                                         pread_dir,
                                                         "preads",
                                                         pdb_build_done,
                                                         config,
                                                         pread_aln=True)
    wf.addTasks(daligner_tasks)

    p_da_done = makePypeLocalFile(os.path.join(pread_dir, "da_done"))
    parameters = {
        "nblock": preads_nblock,
    }
    make_daligner_gather = PypeTask(inputs=daligner_out,
                                    outputs={"da_done": p_da_done},
                                    parameters=parameters,
                                    TaskType=MyFakePypeThreadTaskBase,
                                    URL="task://localhost/pda_check")
    check_p_da_task = make_daligner_gather(task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    config["sge_option_la"] = config["sge_option_pla"]
    merge_tasks, merge_out, _ = create_merge_tasks(fn(run_jobs), pread_dir,
                                                   "preads", p_da_done, config)
    wf.addTasks(merge_tasks)

    p_merge_done = makePypeLocalFile(os.path.join(pread_dir, "p_merge_done"))

    @PypeTask(inputs=merge_out,
              outputs={"p_merge_done": p_merge_done},
              TaskType=MyFakePypeThreadTaskBase,
              URL="task://localhost/pmerge_check")
    def check_p_merge_check_task(self):
        system("touch %s" % fn(self.p_merge_done))

    wf.addTask(check_p_merge_check_task)

    concurrent_jobs = config["ovlp_concurrent_jobs"]
    setNumThreadAllowed(concurrent_jobs, concurrent_jobs)

    wf.refreshTargets(exitOnFailure=exitOnFailure)

    db2falcon_done = makePypeLocalFile(
        os.path.join(pread_dir, "db2falcon_done"))
    make_run_db2falcon = PypeTask(inputs={
        "p_merge_done": p_merge_done,
    },
                                  outputs={"db2falcon_done": db2falcon_done},
                                  parameters={
                                      "wd": pread_dir,
                                      "config": config,
                                      "sge_option": config["sge_option_fc"],
                                  },
                                  TaskType=MyFakePypeThreadTaskBase,
                                  URL="task://localhost/db2falcon")
    wf.addTask(make_run_db2falcon(task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, "falcon_asm_done"))
    make_run_falcon_asm = PypeTask(
        inputs={
            "db2falcon_done": db2falcon_done,
            "db_file": preads_db
        },
        outputs={"falcon_asm_done": falcon_asm_done},
        parameters={
            "wd": falcon_asm_dir,
            "config": config,
            "pread_dir": pread_dir,
            "sge_option": config["sge_option_fc"],
        },
        TaskType=MyFakePypeThreadTaskBase,
        URL="task://localhost/falcon_asm")
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
Beispiel #46
0
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info( "fc_run started with configuration %s", input_config_fn ) 
    config = support.get_config(support.parse_config(input_config_fn))
    rawread_dir = os.path.abspath("./0-rawreads")
    pread_dir = os.path.abspath("./1-preads_ovl")
    falcon_asm_dir = os.path.abspath("./2-asm-falcon")
    script_dir = os.path.abspath("./scripts")
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    input_fofn_plf = makePypeLocalFile(os.path.basename(config["input_fofn_fn"]))
    rawread_fofn_plf = makePypeLocalFile(os.path.join(rawread_dir, os.path.basename(config["input_fofn_fn"])))
    make_fofn_abs_task = PypeTask(inputs = {"i_fofn": input_fofn_plf},
                                  outputs = {"o_fofn": rawread_fofn_plf},
                                  parameters = {},
                                  TaskType = PypeThreadTaskBase)
    fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile( os.path.join( rawread_dir, "sleep_done") )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) 
        run_jobs = makePypeLocalFile( os.path.join( rawread_dir, "run_jobs.sh") ) 
        parameters = {"work_dir": rawread_dir,
                      "config": config}

        make_build_rdb_task = PypeTask(inputs = {"input_fofn": rawread_fofn_plf},
                                      outputs = {"rdb_build_done": rdb_build_done,
                                                 "run_jobs": run_jobs}, 
                                      parameters = parameters,
                                      TaskType = PypeThreadTaskBase)
        build_rdb_task = make_build_rdb_task(task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done]) 

        db_file = makePypeLocalFile(os.path.join( rawread_dir, "%s.db" % "raw_reads" ))
        #### run daligner
        daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), rawread_dir, "raw_reads", db_file, rdb_build_done, config) 

        wf.addTasks(daligner_tasks)
        #wf.refreshTargets(updateFreq = 60) # larger number better for more jobs

        r_da_done = makePypeLocalFile( os.path.join( rawread_dir, "da_done") )

        @PypeTask( inputs = daligner_out, 
                   outputs =  {"da_done":r_da_done},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/rda_check" )
        def check_r_da_task(self):
            os.system("touch %s" % fn(self.da_done))
        
        wf.addTask(check_r_da_task)
        wf.refreshTargets(updateFreq = wait_time) # a larger updateFreq suits more jobs; refreshTargets must run here so these jobs execute before the concurrency limit is changed
        
        concurrent_jobs = config["cns_concurrent_jobs"]
        PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
        merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(fn(run_jobs), rawread_dir, "raw_reads", r_da_done, config)
        wf.addTasks( merge_tasks )
        if config["target"] == "overlapping":
            wf.refreshTargets(updateFreq = wait_time) # a larger updateFreq suits more jobs; refreshTargets must run here so these jobs execute before the concurrency limit is changed
            sys.exit(0)
        wf.addTasks( consensus_tasks )

        r_cns_done = makePypeLocalFile( os.path.join( rawread_dir, "cns_done") )
        pread_fofn = makePypeLocalFile( os.path.join( pread_dir,  "input_preads.fofn" ) )

        @PypeTask( inputs = consensus_out, 
                   outputs =  {"cns_done":r_cns_done, "pread_fofn": pread_fofn},
                   TaskType = PypeThreadTaskBase,
                   URL = "task://localhost/cns_check" )
        def check_r_cns_task(self):
            with open(fn(self.pread_fofn),  "w") as f:
                fn_list =  glob.glob("%s/preads/out*.fasta" % rawread_dir)
                fn_list.sort()
                for fa_fn in fn_list:
                    print >>f, fa_fn
            os.system("touch %s" % fn(self.cns_done))

        wf.addTask(check_r_cns_task)
        wf.refreshTargets(updateFreq = wait_time) # a larger updateFreq suits more jobs

    if config["target"] == "pre-assembly":
        sys.exit(0)

    # build pread database
    if config["input_type"] == "preads":
        pread_fofn = makePypeLocalFile(os.path.join(pread_dir, os.path.basename(config["input_fofn_fn"])))
        make_fofn_abs_task = PypeTask(inputs = {"i_fofn": rawread_fofn_plf},
                                     outputs = {"o_fofn": pread_fofn},
                                     parameters = {},
                                     TaskType = PypeThreadTaskBase)
        fofn_abs_task = make_fofn_abs_task(task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile( os.path.join( pread_dir, "pdb_build_done") ) 
    parameters = {"work_dir": pread_dir,
                  "config": config}

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    make_build_pdb_task  = PypeTask(inputs = { "pread_fofn": pread_fofn },
                                    outputs = { "pdb_build_done": pdb_build_done,
                                                "run_jobs": run_jobs},
                                    parameters = parameters,
                                    TaskType = PypeThreadTaskBase,
                                    URL = "task://localhost/build_pdb")
    build_pdb_task = make_build_pdb_task(task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done]) 



    db_file = makePypeLocalFile(os.path.join( pread_dir, "%s.db" % "preads" ))
    #### run daligner
    concurrent_jobs = config["ovlp_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    config["sge_option_da"] = config["sge_option_pda"]
    config["sge_option_la"] = config["sge_option_pla"]
    daligner_tasks, daligner_out = create_daligner_tasks(fn(run_jobs), pread_dir, "preads", db_file, pdb_build_done, config, pread_aln= True) 
    wf.addTasks(daligner_tasks)
    #wf.refreshTargets(updateFreq = 30) # larger number better for more jobs

    p_da_done = makePypeLocalFile( os.path.join( pread_dir, "da_done") )

    @PypeTask( inputs = daligner_out, 
               outputs =  {"da_done":p_da_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pda_check" )
    def check_p_da_task(self):
        os.system("touch %s" % fn(self.da_done))
    
    wf.addTask(check_p_da_task)

    merge_tasks, merge_out, consensus_tasks, consensus_out = create_merge_tasks(fn(run_jobs), pread_dir, "preads", p_da_done, config)
    wf.addTasks( merge_tasks )
    #wf.refreshTargets(updateFreq = 30) #all

    p_merge_done = makePypeLocalFile( os.path.join( pread_dir, "p_merge_done") )

    @PypeTask( inputs = merge_out, 
               outputs =  {"p_merge_done":p_merge_done},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/pmerge_check" )
    def check_p_merge_check_task(self):
        os.system("touch %s" % fn(self.p_merge_done))
    
    wf.addTask(check_p_merge_check_task)
    wf.refreshTargets(updateFreq = wait_time) #all

    
    falcon_asm_done = makePypeLocalFile( os.path.join( falcon_asm_dir, "falcon_asm_done") )
    make_run_falcon_asm = PypeTask(
               inputs = {"p_merge_done": p_merge_done, "db_file":db_file},
               outputs =  {"falcon_asm_done":falcon_asm_done},
               parameters = {"wd": falcon_asm_dir,
                             "config": config,
                             "pread_dir": pread_dir},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/falcon" )
    wf.addTask(make_run_falcon_asm(task_run_falcon_asm))
    wf.refreshTargets(updateFreq = wait_time) #all
Beispiel #47
0
from pypeflow.task import PypeTask, PypeShellTask, PypeSGETask, PypeDistributibleTask, PypeThreadTaskBase
from pypeflow.controller import PypeWorkflow, PypeThreadWorkflow, PypeMPWorkflow
from pypeflow.data import PypeLocalFile, makePypeLocalFile, fn
import logging
import os
import time

logger = logging.getLogger()
#logger.setLevel(logging.INFO)
logger.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)
logger.addHandler(ch)

inputs = {"input": makePypeLocalFile("/tmp/test1_input")}
outputs = {"output": makePypeLocalFile("/tmp/test1_output")}
os.system("touch /tmp/test1_input")

@PypeTask(inputs = inputs, outputs = outputs, TaskType = PypeThreadTaskBase)
def f(self):
    # Poll for a shutdown request for roughly a second before finishing.
    i = 0
    while True:
        time.sleep(0.1)
        if self.shutdown_event is not None and self.shutdown_event.is_set():
            break
        if i > 10:
            break
        i += 1
    # Only produce the output file if the workflow was not shut down.
    if self.shutdown_event is None or not self.shutdown_event.is_set():
        os.system("touch %s" % fn(self.output))
Beispiel #48
0
        "cwd": cwd,
        "sge_option": " -pe smp 6 -q huasm ",
        "script_fn": script_fn
    }
    run_script(job_data, job_type="SGE")
    wait_for_file(fn(self.job_done), task=self, job_name=job_name)


if __name__ == "__main__":
    prefix = sys.argv[1]
    concurrent_jobs = 64
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    job_id = 0
    db_file = makePypeLocalFile(os.path.abspath("./%s.db" % prefix))
    with open("run_jobs.sh") as f:
        for l in f:
            l = l.strip().split()
            if l[0] == "daligner":
                try:
                    os.makedirs("./job_%05d" % job_id)
                except OSError:
                    pass
                os.system(
                    "cd ./job_%05d;ln -s ../.%s.bps .; ln -s ../.%s.idx .; ln -s ../%s.db ."
                    % (job_id, prefix, prefix, prefix))
                job_done = makePypeLocalFile(
                    os.path.abspath("./job_%05d/job_%05d_done" %
                                    (job_id, job_id)))
                parameters = {
Beispiel #49
0
    ctg_id = args.ctg_id
    base_dir = args.base_dir
    
    ref_seq = "" 
    for r in FastaReader(fasta_fn):
        rid = r.name.split()[0]
        if rid != ctg_id:
            continue
        ref_seq = r.sequence.upper()

    PypeThreadWorkflow.setNumThreadAllowed(1, 1)
    wf = PypeThreadWorkflow()



    bam_file = makePypeLocalFile(bam_fn)
    vmap_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "variant_map") )
    vpos_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "variant_pos") )
    q_id_map_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "q_id_map") )
    parameters = {}
    parameters["ctg_id"] = ctg_id
    parameters["ref_seq"] = ref_seq
    parameters["base_dir"] = base_dir
    
    make_het_call_task = PypeTask( inputs = { "bam_file": bam_file },
                         outputs = { "vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file },
                         parameters = parameters,
                         TaskType = PypeThreadTaskBase,
                         URL = "task://localhost/het_call") (make_het_call)

    wf.addTasks([make_het_call_task])
Beispiel #50
0
    h_ctg_fa = FastaReader("./3-unzip/all_h_ctg.fa")
    for r in h_ctg_fa:
        rid = r.name.split()[0]
        ref_seq_data[rid] = r.sequence
        ctg_types[rid] = "h"

    ctg_ids = sorted( ref_seq_data.keys() )
    p_ctg_out=[]
    h_ctg_out=[]
    for ctg_id in ctg_ids:
        sequence = ref_seq_data[ctg_id]
        m_ctg_id = ctg_id.split("-")[0]
        wd = os.path.join( os.getcwd(), "./4-quiver/", m_ctg_id )
        mkdir( wd )
        ref_fasta = makePypeLocalFile(os.path.join(wd, "{ctg_id}_ref.fa".format(ctg_id = ctg_id) ) )
        read_sam = makePypeLocalFile(os.path.join( os.getcwd(), "./4-quiver/reads/" "{ctg_id}.sam".format(ctg_id = ctg_id) ) )
        cns_fasta = makePypeLocalFile(os.path.join(wd, "cns-{ctg_id}.fasta.gz".format(ctg_id = ctg_id) ) )
        cns_fastq = makePypeLocalFile(os.path.join(wd, "cns-{ctg_id}.fastq.gz".format(ctg_id = ctg_id) ) )
        job_done = makePypeLocalFile(os.path.join(wd, "{ctg_id}_quiver_done".format(ctg_id = ctg_id) ) )

        if os.path.exists(fn(read_sam)):
            if ctg_types[ctg_id] == "p":
                p_ctg_out.append( (cns_fasta, cns_fastq) )
            if ctg_types[ctg_id] == "h":
                h_ctg_out.append( (cns_fasta, cns_fastq) )
            if not os.path.exists(fn(ref_fasta)):
                with open(fn(ref_fasta),"w") as f:
                    print >>f, ">"+ctg_id
                    print >>f, sequence
            parameters = {"job_uid":"q-"+ctg_id, "wd": wd, "config":config, "ctg_id": ctg_id } 
Beispiel #51
0
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        try:
            os.makedirs(d)
        except:
            pass

    config = get_config(sys.argv[1])
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        input_h5_fofn = makePypeLocalFile(
            os.path.abspath(config["input_fofn_fn"]))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, "rdb_build_done"))
        parameters = {"work_dir": rawread_dir, "config": config}

        make_buid_rdb_task = PypeTask(
            inputs={"input_fofn": input_h5_fofn},
            outputs={"rdb_build_done": rdb_build_done},
            parameters=parameters,
            TaskType=PypeThreadTaskBase)

        buid_rdb_task = make_buid_rdb_task(build_rdb)

        wf.addTasks([buid_rdb_task])
        wf.refreshTargets([rdb_build_done])
Beispiel #52
0
    sge_log_dir = os.path.abspath("./sge_log")

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        try:
            os.makedirs(d)
        except:
            pass

    config = get_config(sys.argv[1])
    concurrent_jobs = config["pa_concurrent_jobs"]
    PypeThreadWorkflow.setNumThreadAllowed(concurrent_jobs, concurrent_jobs)
    wf = PypeThreadWorkflow()

    if config["input_type"] == "raw":
        #### import sequences into daligner DB
        input_h5_fofn = makePypeLocalFile( os.path.abspath( config["input_fofn_fn"] ) )
        rdb_build_done = makePypeLocalFile( os.path.join( rawread_dir, "rdb_build_done") ) 
        parameters = {"work_dir": rawread_dir,
                      "config": config}

        make_buid_rdb_task = PypeTask(inputs = {"input_fofn": input_h5_fofn},
                                      outputs = {"rdb_build_done": rdb_build_done}, 
                                      parameters = parameters,
                                      TaskType = PypeThreadTaskBase)

        buid_rdb_task = make_buid_rdb_task(build_rdb)

        wf.addTasks([buid_rdb_task])
        wf.refreshTargets([rdb_build_done]) 
        
Beispiel #53
0
def create_merge_tasks(run_jobs_fn, wd, db_prefix, input_dep, config):
    merge_tasks = []
    consensus_tasks = []
    merge_out = {}
    consensus_out = {}
    mjob_data = {}

    with open(run_jobs_fn) as f:
        for l in f:
            l = l.strip().split()
            if l[0] not in ("LAsort", "LAmerge", "mv"):
                continue
            if l[0] == "LAsort":
                # We now run this part w/ daligner, but we still need
                # a small script for some book-keeping.
                p_id = int(l[2].split(".")[1])
                mjob_data.setdefault(p_id, [])
                #mjob_data[p_id].append(  " ".join(l) ) # Already done w/ daligner!
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[2].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[2].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
            if l[0] == "mv":
                l2 = l[1].split(".")
                if l2[1][0] == "L":
                    p_id = int(l[1].split(".")[2])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))
                else:
                    p_id = int(l[1].split(".")[1])
                    mjob_data.setdefault(p_id, [])
                    mjob_data[p_id].append(" ".join(l))

    for p_id in mjob_data:
        s_data = mjob_data[p_id]

        support.make_dirs("%s/m_%05d" % (wd, p_id))
        support.make_dirs("%s/preads" % (wd))
        support.make_dirs("%s/las_files" % (wd))

        merge_script_file = os.path.abspath("%s/m_%05d/m_%05d.sh" %
                                            (wd, p_id, p_id))
        with open(merge_script_file, "w") as merge_script:
            #print >> merge_script, """for f in `find .. -wholename "*job*/%s.%d.%s.*.*.las"`; do ln -sf $f .; done""" % (db_prefix, p_id, db_prefix)
            for l in s_data:
                print >> merge_script, l
            print >> merge_script, "ln -sf ../m_%05d/%s.%d.las ../las_files" % (
                p_id, db_prefix, p_id)
            print >> merge_script, "ln -sf ./m_%05d/%s.%d.las .. " % (
                p_id, db_prefix, p_id)

        job_done = makePypeLocalFile(
            os.path.abspath("%s/m_%05d/m_%05d_done" % (wd, p_id, p_id)))
        parameters = {
            "merge_script": merge_script_file,
            "cwd": os.path.join(wd, "m_%05d" % p_id),
            "job_id": p_id,
            "config": config
        }

        make_merge_task = PypeTask(inputs={"input_dep": input_dep},
                                   outputs={"job_done": job_done},
                                   parameters=parameters,
                                   TaskType=PypeThreadTaskBase,
                                   URL="task://localhost/m_%05d_%s" %
                                   (p_id, db_prefix))
        merge_task = make_merge_task(task_run_las_merge)

        merge_out["mjob_%d" % p_id] = job_done
        merge_tasks.append(merge_task)

        out_file = makePypeLocalFile(
            os.path.abspath("%s/preads/out.%05d.fasta" % (wd, p_id)))
        out_done = makePypeLocalFile(
            os.path.abspath("%s/preads/c_%05d_done" % (wd, p_id)))
        parameters = {
            "cwd": os.path.join(wd, "preads"),
            "job_id": p_id,
            "prefix": db_prefix,
            "config": config
        }
        make_c_task = PypeTask(inputs={"job_done": job_done},
                               outputs={
                                   "out_file": out_file,
                                   "out_done": out_done
                               },
                               parameters=parameters,
                               TaskType=PypeThreadTaskBase,
                               URL="task://localhost/ct_%05d" % p_id)

        c_task = make_c_task(task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out["cjob_%d" % p_id] = out_done

    return merge_tasks, merge_out, consensus_tasks, consensus_out
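
# A small, self-contained sketch (an assumption, not from the original example)
# of the p_id extraction used by the parser above: for a two-level merge the
# block id sits in the third dot-separated field of the target name
# ("<prefix>.L2.<p_id>..."), otherwise in the second ("<prefix>.<p_id>...").
# The sample filenames below are invented purely to illustrate the split logic.
def _p_id_from_target(target):
    parts = target.split(".")
    return int(parts[2]) if parts[1][0] == "L" else int(parts[1])

assert _p_id_from_target("raw_reads.7.las") == 7
assert _p_id_from_target("raw_reads.L2.7.0.las") == 7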
Beispiel #54
0
            if l[0] == "LAsort":
                p_id = int( l[2].split(".")[1] )
                mjob_data.setdefault( p_id, [] )
                mjob_data[p_id].append(  " ".join(l) )
            if l[0] == "LAmerge":
                l2 = l[2].split(".")
                if l2[1] == "L2":
                    p_id = int(  l[2].split(".")[2] )
                    mjob_data.setdefault( p_id, [] )
                    mjob_data[p_id].append(  " ".join(l) )
                else:
                    p_id = int( l[2].split(".")[1] )
                    mjob_data.setdefault( p_id, [] )
                    mjob_data[p_id].append(  " ".join(l) )

    db_file = makePypeLocalFile(os.path.abspath( "./%s.db" % prefix ))

    for p_id in mjob_data:
        s_data = mjob_data[p_id]

        try:
            os.makedirs("./p_%05d" % p_id)
            os.makedirs("./p_%05d/sge_log" % p_id)
        except OSError:
            pass
        try:
            os.makedirs("./preads")
        except OSError:
            pass
        try:
            os.makedirs("./las_files")
Beispiel #55
0
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):

    read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
    make_dirs(read_map_dir)

    PypeMPWorkflow.setNumThreadAllowed(12, 12)
    wf = PypeMPWorkflow()

    rawread_db = makePypeLocalFile( os.path.join( rawread_dir, "raw_reads.db" ) )
    rawread_id_file = makePypeLocalFile( os.path.join( read_map_dir, "raw_read_ids" ) )

    @PypeTask( inputs = {"rawread_db": rawread_db},
               outputs =  {"rawread_id_file": rawread_id_file},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/dump_rawread_ids" )
    def dump_rawread_ids(self):
        rawread_db = fn( self.rawread_db )
        rawread_id_file = fn( self.rawread_id_file )
        os.system("DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" % (rawread_db, rawread_id_file) )

    wf.addTask( dump_rawread_ids )

    pread_db = makePypeLocalFile( os.path.join( pread_dir, "preads.db" ) )
    pread_id_file = makePypeLocalFile( os.path.join( read_map_dir, "pread_ids" ) )

    @PypeTask( inputs = {"pread_db": pread_db},
               outputs =  {"pread_id_file": pread_id_file},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/dump_pread_ids" )
    def dump_pread_ids(self):
        pread_db = fn( self.pread_db )
        pread_id_file = fn( self.pread_id_file )
        os.system("DBshow -n %s | tr -d '>' | LD_LIBRARY_PATH= awk '{print $1}' > %s" % (pread_db, pread_id_file) )

    wf.addTask( dump_pread_ids )

    wf.refreshTargets() # block

    sg_edges_list = makePypeLocalFile( os.path.join(asm_dir, "sg_edges_list") )
    utg_data = makePypeLocalFile( os.path.join(asm_dir, "utg_data") )
    ctg_paths = makePypeLocalFile( os.path.join(asm_dir, "ctg_paths") )

    inputs = { "rawread_id_file": rawread_id_file,
               "pread_id_file": pread_id_file,
               "sg_edges_list": sg_edges_list,
               "utg_data": utg_data,
               "ctg_paths": ctg_paths }

    read_to_contig_map = makePypeLocalFile( os.path.join(read_map_dir, "read_to_contig_map") )

    @PypeTask( inputs = inputs,
               outputs = {"read_to_contig_map": read_to_contig_map},
               TaskType = PypeThreadTaskBase,
               URL = "task://localhost/get_ctg_read_map" )
    def generate_read_to_ctg_map(self):
        rawread_id_file = fn( self.rawread_id_file )
        pread_id_file = fn( self.pread_id_file )
        read_to_contig_map = fn( self.read_to_contig_map )

        pread_did_to_rid = open(pread_id_file).read().split("\n")
        rid_to_oid = open(rawread_id_file).read().split("\n")

        asm_G = AsmGraph(fn(self.sg_edges_list),
                         fn(self.utg_data),
                         fn(self.ctg_paths) )

        pread_to_contigs = {}

        with open(read_to_contig_map, "w") as f:
            for ctg in asm_G.ctg_data:
                if ctg[-1] == "R":
                    continue
                ctg_g = asm_G.get_sg_for_ctg(ctg)
                for n in ctg_g.nodes():
                    pid = int(n.split(":")[0])

                    rid = pread_did_to_rid[pid].split("/")[1]
                    rid = int(int(rid)/10)
                    oid = rid_to_oid[rid]
                    k = (pid, rid, oid)
                    pread_to_contigs.setdefault( k, set() )
                    pread_to_contigs[ k ].add( ctg )


            for k in pread_to_contigs:
                pid, rid, oid = k
                for ctg in list(pread_to_contigs[ k ]):
                    print >>f, "%09d %09d %s %s" % (pid, rid, oid, ctg)

    wf.addTask( generate_read_to_ctg_map )

    wf.refreshTargets() # block
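
# A minimal reader sketch (not part of the original example) for the
# read_to_contig_map file written above. Each line holds
# "<pid> <rid> <oid> <ctg>" per the "%09d %09d %s %s" format string, so a
# simple split recovers the mapping; the function name is an assumption.
def load_read_to_contig_map(path):
    oid_to_ctgs = {}
    with open(path) as f:
        for line in f:
            pid, rid, oid, ctg = line.split()
            oid_to_ctgs.setdefault(oid, set()).add(ctg)
    return oid_to_ctgs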