Example 1
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    read_map_dir = os.path.abspath(os.path.join(asm_dir, 'read_maps'))
    make_dirs(read_map_dir)

    wf = PypeProcWatcherWorkflow(max_jobs=12)
    """
            job_type=config['job_type'],
            job_queue=config['job_queue'],
            sge_option=config.get('sge_option', ''),
            watcher_type=config['pwatcher_type'],
            watcher_directory=config['pwatcher_directory'])
    """

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, 'raw_reads.db'))
    rawread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids'))

    task = PypeTask(inputs={'rawread_db': rawread_db},
                    outputs={'rawread_id_file': rawread_id_file},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/dump_rawread_ids')
    wf.addTask(task(dump_rawread_ids))

    pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
    pread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids'))

    task = PypeTask(inputs={'pread_db': pread_db},
                    outputs={'pread_id_file': pread_id_file},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/dump_pread_ids')
    wf.addTask(task(dump_pread_ids))

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, 'sg_edges_list'))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, 'utg_data'))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, 'ctg_paths'))

    inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'sg_edges_list': sg_edges_list,
        'utg_data': utg_data,
        'ctg_paths': ctg_paths
    }

    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, 'get_ctg_read_map', 'read_to_contig_map'))

    task = PypeTask(inputs=inputs,
                    outputs={'read_to_contig_map': read_to_contig_map},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/get_ctg_read_map')
    wf.addTask(task(generate_read_to_ctg_map))

    wf.refreshTargets()  # block
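All of these examples share one currying pattern: PypeTask(...) builds a task factory, applying that factory to a plain function yields a task, and wf.refreshTargets() blocks until every declared output exists. A minimal end-to-end sketch of the pattern, assuming the pypeflow.simple_pwatcher_bridge import layout used by FALCON; the task body is hypothetical:

from pypeflow.simple_pwatcher_bridge import (
    PypeProcWatcherWorkflow, PypeTask, makePypeLocalFile, fn)

def write_greeting(self):
    # Declared inputs/outputs appear as PypeLocalFile attributes on `self`;
    # fn() maps a PypeLocalFile back to its filesystem path.
    with open(fn(self.greeting), 'w') as writer:
        writer.write('hello\n')

wf = PypeProcWatcherWorkflow(max_jobs=1)
greeting = makePypeLocalFile('greet/greeting.txt')
make_task = PypeTask(inputs={}, outputs={'greeting': greeting})
wf.addTask(make_task(write_greeting))
wf.refreshTargets()  # block until greet/greeting.txt exists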
Example 2
def create_tasks_pbalign(chunk_json_pfn, referenceset_pfn, parameters):
    """Create a pbalign task for each chunk, plus a gathering task.
    """
    tasks = list()
    gathering = dict()
    chunk_dir = os.path.dirname(fn(chunk_json_pfn))
    for i, subreadset_fn in enumerate(
            sorted(
                yield_pipeline_chunk_names_from_json(open(fn(chunk_json_pfn)),
                                                     '$chunk.subreadset_id'))):
        wdir = 'run-pbalign-{:02d}'.format(i)
        subreadset_fn = os.path.join(chunk_dir,
                                     os.path.basename(subreadset_fn))
        subreadset_pfn = makePypeLocalFile(subreadset_fn)
        unmapped_pfn = makePypeLocalFile(
            '{wdir}/unmapped.txt'.format(**locals()))
        alignmentset_pfn = makePypeLocalFile(
            '{wdir}/align.subreads.{i:02d}.alignmentset.xml'.format(
                **locals()))
        gathering['unmapped_{:02d}'.format(i)] = unmapped_pfn
        gathering['alignmentsets_{:02d}'.format(i)] = alignmentset_pfn
        """Also produces:
        aligned.subreads.i.alignmentset.bam
        aligned.subreads.i.alignmentset.bam.bai
        aligned.subreads.i.alignmentset.bam.pbi
        """
        make_task = PypeTask(
            inputs={
                "chunk_json": chunk_json_pfn,
                "dataset": subreadset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "alignmentset": alignmentset_pfn,
                "unmapped": unmapped_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_pbalign)
        tasks.append(task)
    o_alignmentset_pfn = makePypeLocalFile(
        'run-pbalign_gather/aligned.subreads.alignmentset.xml')
    o_unmapped_pfn = makePypeLocalFile('run-pbalign_gather/unmapped.txt')
    make_task = PypeTask(
        inputs=gathering,
        outputs={
            "o_ds": o_alignmentset_pfn,
            "o_unmapped": o_unmapped_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_gather)
    tasks.append(task)
    return tasks, o_alignmentset_pfn  # the gathered alignmentset
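Example 20 shows the call site; the returned gathered alignmentset later feeds the polished-assembly report (excerpt):

tasks, alignmentset_pfn = create_tasks_pbalign(pbalign_chunk_json_pfn,
                                               referenceset_pfn, parameters)
wf.addTasks(tasks)
wf.refreshTargets()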
Example 3
def create_consensus_tasks(basedir, scatter_fn):
    consensus_tasks = []
    consensus_out = {}
    content = json.loads(open(scatter_fn).read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        aligner = parameters['config']['aligner']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = int(parameters['job_id'])
        cns_label = 'cns_%05d' % p_id
        wdir = os.path.join(basedir, 'preads', cns_label)
        if aligner == 'minialign':
            for input_fn, input_fpath in inputs.items():
                inputs[input_fn] = makePypeLocalFile(input_fpath)
        make_c_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        c_task = make_c_task(pype_tasks.task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out['cjob_%d' % p_id] = outputs['out_file']
    return consensus_tasks, consensus_out
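For illustration, a hypothetical minimal scattered.json that this loop could consume; every path and value below is invented, and only the keys mirror what create_consensus_tasks() actually reads:

import json

sections = [{
    'parameters': {'job_id': 0, 'config': {'aligner': 'daligner'}},
    'inputs': {'las': '0-rawreads/m_00000/raw_reads.0.las'},  # invented path
    'outputs': {'out_file': 'out.00000.fasta'},  # invented path
    'URL': 'task://localhost/cns_00000',
}]
with open('scattered.json', 'w') as writer:
    json.dump(sections, writer, indent=2)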
Example 4
def create_merge_tasks(basedir, scatter_fn):
    tasks = []
    p_ids_merged_las = {}  # for consensus
    content = json.loads(open(scatter_fn).read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        aligner = parameters['config']['aligner']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = parameters['job_id']
        #merge_script = parameters['merge_script']
        #sge_option = parameters['sge_option']
        wdir = os.path.join(basedir, 'm_%05d' % p_id)
        if aligner == 'minialign':
            for input_fn, input_fpath in inputs.items():
                inputs[input_fn] = makePypeLocalFile(input_fpath)
        make_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        task = make_task(pype_tasks.task_run_las_merge if aligner ==
                         'daligner' else pype_tasks.task_run_aligner)
        tasks.append(task)
        ovl_file = task.outputs[
            'merged_las' if aligner == 'daligner' else
            'ovl_fn']  # these are relative, so we need the PypeLocalFiles
        p_ids_merged_las[p_id] = ovl_file
    return tasks, p_ids_merged_las
Example 5
def create_daligner_tasks(basedir, scatter_fn):
    tasks = []
    tasks_out = {}
    try:
        content = json.loads(open(scatter_fn).read())  # array of descriptions
    except Exception:
        msg = 'Failed to read JSON from {!r}'.format(scatter_fn)
        LOG.exception(msg)
        raise Exception(msg)
    for section in content:
        parameters = section['parameters']
        inputs = section['inputs']
        inputs['scatter_fn'] = os.path.abspath(scatter_fn)
        outputs = section['outputs']
        URL = section['URL']
        job_uid = parameters['job_uid']
        wdir = os.path.join(basedir, 'job_%s' % job_uid)
        make_daligner_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        daligner_task = make_daligner_task(pype_tasks.task_run_daligner)
        tasks.append(daligner_task)
        # these are relative, so we need the PypeLocalFiles
        tasks_out['ajob_%s' % job_uid] = daligner_task.outputs['job_done']
    return tasks, tasks_out
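Several of these snippets log through a module-level LOG that is defined outside the excerpt; the conventional stdlib setup (an assumption here, but the usual idiom) is simply:

import logging

LOG = logging.getLogger(__name__)  # module-level logger assumed by these snippets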
Example 6
def create_merge_tasks(basedir, scatter_fn):
    tasks = []
    p_ids_merged_las = {}  # for consensus
    content = json.loads(open(scatter_fn).read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = parameters['job_id']
        #merge_script = parameters['merge_script']
        #sge_option = parameters['sge_option']
        wdir = os.path.join(basedir, 'm_%05d' % p_id)
        make_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        task = make_task(pype_tasks.task_run_las_merge)
        tasks.append(task)
        las_fn = task.outputs[
            'merged_las']  # these are relative, so we need the PypeLocalFiles
        p_ids_merged_las[p_id] = las_fn
    return tasks, p_ids_merged_las
Example 7
def get_read_hctg_map(asm_dir, hasm_dir, read_to_contig_map_fn):
    wf = PypeProcWatcherWorkflow(
        max_jobs=12,  # TODO: Why was NumThreads ever set? There is only one task!
    )

    rawread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, 'read_maps/dump_rawread_ids/rawread_ids'))
    pread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, 'read_maps/dump_pread_ids/pread_ids'))
    h_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_h_ctg_edges'))
    p_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_p_ctg_edges'))
    h_ctg_ids = makePypeLocalFile(os.path.join(hasm_dir, "all_h_ctg_ids"))
    #make_dirs(os.path.dirname(os.path.abspath(read_to_contig_map_fn)))  # Workflow does this.

    read_to_contig_map_plf = makePypeLocalFile(read_to_contig_map_fn)

    inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'h_ctg_edges': h_ctg_edges,
        'p_ctg_edges': p_ctg_edges,
        'h_ctg_ids': h_ctg_ids
    }

    make_task = PypeTask(
        inputs=inputs,
        outputs={'read_to_contig_map': read_to_contig_map_plf},
    )
    wf.addTask(make_task(generate_read_to_hctg_map))
    wf.refreshTargets()  # block
Example 8
def create_quiver_jobs(scattered_quiver_plf):
    scattered_quiver_fn = fn(scattered_quiver_plf)
    jobs = json.loads(open(scattered_quiver_fn).read())
    #ctg_ids = sorted(jobs['ref_seq_data'])
    p_ctg_out = []
    h_ctg_out = []
    job_done_plfs = {}
    for job in jobs:
        ctg_id = job['ctg_id']
        m_ctg_id = ctg_id.split('-')[0]
        wd = os.path.join(os.getcwd(), './4-quiver/', m_ctg_id)
        ref_fasta = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_ref.fa'.format(ctg_id=ctg_id)))
        read_sam = makePypeLocalFile(
            os.path.join(
                os.getcwd(), './4-quiver/reads/'
                '{ctg_id}.sam'.format(ctg_id=ctg_id)))
        cns_fasta = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fasta.gz'.format(ctg_id=ctg_id)))
        cns_fastq = makePypeLocalFile(
            os.path.join(wd, 'cns-{ctg_id}.fastq.gz'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(wd, '{ctg_id}_quiver_done'.format(ctg_id=ctg_id)))

        if os.path.exists(fn(read_sam)):  # TODO(CD): Ask Jason what we should do if missing SAM.
            if ctg_types[ctg_id] == 'p':
                p_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            elif ctg_types[ctg_id] == 'h':
                h_ctg_out.append((fn(cns_fasta), fn(cns_fastq)))
            else:
                LOG.warning(
                    'Type is {!r}, not "p" or "h". Why are we running Quiver?'.
                    format(ctg_types[ctg_id]))
            parameters = {
                'job_uid': 'q-' + ctg_id,
                'wd': wd,
                'config': config,
                'ctg_id': ctg_id
            }
            make_quiver_task = PypeTask(
                inputs={
                    'ref_fasta': ref_fasta,
                    'read_sam': read_sam,
                    'scattered_quiver': scattered_quiver_plf,
                },
                outputs={
                    'cns_fasta': cns_fasta,
                    'cns_fastq': cns_fastq,
                    'job_done': job_done
                },
                parameters=parameters,
            )
            quiver_task = make_quiver_task(task_run_quiver)
            wf.addTask(quiver_task)
            job_done_plfs['{}'.format(ctg_id)] = job_done
    #sge_quiver = config['sge_quiver']
    return p_ctg_out, h_ctg_out, job_done_plfs
Example 9
def create_consensus_gather_task(wd, inputs):
    # Happens only in stage-0.
    preads_fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))

    make_cns_gather_task = PypeTask(
        inputs=inputs,  # consensus_out
        outputs={'preads_fofn': preads_fofn_plf},
    )
    task = make_cns_gather_task(pype_tasks.task_cns_gather)
    return task, preads_fofn_plf
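As wired up in Example 23, the per-chunk outputs of create_consensus_tasks() (Example 3) feed straight into this gather (excerpt, lightly trimmed):

tasks, consensus_out = create_consensus_tasks(rawread_dir, scattered_plf)
wf.addTasks(tasks)
task, preads_fofn_plf = create_consensus_gather_task(
    os.path.join(rawread_dir, 'preads'), consensus_out)
wf.addTask(task)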
Example 10
def create_ma_merge_gather_task(wd, inputs):
    ovl_fofn_plf = makePypeLocalFile(os.path.join(wd, 'ovl.fofn'))

    make_task = PypeTask(
        inputs=inputs,  # p_ids_merged_las
        outputs={
            'ovl_fofn': ovl_fofn_plf,
        },
    )
    task = make_task(pype_tasks.task_ma_merge_gather)
    return task, ovl_fofn_plf
Example 11
def create_consensus_gather_task(wd, inputs):
    # Happens only in stage-0.
    preads_fofn_plf = makePypeLocalFile(os.path.join(wd, 'input_preads.fofn'))

    make_cns_gather_task = PypeTask(
        inputs=inputs,  # consensus_out
        outputs={'preads_fofn': preads_fofn_plf},
        TaskType=MyFakePypeThreadTaskBase,
        URL='task://localhost/cns_gather',
    )
    task = make_cns_gather_task(pype_tasks.task_cns_gather)
    return task, preads_fofn_plf
Example 12
def main():
    lfn = 'logging-cfg.json'
    if os.path.exists(lfn):
        logging.config.dictConfig(json.load(open(lfn)))
    else:
        logging.basicConfig()
        logging.getLogger().setLevel(logging.NOTSET)
        try:
            import logging_tree
            logging_tree.printout()
        except ImportError:
            pass
    log.debug('DEBUG LOGGING ON')
    log.warning('Available via env: JOB_TYPE={}, SLEEP_S={}'.format(
        JOB_TYPE, SLEEP_S))
    exitOnFailure = False
    concurrent_jobs = 2
    Workflow = PypeProcWatcherWorkflow
    wf = Workflow(job_type=JOB_TYPE)
    wf.max_jobs = concurrent_jobs

    par = dict(sleep_s=SLEEP_S)
    DIR = 'mytmp'
    makedirs(DIR)
    f0 = makePypeLocalFile('mytmp/f0')
    f1 = makePypeLocalFile('mytmp/f1')
    make_task = PypeTask(
        inputs={},
        outputs={'f0': f0},
        parameters=par,
    )
    task = make_task(taskrun0)
    wf.addTasks([task])
    make_task = PypeTask(
        inputs={'f0': f0},
        outputs={'f1': f1},
        parameters=par,
    )
    task = make_task(taskrun1)
    wf.addTasks([task])
    wf.refreshTargets([task])
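taskrun0 and taskrun1 are defined elsewhere; below are hypothetical bodies consistent with the wiring above (the self-based attribute access and self.parameters follow the convention sketched after Example 1):

import time

from pypeflow.simple_pwatcher_bridge import fn


def taskrun0(self):
    # Hypothetical body: honor the sleep_s parameter, then produce f0.
    time.sleep(self.parameters['sleep_s'])
    with open(fn(self.f0), 'w') as writer:
        writer.write('done 0\n')


def taskrun1(self):
    # Hypothetical body: derive f1 from f0.
    data = open(fn(self.f0)).read()
    with open(fn(self.f1), 'w') as writer:
        writer.write(data + 'done 1\n')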
Example 13
def create_tasks_fasta2DB(split_subreadsets_fofn_pfn, parameters):
    tasks = list()
    next_inputs = dict()
    topdir = os.path.join(os.path.dirname(fn(split_subreadsets_fofn_pfn)),
                          'run-fastas2fofn')
    # Create the fastas in parallel.
    for i, chunk_fn in enumerate(
            open(fn(split_subreadsets_fofn_pfn)).read().splitlines()):
        wdir = os.path.join(topdir, 'fasta_job_{:03d}'.format(i))  # TODO: 02
        chunk_pfn = makePypeLocalFile(os.path.join(wdir, chunk_fn))
        fasta_done_fn = os.path.join(wdir,
                                     'chunk_{:03d}_done'.format(i))  # TODO: 02
        # By depending on a sentinel, we are allowed to delete fastas later.
        # Note: i might not match num in chunk_fn, but that is ok
        fasta_done_pfn = makePypeLocalFile(fasta_done_fn)
        make_task = PypeTask(
            inputs={
                "dataset": chunk_pfn,
            },
            outputs={
                "fasta_done": fasta_done_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_bam2fasta_dexta)
        tasks.append(task)
        next_inputs['fasta_{}_done'.format(i)] = fasta_done_pfn
        #fasta_fn = base_from_done(fasta_done_fn) + '.fasta'  # By convention.
    # Create the FOFN of fastas.
    fasta_fofn_fn = os.path.join(topdir, 'fasta.fofn')
    fasta_fofn_pfn = makePypeLocalFile(fasta_fofn_fn)
    make_task = PypeTask(
        inputs=next_inputs,
        outputs={
            "fofn": fasta_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_fastas2fofn)
    tasks.append(task)
    return tasks, fasta_fofn_pfn
Example 14
def run(
    wf,
    config,
):
    exitOnFailure = True
    #try:
    #    # Make it always re-run.
    #    os.remove('out.txt')
    #except Exception:
    #    LOG.exception('could not remove out.txt')
    o0 = makePypeLocalFile('hey0/out.txt')
    make_task = PypeTask(
        inputs={},
        outputs={'o0': o0},
        parameters={},
    )
    t0 = make_task(mymod.say_hey0)
    o1 = makePypeLocalFile('hey1/out.txt')
    make_task = PypeTask(
        inputs={'i0': o0},
        outputs={'o1': o1},
        parameters={},
    )
    t1 = make_task(mymod.say_hey1)
    wf.addTasks([t0, t1])  # for new-simple-way, we could add just t1
    N = int(os.environ.get('N', '1'))
    for i in range(N):
        make_task = PypeTask(
            inputs={},
            outputs={
                'out': 'touched',
            },
            #outputs = {'out': 'hey-{}/touched'.format(i),},
            parameters={},
            wdir='hey-{}'.format(i),
        )
        t = make_task(mymod.touchit)
        wf.addTask(t)
    wf.refreshTargets(exitOnFailure=exitOnFailure)
Example 15
def create_merge_gather_task(wd, inputs):
    las_fofn_plf = makePypeLocalFile(os.path.join(wd, 'las.fofn'))
    las_fopfn_plf = makePypeLocalFile(os.path.join(wd, 'las.fopfn'))

    make_task = PypeTask(
        inputs=inputs,  # p_ids_merged_las
        outputs={
            'las_fofn': las_fofn_plf,
            'las_fopfn': las_fopfn_plf,
        },
        TaskType=MyFakePypeThreadTaskBase,
        # URL='task://localhost/pmerge_gather'
    )
    task = make_task(pype_tasks.task_merge_gather)
    return task, las_fofn_plf, las_fopfn_plf
Example 16
def create_task_old():
    i1 = './in/i1'
    o1 = './run/dir1/o1.txt'
    i1 = makePypeLocalFile(i1)
    o1 = makePypeLocalFile(o1)
    parameters = {}
    make_task = PypeTask(
            inputs={
                'i1': i1,
            },
            outputs={
                'o1': o1,
            },
            parameters=parameters,
            )
    return make_task(taskA)
Example 17
def gen_task(script, inputs, outputs, parameters=None):
    def validate_dict(mydict):
        "Keys must be valid Python identifiers (and not keywords)."
        try:
            collections.namedtuple('validate', mydict.keys())
        except ValueError:
            LOG.exception('Bad key name in task definition dict {!r}'.format(mydict))
            raise
    parameters = dict(parameters or {})  # copy, so the caller's dict (or the default) is never mutated
    validate_dict(inputs)
    validate_dict(outputs)
    validate_dict(parameters)
    parameters['_bash_'] = script
    make_task = PypeTask(
            inputs={k: makePypeLocalFile(v) for k, v in inputs.items()},
            outputs={k: makePypeLocalFile(v) for k, v in outputs.items()},
            parameters=parameters,
            )
    return make_task(task_generic_bash_script)
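A hypothetical invocation (the script text and paths are invented); gen_task() stashes the script in parameters['_bash_'] for task_generic_bash_script to run:

task = gen_task(
    script='echo hello > o1.txt',  # invented one-liner
    inputs={'i1': 'in/i1.txt'},
    outputs={'o1': 'run/o1.txt'},
    parameters={},
)
wf.addTask(task)  # `wf` as in the other examples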
Example 18
def create_daligner_tasks(basedir, scatter_fn):
    tasks = []
    tasks_out = {}
    content = json.loads(open(scatter_fn).read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        job_uid = parameters['job_uid']
        wdir = os.path.join(basedir, 'job_%s' % job_uid)
        make_daligner_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            wdir=wdir,
        )
        daligner_task = make_daligner_task(pype_tasks.task_run_daligner)
        tasks.append(daligner_task)
        tasks_out['ajob_%s' % job_uid] = daligner_task.outputs[
            'job_done']  # these are relative, so we need the PypeLocalFiles
    return tasks, tasks_out
Example 19
def create_consensus_tasks(basedir, scatter_fn):
    consensus_tasks = []
    consensus_out = {}
    content = json.loads(open(scatter_fn).read())  # array of descriptions
    for section in content:
        parameters = section['parameters']
        inputs = section['inputs']
        inputs['scatter_fn'] = scatter_fn
        outputs = section['outputs']
        URL = section['URL']
        p_id = int(parameters['job_id'])
        cns_label = 'cns_%05d' % p_id
        wdir = os.path.join(basedir, 'preads', cns_label)
        make_c_task = PypeTask(
            inputs=inputs,
            outputs=outputs,
            parameters=parameters,
            TaskType=MyFakePypeThreadTaskBase,
            URL=URL,
            wdir=wdir,
        )
        c_task = make_c_task(pype_tasks.task_run_consensus)
        consensus_tasks.append(c_task)
        consensus_out['cjob_%d' % p_id] = outputs['out_file']
    return consensus_tasks, consensus_out
Example 20
def flow(config):
    #import pdb; pdb.set_trace()
    parameters = config
    #exitOnFailure=config['stop_all_jobs_on_failure'] # only matter for parallel jobs
    #wf.refreshTargets(exitOnFailure=exitOnFailure)
    #wf = PypeThreadWorkflow()
    #wf = PypeWorkflow()
    #wf = PypeWorkflow(job_type='local')
    log.debug('config=\n{}'.format(pprint.pformat(config)))
    # Set some defaults on the Workflow.
    concurrent_jobs = 24  # TODO: Configure this.
    wf = PypeWorkflow(
        job_type=config['hgap'].get('job_type'),
        job_queue=config['hgap'].get('job_queue'),
        watcher_type=config['hgap'].get('pwatcher_type', 'blocking'),
        #watcher_directory=config['pwatcher_directory'],
        max_jobs=config['hgap'].get('max_jobs', concurrent_jobs),
    )

    use_tmpdir = config['hgap'].get('use_tmpdir')
    if use_tmpdir:
        log.info('hgap.use_tmpdir={!r}'.format(use_tmpdir))
        if use_tmpdir is not True and '/' in use_tmpdir:
            tempfile.tempdir = use_tmpdir
            log.info('Using tempfile.tempdir={}'.format(tempfile.tempdir))
        else:
            log.info('Keeping tempfile.tempdir={}'.format(tempfile.tempdir))

    dataset_pfn = makePypeLocalFile(config['pbsmrtpipe']['input_files'][0])
    filtered_pfn = makePypeLocalFile('run-filterbam/filtered.subreadset.xml')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
        },
        outputs={
            "filtered": filtered_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_filterbam)
    wf.addTask(task)

    split_subreadsets_fofn_pfn = makePypeLocalFile(
        'run-bam_scatter/chunked_subreadsets.fofn')
    make_task = PypeTask(
        inputs={
            "dataset": filtered_pfn,
        },
        outputs={
            "split_subreadsets_fofn": split_subreadsets_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_bam_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, input_fofn_pfn = create_tasks_fasta2DB(split_subreadsets_fofn_pfn,
                                                  parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    fc_cfg_pfn = makePypeLocalFile('run-falcon/fc.cfg')
    fc_json_config_pfn = makePypeLocalFile('run-falcon/fc.json')
    make_task = PypeTask(
        inputs={
            "input_fofn": input_fofn_pfn,
        },
        outputs={
            "fc_cfg": fc_cfg_pfn,
            "fc_json_config": fc_json_config_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_prepare_falcon)
    wf.addTask(task)
    wf.refreshTargets()

    input_config_fn = fn(fc_cfg_pfn)
    with sys.cd('run-falcon'):
        falcon_kit.mains.run1.fc_run_logger = falcon_kit.run_support.logger = logging.getLogger(
            'falcon')
        fc_cfg = falcon_kit.run_support.get_dict_from_old_falcon_cfg(
            falcon_kit.run_support.parse_config(input_config_fn))
        # FALCON takes over the workflow for a while.
        # (For debugging, it is still possible to restart just fc_run, if desired.)
        falcon_asm_done_pfn = falcon_kit.mains.run1.run(
            wf,
            fc_cfg,
            input_config_fn,
            input_fofn_plf=input_fofn_pfn,  # _pfn should be _plf, but oh well
        )
        wf.max_jobs = concurrent_jobs  # in case Falcon changed this

    # Here is a hard-linking task to help us attach falcon into the dependency graph.
    falcon_link_done_pfn = makePypeLocalFile(
        'run-falcon_link/falcon_link_done')
    make_task = PypeTask(
        inputs={
            "falcon_asm_done": falcon_asm_done_pfn,
        },
        outputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_falcon_link)
    wf.addTask(task)

    # The rest of the workflow will operate on datasets, not fasta directly.
    referenceset_pfn = makePypeLocalFile(
        'run-fasta2referenceset/asm.referenceset.xml')
    make_task = PypeTask(
        inputs={
            "falcon_link_done": falcon_link_done_pfn,
        },
        outputs={
            "referenceset": referenceset_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_fasta2referenceset)
    wf.addTask(task)
    wf.refreshTargets()

    # scatter the subreads for pbalign
    """Produces:
    pbalign_chunk.json
    chunk_subreadset_*.subreadset.xml
    """
    pbalign_chunk_json_pfn = makePypeLocalFile(
        'run-pbalign-scatter/pbalign_chunk.json')
    make_task = PypeTask(
        inputs={
            "dataset": dataset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_json": pbalign_chunk_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_pbalign_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    # After scattering, we can specify the pbalign jobs.
    tasks, alignmentset_pfn = create_tasks_pbalign(pbalign_chunk_json_pfn,
                                                   referenceset_pfn,
                                                   parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # scatter the alignmentset for genomic_consensus (variantCaller)
    """Produces:
    gc.chunks.fofn
    ???*.contigset.xml ???
    """
    gc_chunks_fofn_pfn = makePypeLocalFile('run-gc_scatter/gc.chunks.fofn')
    make_task = PypeTask(
        inputs={
            "alignmentset": alignmentset_pfn,
            "referenceset": referenceset_pfn,
        },
        outputs={
            "out_fofn": gc_chunks_fofn_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_scatter)
    wf.addTask(task)
    wf.refreshTargets()

    tasks, contigset_pfn, gathered_fastq_pfn = create_tasks_gc(
        gc_chunks_fofn_pfn, referenceset_pfn, parameters)
    wf.addTasks(tasks)
    wf.refreshTargets()

    # Final report

    polished_assembly_report_json_pfn = makePypeLocalFile(
        'run-polished-assembly-report/polished_assembly_report.json')
    make_task = PypeTask(
        inputs={
            "referenceset": referenceset_pfn,
            "gathered_alignmentset": alignmentset_pfn,
            "polished_fastq": gathered_fastq_pfn,
        },
        outputs={
            "report_json": polished_assembly_report_json_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_polished_assembly_report)
    wf.addTask(task)

    wf.refreshTargets()

    par_dir = os.path.dirname(fn(polished_assembly_report_json_pfn))
    sys.symlink(os.path.join(par_dir, 'polished_coverage_vs_quality.png'))
    sys.symlink(os.path.join(par_dir,
                             'polished_coverage_vs_quality_thumb.png'))
    #return
    ##############

    if not os.path.exists('foo.bar1'):
        sys.system('touch foo.bar1')
    foo_fn1 = makePypeLocalFile('foo.bar1')
    foo_fn2 = makePypeLocalFile('foo.bar2')
    make_task = PypeTask(
        inputs={
            "foo1": foo_fn1,
        },
        outputs={
            "foo2": foo_fn2,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_foo)
    wf.addTask(task)
    wf.refreshTargets()
Example 21
def create_tasks_gc(fofn_pfn, referenceset_pfn, parameters):
    """Create a gc task for each chunk, plus a gathering task.
    Here is the convoluted workflow:
    1. For each gc instance "chunk":
      A. variantCaller writes .fasta
      B. We create a contigset for the .fasta
    2. We keep the contigset output filenames in a FOFN (from run_gc_scatter)
       and pass that to run_gc_gather().
    3. We read each contigset and add them to a gathered ContigSet.
    4. We "consolidate" their underlying .fasta "resources",
       assuming their filenames match except extension.
    5. Finally, we write the gathered contigset.
    Whew!
    We also gather fastq here, for convenience.
    """
    tasks = list()
    contigsets = dict()
    fastqs = dict()
    # Assume fofn of gc chunks are all relative to the dir of the fofn.
    for i, alignmentset_bn in enumerate(open(fn(fofn_pfn)).read().split()):
        alignmentset_fn = os.path.join(os.path.dirname(fn(fofn_pfn)),
                                       alignmentset_bn)
        wdir = 'run-gc-{:02}'.format(i)
        mkdirs(wdir)  # Assume CWD is correct.
        alignmentset_pfn = makePypeLocalFile(
            alignmentset_fn)  # New pfn cuz it was not pfn before.
        polished_fastq_pfn = makePypeLocalFile(
            os.path.join(wdir, 'consensus.fastq'))
        variants_gff_pfn = makePypeLocalFile(os.path.join(
            wdir, 'variants.gff'))
        consensus_contigset_pfn = makePypeLocalFile(
            os.path.join(wdir, 'consensus.contigset.xml'))
        """Also produces:
        consensus.fasta
        consensus.fasta.fai

        And note that these file names are important, as pbcoretools gathering expects
        a particular pattern.
        """
        contigsets['contigset_{:02d}'.format(i)] = consensus_contigset_pfn
        fastqs['fastq_{:02d}'.format(i)] = polished_fastq_pfn
        make_task = PypeTask(
            inputs={
                "alignmentset": alignmentset_pfn,
                "referenceset": referenceset_pfn,
            },
            outputs={
                "polished_fastq": polished_fastq_pfn,
                "variants_gff": variants_gff_pfn,
                "consensus_contigset": consensus_contigset_pfn,
            },
            parameters=parameters,
        )
        task = make_task(start_task.task_genomic_consensus)
        tasks.append(task)
    contigset_pfn = makePypeLocalFile('run-gc-gather/contigset.xml')
    gathered_fastq_pfn = makePypeLocalFile('run-gc-gather/gathered.fastq')
    inputs = dict(contigsets)
    inputs.update(fastqs)
    log.debug('inputs to gc_gather:{}'.format(pprint.pformat(inputs)))
    make_task = PypeTask(
        inputs=inputs,
        outputs={
            "ds_out": contigset_pfn,
            "fastq_out": gathered_fastq_pfn,
        },
        parameters=parameters,
    )
    task = make_task(start_task.task_gc_gather)
    tasks.append(task)
    return tasks, contigset_pfn, gathered_fastq_pfn
Example 22
                rank += 1


phased_reads = makePypeLocalFile(os.path.join(asm_dir, "all_phased_reads"))


for las_key, las_file in all_raw_las_files.items():
    las_fn = fn(las_file)
    idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
    idx = int(idx.split(".")[1]) 
    rawread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "rawread_to_contigs.%s" % idx))
    make_dump_rawread_to_ctg = PypeTask( inputs = { "las_file": las_file, 
                                                    "rawread_db": rawread_db, 
                                                    "read_to_contig_map": read_to_contig_map, 
                                                    "rawread_id_file": rawread_id_file,
                                                    "pread_id_file": pread_id_file,
                                                    "phased_reads" : phased_reads},
                                      outputs = { "rawread_to_contig_file": rawread_to_contig_file },
                                      TaskType = PypeThreadTaskBase,
                                      URL = "task://localhost/r_read_to_contigs.%s" % idx )
    dump_rawread_to_ctg_task = make_dump_rawread_to_ctg(dump_rawread_to_ctg)                           
    wf.addTask( dump_rawread_to_ctg_task )

for las_key, las_file in all_pread_las_files.items():
    las_fn = fn(las_file)
    idx = las_fn.split("/")[-1] # well, we will use regex someday to parse to get the number
    idx = int(idx.split(".")[1]) 
    pread_to_contig_file = makePypeLocalFile(os.path.join(read_map_dir, "pread_to_contigs.%s" % idx))
    make_dump_pread_to_ctg = PypeTask( inputs = { "las_file": las_file,
                                                  "pread_db": pread_db,
                                                  "read_to_contig_map": read_to_contig_map,
                                                  "pread_id_file": pread_id_file,
                                                  "phased_reads": phased_reads },
                                      outputs = { "pread_to_contig_file": pread_to_contig_file },
                                      TaskType = PypeThreadTaskBase,
                                      URL = "task://localhost/p_read_to_contigs.%s" % idx )
    dump_pread_to_ctg_task = make_dump_pread_to_ctg(dump_pread_to_ctg)  # mirrors the rawread task above
    wf.addTask( dump_pread_to_ctg_task )
Example 23
def run(
    wf,
    config,
    input_config_fn,
    input_fofn_plf,
):
    """
    Preconditions (for now):
    * fc_run_logger
    * run_support.logger
    """
    rawread_dir = os.path.abspath('./0-rawreads')
    pread_dir = os.path.abspath('./1-preads_ovl')
    falcon_asm_dir = os.path.abspath('./2-asm-falcon')
    script_dir = os.path.abspath('./scripts')
    sge_log_dir = os.path.abspath('./sge_log')

    for d in (rawread_dir, pread_dir, falcon_asm_dir, script_dir, sge_log_dir):
        support.make_dirs(d)

    exitOnFailure = config['stop_all_jobs_on_failure']  # only matters for parallel jobs
    wf.max_jobs = config['default_concurrent_jobs']

    rawread_fofn_plf = makePypeLocalFile(
        os.path.join(rawread_dir, 'raw-fofn-abs',
                     os.path.basename(config['input_fofn'])))
    make_fofn_abs_task = PypeTask(
        inputs={'i_fofn': input_fofn_plf},
        outputs={'o_fofn': rawread_fofn_plf},
        parameters={},
    )
    fofn_abs_task = make_fofn_abs_task(pype_tasks.task_make_fofn_abs_raw)

    wf.addTasks([fofn_abs_task])
    wf.refreshTargets([fofn_abs_task])

    if config['input_type'] == 'raw':
        #### import sequences into daligner DB
        sleep_done = makePypeLocalFile(os.path.join(rawread_dir, 'sleep_done'))
        rdb_build_done = makePypeLocalFile(
            os.path.join(rawread_dir, 'rdb_build_done'))
        run_jobs = makePypeLocalFile(os.path.join(rawread_dir, 'run_jobs.sh'))
        parameters = {
            'work_dir': rawread_dir,
            'sge_option': config['sge_option_da'],
            'config_fn': input_config_fn,
            'config': config
        }

        length_cutoff_plf = makePypeLocalFile(
            os.path.join(rawread_dir, 'length_cutoff'))
        raw_reads_db_plf = makePypeLocalFile(
            os.path.join(rawread_dir, '%s.db' % 'raw_reads'))
        make_build_rdb_task = PypeTask(
            inputs={'input_fofn': rawread_fofn_plf},
            outputs={
                'rdb_build_done': rdb_build_done,
                'raw_reads_db': raw_reads_db_plf,
                'length_cutoff': length_cutoff_plf,
                'run_jobs': run_jobs,
            },
            parameters=parameters,
        )
        build_rdb_task = make_build_rdb_task(pype_tasks.task_build_rdb)

        wf.addTasks([build_rdb_task])
        wf.refreshTargets([rdb_build_done])

        raw_reads_nblock = support.get_nblock(fn(raw_reads_db_plf))
        #### run daligner
        wf.max_jobs = config['da_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'daligner-scatter',
                                     'scattered.json')
        make_daligner_scatter = PypeTask(
            inputs={
                'run_jobs_fn': run_jobs,
                'db_build_done': rdb_build_done,
            },
            outputs={
                'scatter_fn': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'nblock': raw_reads_nblock,
                'pread_aln': False,
                'config': config,
            },
        )
        task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        daligner_tasks, daligner_out = create_daligner_tasks(
            rawread_dir, scattered_plf)

        wf.addTasks(daligner_tasks)
        r_gathered_las_plf = makePypeLocalFile(
            os.path.join(rawread_dir, 'raw-gather', 'gathered_las.txt'))

        parameters = {
            'nblock': raw_reads_nblock,
        }
        make_daligner_gather = PypeTask(
            inputs=daligner_out,
            outputs={'gathered': r_gathered_las_plf},
            parameters=parameters,
        )
        check_r_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
        wf.addTask(check_r_da_task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        # Merge .las files.
        wf.max_jobs = config['la_concurrent_jobs']
        scattered_plf = os.path.join(rawread_dir, 'merge-scatter',
                                     'scattered.json')
        make_task = PypeTask(
            inputs={
                'run_jobs': run_jobs,
                'gathered_las': r_gathered_las_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_merge_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        merge_tasks, p_ids_merged_las = create_merge_tasks(
            rawread_dir, scattered_plf)
        wf.addTasks(merge_tasks)
        task, _, las_fopfn_plf = create_merge_gather_task(
            os.path.join(rawread_dir, 'merge-gather'), p_ids_merged_las)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        if config['target'] == 'overlapping':
            sys.exit(0)

        # Produce new FOFN of preads fasta, based on consensus of overlaps.
        wf.max_jobs = config['cns_concurrent_jobs']

        scattered_plf = os.path.join(rawread_dir, 'cns-scatter',
                                     'scattered.json')
        make_task = PypeTask(
            inputs={
                'gathered': las_fopfn_plf,
                'db': raw_reads_db_plf,
            },
            outputs={
                'scattered': scattered_plf,
            },
            parameters={
                'db_prefix': 'raw_reads',
                'config': config,
            },
        )
        task = make_task(pype_tasks.task_consensus_scatter)
        wf.addTask(task)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        tasks, consensus_out = create_consensus_tasks(rawread_dir,
                                                      scattered_plf)
        wf.addTasks(tasks)
        wf.refreshTargets(exitOnFailure=exitOnFailure)

        task, preads_fofn_plf = create_consensus_gather_task(
            os.path.join(rawread_dir, 'preads'), consensus_out)
        wf.addTask(task)

        rdir = os.path.join(rawread_dir, 'report')
        pre_assembly_report_plf = makePypeLocalFile(
            os.path.join(rdir, 'pre_assembly_stats.json'))
        parameters = dict(config)
        parameters['cwd'] = rdir
        make_task = PypeTask(
            inputs={
                'length_cutoff_fn': length_cutoff_plf,
                'raw_reads_db': raw_reads_db_plf,
                'preads_fofn': preads_fofn_plf,
            },
            outputs={
                'pre_assembly_report': pre_assembly_report_plf,
            },
            parameters=parameters,
        )
        task = make_task(pype_tasks.task_report_pre_assembly)
        wf.addTask(task)

        wf.refreshTargets(exitOnFailure=exitOnFailure)

    if config['target'] == 'pre-assembly':
        log.info('Quitting after stage-0 for "pre-assembly" target.')
        sys.exit(0)

    # build pread database
    if config['input_type'] == 'preads':
        preads_fofn_plf = makePypeLocalFile(
            os.path.join(pread_dir, 'preads-fofn-abs',
                         os.path.basename(config['input_fofn'])))
        make_fofn_abs_task = PypeTask(
            inputs={'i_fofn': rawread_fofn_plf},
            outputs={'o_fofn': preads_fofn_plf},
            parameters={},
        )
        fofn_abs_task = make_fofn_abs_task(
            pype_tasks.task_make_fofn_abs_preads)
        wf.addTasks([fofn_abs_task])
        wf.refreshTargets([fofn_abs_task])

    pdb_build_done = makePypeLocalFile(
        os.path.join(pread_dir, 'pdb_build_done'))
    parameters = {
        'work_dir': pread_dir,
        'sge_option': config['sge_option_pda'],
        'config_fn': input_config_fn,
        'config': config
    }

    run_jobs = makePypeLocalFile(os.path.join(pread_dir, 'run_jobs.sh'))
    preads_db = makePypeLocalFile(os.path.join(
        pread_dir, 'preads.db'))  # Also .preads.*, of course.
    make_build_pdb_task = PypeTask(
        inputs={'preads_fofn': preads_fofn_plf},
        outputs={
            'pdb_build_done': pdb_build_done,
            'preads_db': preads_db,
            'run_jobs': run_jobs,
        },
        parameters=parameters,
    )
    build_pdb_task = make_build_pdb_task(pype_tasks.task_build_pdb)

    wf.addTasks([build_pdb_task])
    wf.refreshTargets([pdb_build_done])

    preads_nblock = support.get_nblock(fn(preads_db))
    #### run daligner
    wf.max_jobs = config['pda_concurrent_jobs']
    config['sge_option_da'] = config['sge_option_pda']

    scattered_plf = os.path.join(pread_dir, 'daligner-scatter',
                                 'scattered.json')
    make_daligner_scatter = PypeTask(
        inputs={
            'run_jobs_fn': run_jobs,
            'db_build_done': pdb_build_done,
        },
        outputs={
            'scatter_fn': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'nblock': preads_nblock,
            'pread_aln': True,
            'config': config,
        },
    )
    task = make_daligner_scatter(pype_tasks.task_daligner_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    daligner_tasks, daligner_out = create_daligner_tasks(
        pread_dir, scattered_plf)
    wf.addTasks(daligner_tasks)

    p_gathered_las_plf = makePypeLocalFile(
        os.path.join(pread_dir, 'gathered-las', 'gathered-las.txt'))
    parameters = {
        'nblock': preads_nblock,
    }
    make_daligner_gather = PypeTask(
        inputs=daligner_out,
        outputs={'gathered': p_gathered_las_plf},
        parameters=parameters,
    )
    check_p_da_task = make_daligner_gather(pype_tasks.task_daligner_gather)
    wf.addTask(check_p_da_task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Merge .las files.
    wf.max_jobs = config['pla_concurrent_jobs']
    config['sge_option_la'] = config['sge_option_pla']
    scattered_plf = os.path.join(pread_dir, 'merge-scatter', 'scattered.json')
    make_task = PypeTask(
        inputs={
            'run_jobs': run_jobs,
            'gathered_las': p_gathered_las_plf,
        },
        outputs={
            'scattered': scattered_plf,
        },
        parameters={
            'db_prefix': 'preads',
            'config': config,
        },
    )
    task = make_task(pype_tasks.task_merge_scatter)
    wf.addTask(task)
    wf.refreshTargets(exitOnFailure=exitOnFailure)

    merge_tasks, p_ids_merged_las = create_merge_tasks(pread_dir,
                                                       scattered_plf)
    wf.addTasks(merge_tasks)
    task, las_fofn_plf, las_fopfn_plf = create_merge_gather_task(
        os.path.join(pread_dir, 'merge-gather'), p_ids_merged_las)
    wf.addTask(task)

    wf.refreshTargets(exitOnFailure=exitOnFailure)

    # Draft assembly (called 'fc_' for now)
    wf.max_jobs = config['fc_concurrent_jobs']
    db2falcon_dir = os.path.join(pread_dir, 'db2falcon')
    db2falcon_done = makePypeLocalFile(
        os.path.join(db2falcon_dir, 'db2falcon_done'))
    preads4falcon_plf = makePypeLocalFile(
        os.path.join(db2falcon_dir, 'preads4falcon.fasta'))
    make_run_db2falcon = PypeTask(
        inputs={
            'las_fofn_plf': las_fofn_plf,
            'preads_db': preads_db,
        },
        outputs={
            'db2falcon_done': db2falcon_done,
            'preads4falcon': preads4falcon_plf,
        },
        parameters={
            'wd': db2falcon_dir,
            'config': config,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_db2falcon(pype_tasks.task_run_db2falcon))

    falcon_asm_done = makePypeLocalFile(
        os.path.join(falcon_asm_dir, 'falcon_asm_done'))
    make_run_falcon_asm = PypeTask(
        inputs={
            'db2falcon_done': db2falcon_done,
            'db_file': preads_db,
            'preads4falcon': preads4falcon_plf,
            'las_fofn': las_fofn_plf,
        },
        outputs={'falcon_asm_done': falcon_asm_done},
        parameters={
            'wd': falcon_asm_dir,
            'config': config,
            'pread_dir': pread_dir,
            'sge_option': config['sge_option_fc'],
        },
    )
    wf.addTask(make_run_falcon_asm(pype_tasks.task_run_falcon_asm))
    wf.refreshTargets()

    return falcon_asm_done
Example 24
def unzip_all(config):
    unzip_blasr_concurrent_jobs = config['unzip_blasr_concurrent_jobs']
    unzip_phasing_concurrent_jobs = config['unzip_phasing_concurrent_jobs']
    wf = PypeProcWatcherWorkflow(
        max_jobs=unzip_blasr_concurrent_jobs,
        job_type=config['job_type'],
        job_queue=config.get('job_queue'),
        sge_option=config.get('sge_option'),
        watcher_type=config.get('pwatcher_type'),
        #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
        use_tmpdir=config.get('use_tmpdir'),
    )

    ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
    falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
    wdir = os.path.abspath('./3-unzip/reads')
    parameters = {
        'wd': wdir,
        'config': config,
        'sge_option': config['sge_track_reads'],
    }
    job_done = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_done'))
    make_track_reads_task = PypeTask(
        inputs={'falcon_asm_done': falcon_asm_done},
        outputs={
            'job_done': job_done,
            'ctg_list_file': ctg_list_file
        },
        parameters=parameters,
        wdir=wdir,
    )
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)
    wf.refreshTargets()  # force refresh now; proper dependencies will be added later

    ctg_ids = []
    with open('./3-unzip/reads/ctg_list') as f:
        for row in f:
            row = row.strip()
            ctg_ids.append(row)

    aln1_outs = {}

    all_ctg_out = {}

    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))
        #mkdir(wd)
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id=ctg_id)))

        parameters = {
            'job_uid': 'aln-' + ctg_id,
            'wd': blasr_dir,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_blasr_aln'],
        }
        make_blasr_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'read_fasta': read_fasta
            },
            outputs={
                'ctg_aln_out': ctg_aln_out,
                'job_done': job_done
            },
            parameters=parameters,
        )
        blasr_task = make_blasr_task(task_run_blasr)
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(blasr_task)
    wf.refreshTargets()

    wf.max_jobs = unzip_phasing_concurrent_jobs
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))

        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))

        phasing_dir = os.path.join(wd, 'phasing')
        job_done = makePypeLocalFile(
            os.path.join(phasing_dir, 'p_{ctg_id}_done'.format(ctg_id=ctg_id)))
        rid_to_phase_out = makePypeLocalFile(
            os.path.join(
                wd,
                'rid_to_phase.{ctg_id}'.format(ctg_id=ctg_id)))  # TODO: ???
        all_ctg_out['r2p.{ctg_id}'.format(
            ctg_id=ctg_id)] = rid_to_phase_out  # implicit output?

        parameters = {
            'job_uid': 'ha-' + ctg_id,
            'wd': wd,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_phasing'],
        }
        make_phasing_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'aln_bam': ctg_aln_out
            },
            outputs={'job_done': job_done},
            parameters=parameters,
        )
        phasing_task = make_phasing_task(task_phasing)
        wf.addTask(phasing_task)
    wf.refreshTargets()

    hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
    #mkdir(hasm_wd)
    rid_to_phase_all = makePypeLocalFile(
        os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
    task = PypeTask(
        inputs=all_ctg_out,
        outputs={'rid_to_phase_all': rid_to_phase_all},
    )(get_rid_to_phase_all)
    wf.addTask(task)

    parameters['wd'] = hasm_wd
    parameters['sge_option'] = config['sge_hasm']
    job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
    make_hasm_task = PypeTask(
        inputs={'rid_to_phase_all': rid_to_phase_all},
        outputs={'job_done': job_done},
        parameters=parameters,
    )
    hasm_task = make_hasm_task(task_hasm)

    wf.addTask(hasm_task)

    wf.refreshTargets()
Example 25
def main(argv=sys.argv):
    global LOG
    LOG = support.setup_logger(None)

    if len(argv) < 2:
        print >> sys.stderr, 'you need to provide a configuration file to specify the cluster-running environment'
        sys.exit(1)

    config_fn = argv[1]
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    config = ConfigParser.ConfigParser()
    config.read(config_fn)

    job_type = 'SGE'
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    sge_track_reads = ' -pe smp 12 -q bigmem'
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = ' -pe smp 24 -q bigmem '
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = 'input_bam.fofn'
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)

    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip',
                                               'quiver_concurrent_jobs')

    config = {
        'job_type': job_type,
        'sge_quiver': sge_quiver,
        'sge_track_reads': sge_track_reads,
        'input_bam_fofn': input_bam_fofn,
        'smrt_bin': smrt_bin
    }
    LOG.info('config={}'.format(pprint.pformat(config)))

    #support.job_type = 'SGE' #tmp hack until we have a configuration parser

    wf = PypeProcWatcherWorkflow(max_jobs=quiver_concurrent_jobs)

    abscwd = os.path.abspath('.')
    parameters = {
        'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'),
        'config': config
    }
    hasm_done_plf = makePypeLocalFile(
        './3-unzip/1-hasm/hasm_done')  # by convention
    track_reads_h_done_plf = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_h_done'))
    make_track_reads_task = PypeTask(
        inputs={'hasm_done': hasm_done_plf},
        outputs={'job_done': track_reads_h_done_plf},
        parameters=parameters,
    )
    track_reads_task = make_track_reads_task(task_track_reads)
    #sge_track_reads = config['sge_track_reads']

    wf.addTask(track_reads_task)

    scattered_quiver_plf = makePypeLocalFile(
        '4-quiver/quiver_scatter/scattered.json')
    make_task = PypeTask(
        inputs={
            'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
            'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
            'track_reads_h_done': track_reads_h_done_plf,
        },
        outputs={
            'scattered_quiver_json': scattered_quiver_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_scatter_quiver))
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(
        scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')
    with open(fn(gathered_p_ctg_plf), 'w') as ofs:
        for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
            ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
    with open(fn(gathered_h_ctg_plf), 'w') as ofs:
        for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
            ofs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    make_task = PypeTask(
        inputs=job_done_plfs,
        outputs={
            'job_done': gather_done_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
        inputs={
            'gathered_p_ctg': gathered_p_ctg_plf,
            'gathered_h_ctg': gathered_h_ctg_plf,
            'gather_done': gather_done_plf,
        },
        outputs={
            'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
            'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
            'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
            'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
            'job_done': zcat_done_plf,
        },
    )
    wf.addTask(make_task(task_cns_zcat))

    wf.refreshTargets()
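The gathered p_ctg.txt/h_ctg.txt files written above are plain two-column text. A consumer such as task_cns_zcat could read them back like this (a sketch under that assumption, not the actual implementation):

def read_gathered(gathered_fn):
    # Each line is '<cns_fasta_fn> <cns_fastq_fn>', as written in main() above.
    with open(gathered_fn) as reader:
        for line in reader:
            cns_fasta_fn, cns_fastq_fn = line.split()
            yield cns_fasta_fn, cns_fastq_fn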
Example 26
def phasing(args):
    bam_fn = args.bam
    fasta_fn = args.fasta
    ctg_id = args.ctg_id
    base_dir = args.base_dir
    samtools = args.samtools

    ref_seq = ""
    for r in FastaReader(fasta_fn):
        rid = r.name.split()[0]
        if rid != ctg_id:
            continue
        ref_seq = r.sequence.upper()

    wf = PypeProcWatcherWorkflow(max_jobs=1)

    bam_file = makePypeLocalFile(bam_fn)
    vmap_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, 'het_call', "variant_map"))
    vpos_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, 'het_call', "variant_pos"))
    q_id_map_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, 'het_call', "q_id_map"))
    parameters = {}
    parameters["ctg_id"] = ctg_id
    parameters["ref_seq"] = ref_seq
    parameters["base_dir"] = base_dir
    parameters["samtools"] = samtools

    make_het_call_task = PypeTask(
        inputs={"bam_file": bam_file},
        outputs={"vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file},
        parameters=parameters,
    )(make_het_call)

    wf.addTasks([make_het_call_task])

    atable_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, 'g_atable', "atable"))
    parameters = {}
    parameters["ctg_id"] = ctg_id
    parameters["base_dir"] = base_dir
    generate_association_table_task = PypeTask(
        inputs={"vmap_file": vmap_file},
        outputs={"atable_file": atable_file},
        parameters=parameters,
    )(generate_association_table)

    wf.addTasks([generate_association_table_task])

    phased_variant_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, 'get_phased_blocks', "phased_variants"))
    get_phased_blocks_task = PypeTask(
        inputs={"vmap_file": vmap_file, "atable_file": atable_file},
        outputs={"phased_variant_file": phased_variant_file},
    )(get_phased_blocks)
    wf.addTasks([get_phased_blocks_task])

    phased_read_file = makePypeLocalFile(os.path.join(base_dir, ctg_id, "phased_reads"))
    get_phased_reads_task = PypeTask(
        inputs={"vmap_file": vmap_file,
                "q_id_map_file": q_id_map_file,
                "phased_variant_file": phased_variant_file},
        outputs={"phased_read_file": phased_read_file},
        parameters={"ctg_id": ctg_id},
    )(get_phased_reads)
    wf.addTasks([get_phased_reads_task])

    wf.refreshTargets()