Example #1
0
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global LOG
    LOG = run_support.setup_logger(logger_config_fn)
    lfs_setstripe_maybe(path='.', stripe=12)

    LOG.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = run_support.parse_cfg_file(input_config_fn)
        import json
        dumped = json.dumps(config, indent=2, separators=(',', ': '), sort_keys=True)
        LOG.info('cfg=\n{}'.format(dumped))
    except Exception:
        LOG.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise
    general_config = config['General']
    check_general_config(general_config, input_config_fn)
    input_fofn_fn = general_config['input_fofn']
    genome_size = int(general_config['genome_size'])
    squash = True if 0 < genome_size < 1000000 else False
    wf = PypeProcWatcherWorkflow(job_defaults=config['job.defaults'],
                                 squash=squash,
    )
    general_config['ver'] = '100'
    # Store config as JSON, available to many tasks.
    config_fn = './config.json' # must not be in a task-dir
    io.serialize(config_fn, config)
    run(wf, config,
        os.path.abspath(config_fn),
        input_fofn_fn=input_fofn_fn,
        )
Example #2
0
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global fc_run_logger
    fc_run_logger = support.setup_logger(logger_config_fn)

    fc_run_logger.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = support.get_dict_from_old_falcon_cfg(
            support.parse_config(input_config_fn))
    except Exception:
        fc_run_logger.exception(
            'Failed to parse config "{}".'.format(input_config_fn))
        raise
    input_fofn_plf = makePypeLocalFile(config['input_fofn'])
    genome_size = config.get('genome_size')
    squash = True if 0 < genome_size < 1000000 else False
    wf = PypeProcWatcherWorkflow(
        job_type=config['job_type'],
        job_queue=config['job_queue'],
        sge_option=config.get('sge_option', ''),
        watcher_type=config['pwatcher_type'],
        watcher_directory=config['pwatcher_directory'],
        use_tmpdir=config.get('use_tmpdir'),
        squash=squash)
    run(
        wf,
        config,
        os.path.abspath(input_config_fn),
        input_fofn_plf=input_fofn_plf,
    )
Example #3
0
def main(args):

    job_defaults = dict(
        njobs=1,
        NPROC=1,
        MB=24000,
        submit=submit,
        job_type='local',
        pwatcher_type='blocking',
    )

    wf = PypeProcWatcherWorkflow(job_defaults=job_defaults, )

    seq_dataset_lst = os.path.abspath(args["<reads.lst>"])
    read_db_abs_prefix, read_db = run_build_db(wf, args, seq_dataset_lst)
    LOG.info('Finished: {}'.format(read_db_abs_prefix))

    index_abs_prefix, read_idx = run_build_idx(wf, args, read_db_abs_prefix)
    LOG.info('Finished: {}'.format(index_abs_prefix))

    ovlp_in = {}
    ovlp_in.update(read_db)
    ovlp_in.update(read_idx)
    ovlp_out = run_overlapper(wf, args, read_db_abs_prefix, index_abs_prefix,
                              ovlp_in)

    LOG.info('Finished: {}'.format(ovlp_out))

    ctg_out = run_ovlp_to_ctg(wf, args, read_db_abs_prefix, read_db, ovlp_out)
    LOG.info('Finished: {}'.format(ctg_out))

    if args['--with-consensus']:
        cns_out = run_cns(wf, args, read_db_abs_prefix, read_db,
                          index_abs_prefix, read_idx, ctg_out)
        LOG.info('Finished: {}'.format(cns_out))
Example #4
0
def get_read_hctg_map(asm_dir, hasm_dir, read_to_contig_map_fn):
    wf = PypeProcWatcherWorkflow(
        max_jobs=
        12,  # TODO: Why was NumThreads ever set? There is only one task!
    )

    rawread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, 'read_maps/dump_rawread_ids/rawread_ids'))
    pread_id_file = makePypeLocalFile(
        os.path.join(asm_dir, 'read_maps/dump_pread_ids/pread_ids'))
    h_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_h_ctg_edges'))
    p_ctg_edges = makePypeLocalFile(os.path.join(hasm_dir, 'all_p_ctg_edges'))
    h_ctg_ids = makePypeLocalFile(os.path.join(hasm_dir, "all_h_ctg_ids"))
    #make_dirs(os.path.dirname(os.path.abspath(read_to_contig_map_fn)) # Workflow does this.

    read_to_contig_map_plf = makePypeLocalFile(read_to_contig_map_fn)

    inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'h_ctg_edges': h_ctg_edges,
        'p_ctg_edges': p_ctg_edges,
        'h_ctg_ids': h_ctg_ids
    }

    make_task = PypeTask(
        inputs=inputs,
        outputs={'read_to_contig_map': read_to_contig_map_plf},
    )
    wf.addTask(make_task(generate_read_to_hctg_map))
    wf.refreshTargets()  # block
Example #5
0
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global CFG
    setup_logger(logger_config_fn)
    LOG.info('config={!r}, log={!r}'.format(input_config_fn, logger_config_fn))
    CFG = parse_config(input_config_fn)['General']
    LOG.info('CFG=\n{}'.format(pprint.pformat(CFG)))

    wf = PypeProcWatcherWorkflow(
        job_type=CFG['job_type'],
        job_queue=CFG['job_queue'],
        watcher_type=CFG['watcher_type'],
        max_jobs=CFG.get('max_jobs', 2),
    )
    run(wf, CFG)
Example #6
0
def setup_workflow():
    PRODUCERS.clear()  # Forget any PypeTasks already defined.

    job_defaults = {
        'job_type': 'string',
        #'submit': 'bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}',
        'submit': 'bash -C ${CMD}',
        #'JOB_OPTS': '-pe smp 8 -q bigmem',
        'pwatcher_type': 'blocking',
        #'pwatcher_directory': config.get('pwatcher_directory', 'mypwatcher'),
        #'use_tmpdir': '/scratch',
        'njobs': 4,
    }
    wf = PypeProcWatcherWorkflow(job_defaults=job_defaults, )
    return wf
Example #7
0
def setup_workflow():
    config = {
        'job_type': 'string',
        'job_queue': 'bash -C ${CMD} >| ${STDOUT_FILE} 2>| ${STDERR_FILE}',
        #'job_queue': 'bash -C ${CMD}',
        #'sge_option': '-pe smp 8 -q bigmem',
        'pwatcher_type': 'blocking',
        #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
        #'use_tmpdir': '/scratch',
    }
    wf = PypeProcWatcherWorkflow(
        max_jobs=4,
        job_type=config['job_type'],
        job_queue=config.get('job_queue'),
        sge_option=config.get('sge_option'),
        watcher_type=config.get('pwatcher_type'),
        #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
        use_tmpdir=config.get('use_tmpdir'),
    )
    return wf
Example #8
0
def main1(prog_name, input_config_fn, logger_config_fn=None):
    global LOG
    LOG = support.setup_logger(logger_config_fn)
    lfs_setstripe_maybe(path='.', stripe=12)

    LOG.info('fc_run started with configuration %s', input_config_fn)
    try:
        config = support.parse_cfg_file(input_config_fn)
        import json
        dumped = json.dumps(config,
                            indent=2,
                            separators=(',', ': '),
                            sort_keys=True)
        LOG.info('cfg=\n{}'.format(dumped))
    except Exception:
        LOG.exception('Failed to parse config "{}".'.format(input_config_fn))
        raise
    general_config = config['General']
    assert 'input_fofn' in general_config, 'Missing "input_fofn" in {}.'.format(
        input_config_fn)
    input_fofn_plf = makePypeLocalFile(general_config['input_fofn'])
    genome_size = int(general_config.get('genome_size', '0'))
    squash = True if 0 < genome_size < 1000000 else False
    wf = PypeProcWatcherWorkflow(
        job_defaults=config['job.defaults'],
        squash=squash,
    )
    general_config['ver'] = '100'
    config_fn = './config.json'  # must not be in a task-dir
    io.serialize(config_fn, config)
    with open('foo.snake', 'w') as snakemake_writer:
        rule_writer = snakemake.SnakemakeRuleWriter(snakemake_writer)
        run(
            wf,
            config,
            rule_writer,
            os.path.abspath(config_fn),
            input_fofn_plf=input_fofn_plf,
        )
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    read_map_dir = os.path.abspath(os.path.join(asm_dir, 'read_maps'))
    make_dirs(read_map_dir)

    wf = PypeProcWatcherWorkflow(max_jobs=12, )
    """
            job_type=config['job_type'],
            job_queue=config['job_queue'],
            sge_option=config.get('sge_option', ''),
            watcher_type=config['pwatcher_type'],
            watcher_directory=config['pwatcher_directory'])
    """

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, 'raw_reads.db'))
    rawread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_rawread_ids', 'rawread_ids'))

    task = PypeTask(inputs={'rawread_db': rawread_db},
                    outputs={'rawread_id_file': rawread_id_file},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/dump_rawread_ids')
    wf.addTask(task(dump_rawread_ids))

    pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
    pread_id_file = makePypeLocalFile(
        os.path.join(read_map_dir, 'dump_pread_ids', 'pread_ids'))

    task = PypeTask(inputs={'pread_db': pread_db},
                    outputs={'pread_id_file': pread_id_file},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/dump_pread_ids')
    wf.addTask(task(dump_pread_ids))

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, 'sg_edges_list'))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, 'utg_data'))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, 'ctg_paths'))

    inputs = {
        'rawread_id_file': rawread_id_file,
        'pread_id_file': pread_id_file,
        'sg_edges_list': sg_edges_list,
        'utg_data': utg_data,
        'ctg_paths': ctg_paths
    }

    read_to_contig_map = makePypeLocalFile(
        os.path.join(read_map_dir, 'get_ctg_read_map', 'read_to_contig_map'))

    task = PypeTask(inputs=inputs,
                    outputs={'read_to_contig_map': read_to_contig_map},
                    TaskType=PypeThreadTaskBase,
                    URL='task://localhost/get_ctg_read_map')
    wf.addTask(task(generate_read_to_ctg_map))

    wf.refreshTargets()  # block
Example #10
0
def main(argv=sys.argv):
    global LOG
    LOG = support.setup_logger(None)

    if len(sys.argv) < 2:
        print >> sys.stderr, 'you need to provide a configuration file to specific a couple cluster running environment'
        sys.exit(1)

    config_fn = sys.argv[1]
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    config = ConfigParser.ConfigParser()
    config.read(config_fn)

    job_type = 'SGE'
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    sge_track_reads = ' -pe smp 12 -q bigmem'
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = ' -pe smp 24 -q bigmem '
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = 'input_bam.fofn'
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)

    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip',
                                               'quiver_concurrent_jobs')

    config = {
        'job_type': job_type,
        'sge_quiver': sge_quiver,
        'sge_track_reads': sge_track_reads,
        'input_bam_fofn': input_bam_fofn,
        'smrt_bin': smrt_bin
    }
    LOG.info('config={}'.format(pprint.pformat(config)))

    #support.job_type = 'SGE' #tmp hack until we have a configuration parser

    wf = PypeProcWatcherWorkflow(max_jobs=quiver_concurrent_jobs, )

    abscwd = os.path.abspath('.')
    parameters = {
        'wd': os.path.join(abscwd, '4-quiver', 'track_reads_h'),
        'config': config
    }
    hasm_done_plf = makePypeLocalFile(
        './3-unzip/1-hasm/hasm_done')  # by convention
    track_reads_h_done_plf = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_h_done'))
    make_track_reads_task = PypeTask(
        inputs={'hasm_done': hasm_done_plf},
        outputs={'job_done': track_reads_h_done_plf},
        parameters=parameters,
    )
    track_reads_task = make_track_reads_task(task_track_reads)
    #sge_track_reads = config['sge_track_reads']

    wf.addTask(track_reads_task)

    scattered_quiver_plf = makePypeLocalFile(
        '4-quiver/quiver_scatter/scattered.json')
    make_task = PypeTask(
        inputs={
            'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
            'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
            'track_reads_h_done': track_reads_h_done_plf,
        },
        outputs={
            'scattered_quiver_json': scattered_quiver_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_scatter_quiver))
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(
        scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')
    with open(fn(gathered_p_ctg_plf), 'w') as ifs:
        for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
            ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
    with open(fn(gathered_h_ctg_plf), 'w') as ifs:
        for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
            ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    make_task = PypeTask(
        inputs=job_done_plfs,
        outputs={
            'job_done': gather_done_plf,
        },
        parameters={},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile(
        '4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
        inputs={
            'gathered_p_ctg': gathered_p_ctg_plf,
            'gathered_h_ctg': gathered_h_ctg_plf,
            'gather_done': gather_done_plf,
        },
        outputs={
            'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
            'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
            'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
            'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
            'job_done': zcat_done_plf,
        },
    )
    wf.addTask(make_task(task_cns_zcat))

    wf.refreshTargets()
Example #11
0
def phasing(args):
    bam_fn = args.bam
    fasta_fn = args.fasta
    ctg_id = args.ctg_id
    base_dir = args.base_dir
    samtools = args.samtools

    ref_seq = ""
    for r in FastaReader(fasta_fn):
        rid = r.name.split()[0]
        if rid != ctg_id:
            continue
        ref_seq = r.sequence.upper()

    wf = PypeProcWatcherWorkflow(
            max_jobs=1,
    )

    bam_file = makePypeLocalFile(bam_fn)
    vmap_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_map") )
    vpos_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "variant_pos") )
    q_id_map_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'het_call', "q_id_map") )
    parameters = {}
    parameters["ctg_id"] = ctg_id
    parameters["ref_seq"] = ref_seq
    parameters["base_dir"] = base_dir
    parameters["samtools"] = samtools

    make_het_call_task = PypeTask( inputs = { "bam_file": bam_file },
                         outputs = { "vmap_file": vmap_file, "vpos_file": vpos_file, "q_id_map_file": q_id_map_file },
                         parameters = parameters,
    ) (make_het_call)

    wf.addTasks([make_het_call_task])




    atable_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'g_atable', "atable") )
    parameters = {}
    parameters["ctg_id"] = ctg_id
    parameters["base_dir"] = base_dir
    generate_association_table_task = PypeTask( inputs = { "vmap_file": vmap_file },
                                      outputs = { "atable_file": atable_file },
                                      parameters = parameters,
    ) (generate_association_table)

    wf.addTasks([generate_association_table_task])




    phased_variant_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, 'get_phased_blocks', "phased_variants") )
    get_phased_blocks_task = PypeTask( inputs = { "vmap_file": vmap_file, "atable_file": atable_file },
                                      outputs = { "phased_variant_file": phased_variant_file },
    ) (get_phased_blocks)
    wf.addTasks([get_phased_blocks_task])




    phased_read_file = makePypeLocalFile( os.path.join(base_dir, ctg_id, "phased_reads") )
    get_phased_reads_task = PypeTask( inputs = { "vmap_file": vmap_file,
                                                 "q_id_map_file": q_id_map_file,
                                                 "phased_variant_file": phased_variant_file },
                                      outputs = { "phased_read_file": phased_read_file },
                                      parameters = {"ctg_id": ctg_id},
    ) (get_phased_reads)
    wf.addTasks([get_phased_reads_task])


    wf.refreshTargets()
Example #12
0
def unzip_all(config):
    unzip_blasr_concurrent_jobs = config['unzip_blasr_concurrent_jobs']
    unzip_phasing_concurrent_jobs = config['unzip_phasing_concurrent_jobs']
    wf = PypeProcWatcherWorkflow(
        max_jobs=unzip_blasr_concurrent_jobs,
        job_type=config['job_type'],
        job_queue=config.get('job_queue'),
        sge_option=config.get('sge_option'),
        watcher_type=config.get('pwatcher_type'),
        #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
        use_tmpdir=config.get('use_tmpdir'),
    )

    ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
    falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
    wdir = os.path.abspath('./3-unzip/reads')
    parameters = {
        'wd': wdir,
        'config': config,
        'sge_option': config['sge_track_reads'],
    }
    job_done = makePypeLocalFile(
        os.path.join(parameters['wd'], 'track_reads_done'))
    make_track_reads_task = PypeTask(
        inputs={'falcon_asm_done': falcon_asm_done},
        outputs={
            'job_done': job_done,
            'ctg_list_file': ctg_list_file
        },
        parameters=parameters,
        wdir=wdir,
    )
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)
    wf.refreshTargets()  #force refresh now, will put proper dependence later

    ctg_ids = []
    with open('./3-unzip/reads/ctg_list') as f:
        for row in f:
            row = row.strip()
            ctg_ids.append(row)

    aln1_outs = {}

    all_ctg_out = {}

    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))
        #mkdir(wd)
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))
        job_done = makePypeLocalFile(
            os.path.join(blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id=ctg_id)))

        parameters = {
            'job_uid': 'aln-' + ctg_id,
            'wd': blasr_dir,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_blasr_aln'],
        }
        make_blasr_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'read_fasta': read_fasta
            },
            outputs={
                'ctg_aln_out': ctg_aln_out,
                'job_done': job_done
            },
            parameters=parameters,
        )
        blasr_task = make_blasr_task(task_run_blasr)
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(blasr_task)
    wf.refreshTargets()

    wf.max_jobs = unzip_phasing_concurrent_jobs
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id=ctg_id))
        read_fasta = makePypeLocalFile(
            './3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id=ctg_id))

        # outputs
        wd = os.path.join(
            os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id=ctg_id))

        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(
            os.path.join(blasr_dir,
                         '{ctg_id}_sorted.bam'.format(ctg_id=ctg_id)))

        phasing_dir = os.path.join(wd, 'phasing')
        job_done = makePypeLocalFile(
            os.path.join(phasing_dir, 'p_{ctg_id}_done'.format(ctg_id=ctg_id)))
        rid_to_phase_out = makePypeLocalFile(
            os.path.join(
                wd,
                'rid_to_phase.{ctg_id}'.format(ctg_id=ctg_id)))  # TODO: ???
        all_ctg_out['r2p.{ctg_id}'.format(
            ctg_id=ctg_id)] = rid_to_phase_out  # implicit output?

        parameters = {
            'job_uid': 'ha-' + ctg_id,
            'wd': wd,
            'config': config,
            'ctg_id': ctg_id,
            'sge_option': config['sge_phasing'],
        }
        make_phasing_task = PypeTask(
            inputs={
                'ref_fasta': ref_fasta,
                'aln_bam': ctg_aln_out
            },
            outputs={'job_done': job_done},
            parameters=parameters,
        )
        phasing_task = make_phasing_task(task_phasing)
        wf.addTask(phasing_task)
    wf.refreshTargets()

    hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
    #mkdir(hasm_wd)
    rid_to_phase_all = makePypeLocalFile(
        os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
    task = PypeTask(
        inputs=all_ctg_out,
        outputs={'rid_to_phase_all': rid_to_phase_all},
    )(get_rid_to_phase_all)
    wf.addTask(task)

    parameters['wd'] = hasm_wd
    parameters['sge_option'] = config['sge_hasm']
    job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
    make_hasm_task = PypeTask(
        inputs={'rid_to_phase_all': rid_to_phase_all},
        outputs={'job_done': job_done},
        parameters=parameters,
    )
    hasm_task = make_hasm_task(task_hasm)

    wf.addTask(hasm_task)

    wf.refreshTargets()
Example #13
0
import shlex
import os

def make_dirs(d):
    if not os.path.isdir(d):
        os.makedirs(d)

rawread_dir = os.path.abspath( "./0-rawreads" )
pread_dir = os.path.abspath( "./1-preads_ovl" )
asm_dir = os.path.abspath( os.path.join("./3-unzip/") )

read_map_dir = os.path.abspath(os.path.join(asm_dir, "read_maps"))
make_dirs(read_map_dir)

wf = PypeProcWatcherWorkflow(
        max_jobs=12,
)

rawread_db = makePypeLocalFile( os.path.join( rawread_dir, "raw_reads.db" ) )
rawread_id_file = makePypeLocalFile( os.path.join( rawread_dir, "raw_read_ids" ) )

@PypeTask( inputs = {"rawread_db": rawread_db}, 
           outputs =  {"rawread_id_file": rawread_id_file},
           TaskType = PypeThreadTaskBase,
           URL = "task://localhost/dump_rawread_ids" )
def dump_rawread_ids(self):
    rawread_db = fn( self.rawread_db )
    rawread_id_file = fn( self.rawread_id_file )
    os.system("DBshow -n %s | tr -d '>' | awk '{print $1}' > %s" % (rawread_db, rawread_id_file) )

wf.addTask( dump_rawread_ids )
def get_read_ctg_map(rawread_dir, pread_dir, asm_dir):
    read_map_dir = os.path.abspath(os.path.join(asm_dir, 'read_maps'))
    make_dirs(read_map_dir)

    wf = PypeProcWatcherWorkflow(
        max_jobs=12,
    )
    """
            job_type=config['job_type'],
            job_queue=config['job_queue'],
            sge_option=config.get('sge_option', ''),
            watcher_type=config['pwatcher_type'],
            watcher_directory=config['pwatcher_directory'])
    """

    rawread_db = makePypeLocalFile(os.path.join(rawread_dir, 'raw_reads.db'))
    rawread_id_file = makePypeLocalFile(os.path.join(
        read_map_dir, 'dump_rawread_ids', 'rawread_ids'))

    task = PypeTask(
        inputs={'rawread_db': rawread_db},
        outputs={'rawread_id_file': rawread_id_file},
    )
    wf.addTask(task(pype_tasks.task_dump_rawread_ids))

    pread_db = makePypeLocalFile(os.path.join(pread_dir, 'preads.db'))
    pread_id_file = makePypeLocalFile(os.path.join(
        read_map_dir, 'dump_pread_ids', 'pread_ids'))

    task = PypeTask(
        inputs={'pread_db': pread_db},
        outputs={'pread_id_file': pread_id_file},
    )
    wf.addTask(task(pype_tasks.task_dump_pread_ids))

    wf.refreshTargets()  # block

    sg_edges_list = makePypeLocalFile(os.path.join(asm_dir, 'sg_edges_list'))
    utg_data = makePypeLocalFile(os.path.join(asm_dir, 'utg_data'))
    ctg_paths = makePypeLocalFile(os.path.join(asm_dir, 'ctg_paths'))

    inputs = {'rawread_id_file': rawread_id_file,
              'pread_id_file': pread_id_file,
              'sg_edges_list': sg_edges_list,
              'utg_data': utg_data,
              'ctg_paths': ctg_paths}

    read_to_contig_map = makePypeLocalFile(os.path.join(
        read_map_dir, 'get_ctg_read_map', 'read_to_contig_map'))

    task = PypeTask(
        inputs=inputs,
        outputs={'read_to_contig_map': read_to_contig_map},
    )
    wf.addTask(task(pype_tasks.task_generate_read_to_ctg_map))

    wf.refreshTargets()  # block
def main(argv=sys.argv):
    global LOG
    LOG = support.setup_logger(None)


    if len(sys.argv) < 2:
        print>>sys.stderr, 'you need to provide a configuration file to specific a couple cluster running environment'
        sys.exit(1)

    config_fn = sys.argv[1]
    config_absbasedir = os.path.dirname(os.path.abspath(config_fn))

    config = ConfigParser.ConfigParser()
    config.read(config_fn)


    job_type = 'SGE'
    if config.has_option('General', 'job_type'):
        job_type = config.get('General', 'job_type')

    job_queue = 'default'
    if config.has_option('General', 'job_queue'):
        job_queue = config.get('General', 'job_queue')

    pwatcher_type = 'fs_based'
    if config.has_option('General', 'pwatcher_type'):
        pwatcher_type = config.get('General', 'pwatcher_type')

    sge_track_reads = ' -pe smp 12 -q bigmem'
    if config.has_option('Unzip', 'sge_track_reads'):
        sge_track_reads = config.get('Unzip', 'sge_track_reads')

    sge_quiver = ' -pe smp 24 -q bigmem '
    if config.has_option('Unzip', 'sge_quiver'):
        sge_quiver = config.get('Unzip', 'sge_quiver')

    smrt_bin = '/mnt/secondary/builds/full/3.0.0/prod/smrtanalysis_3.0.0.153854/smrtcmds/bin/'
    if config.has_option('Unzip', 'smrt_bin'):
        smrt_bin = config.get('Unzip', 'smrt_bin')

    input_bam_fofn = 'input_bam.fofn'
    if config.has_option('Unzip', 'input_bam_fofn'):
        input_bam_fofn = config.get('Unzip', 'input_bam_fofn')
    if not os.path.isabs(input_bam_fofn):
        input_bam_fofn = os.path.join(config_absbasedir, input_bam_fofn)


    quiver_concurrent_jobs = 8
    if config.has_option('Unzip', 'quiver_concurrent_jobs'):
        quiver_concurrent_jobs = config.getint('Unzip', 'quiver_concurrent_jobs')

    config = {'job_type': job_type,
              'job_queue': job_queue,
              'sge_quiver': sge_quiver,
              'sge_track_reads': sge_track_reads,
              'input_bam_fofn': input_bam_fofn,
              'pwatcher_type': pwatcher_type,
              'smrt_bin': smrt_bin}
    LOG.info('config={}'.format(pprint.pformat(config)))

    #support.job_type = 'SGE' #tmp hack until we have a configuration parser


    wf = PypeProcWatcherWorkflow(
            max_jobs=quiver_concurrent_jobs,
            job_type=config['job_type'],
            job_queue=config.get('job_queue'),
            sge_option=config.get('sge_option'),
            watcher_type=config.get('pwatcher_type'),
            #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
            use_tmpdir=config.get('use_tmpdir'),
    )

    abscwd = os.path.abspath('.')
    parameters = {
            'sge_option': config['sge_track_reads'],
    }
    input_bam_fofn_fn = config['input_bam_fofn']
    input_bam_fofn_plf = makePypeLocalFile(input_bam_fofn_fn)
    hasm_done_plf = makePypeLocalFile('./3-unzip/1-hasm/hasm_done') # by convention
    track_reads_h_done_plf = makePypeLocalFile('./4-quiver/reads/track_reads_h_done')
    make_track_reads_task = PypeTask(inputs = {
                                       'input_bam_fofn': input_bam_fofn_plf,
                                       'hasm_done': hasm_done_plf},
                                     outputs = {'job_done': track_reads_h_done_plf},
                                     parameters = parameters,
    )
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)

    scattered_quiver_plf = makePypeLocalFile('4-quiver/quiver_scatter/scattered.json')
    parameters = {
            'config': config,
    }
    make_task = PypeTask(
            inputs = {
                'p_ctg_fa': makePypeLocalFile('3-unzip/all_p_ctg.fa'),
                'h_ctg_fa': makePypeLocalFile('3-unzip/all_h_ctg.fa'),
                'track_reads_h_done': track_reads_h_done_plf,
            },
            outputs = {
                'scattered_quiver_json': scattered_quiver_plf,
            },
            parameters = parameters,
    )
    wf.addTask(make_task(task_scatter_quiver))
    wf.refreshTargets()

    p_ctg_out, h_ctg_out, job_done_plfs = create_quiver_jobs(wf, scattered_quiver_plf)

    gathered_p_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/p_ctg.txt')
    gathered_h_ctg_plf = makePypeLocalFile('4-quiver/cns_gather/h_ctg.txt')
    gather_done_plf = makePypeLocalFile('4-quiver/cns_gather/job_done')
    mkdir('4-quiver/cns_gather')
    with open(fn(gathered_p_ctg_plf), 'w') as ifs:
        for cns_fasta_fn, cns_fastq_fn in sorted(p_ctg_out):
            ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))
    with open(fn(gathered_h_ctg_plf), 'w') as ifs:
        for cns_fasta_fn, cns_fastq_fn in sorted(h_ctg_out):
            ifs.write('{} {}\n'.format(cns_fasta_fn, cns_fastq_fn))

    make_task = PypeTask(
            inputs = job_done_plfs,
            outputs = {
                'job_done': gather_done_plf,
            },
            parameters = {},
    )
    wf.addTask(make_task(task_gather_quiver))
    wf.refreshTargets()

    cns_p_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fasta')
    cns_p_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_p_ctg.fastq')
    cns_h_ctg_fasta_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fasta')
    cns_h_ctg_fastq_plf = makePypeLocalFile('4-quiver/cns_output/cns_h_ctg.fastq')
    zcat_done_plf = makePypeLocalFile('4-quiver/cns_output/job_done')
    make_task = PypeTask(
            inputs = {
                'gathered_p_ctg': gathered_p_ctg_plf,
                'gathered_h_ctg': gathered_h_ctg_plf,
                'gather_done': gather_done_plf,
            },
            outputs = {
                'cns_p_ctg_fasta': cns_p_ctg_fasta_plf,
                'cns_p_ctg_fastq': cns_p_ctg_fastq_plf,
                'cns_h_ctg_fasta': cns_h_ctg_fasta_plf,
                'cns_h_ctg_fastq': cns_h_ctg_fastq_plf,
                'job_done': zcat_done_plf,
            },
    )
    wf.addTask(make_task(task_cns_zcat))

    wf.refreshTargets()
Example #16
0
def unzip_all(config):
    unzip_blasr_concurrent_jobs = config['unzip_blasr_concurrent_jobs']
    unzip_phasing_concurrent_jobs = config['unzip_phasing_concurrent_jobs']
    wf = PypeProcWatcherWorkflow(
            max_jobs=unzip_blasr_concurrent_jobs,
            job_type=config['job_type'],
            job_queue=config.get('job_queue'),
            sge_option=config.get('sge_option'),
            watcher_type=config.get('pwatcher_type'),
            #watcher_directory=config.get('pwatcher_directory', 'mypwatcher'),
            use_tmpdir=config.get('use_tmpdir'),
    )

    ctg_list_file = makePypeLocalFile('./3-unzip/reads/ctg_list')
    falcon_asm_done = makePypeLocalFile('./2-asm-falcon/falcon_asm_done')
    wdir = os.path.abspath('./3-unzip/reads')
    parameters = {'wd': wdir, 'config': config,
            'sge_option': config['sge_track_reads'],
    }
    job_done = makePypeLocalFile(os.path.join(parameters['wd'], 'track_reads_done'))
    make_track_reads_task = PypeTask(inputs = {'falcon_asm_done': falcon_asm_done},
                                     outputs = {'job_done': job_done, 'ctg_list_file': ctg_list_file},
                                     parameters = parameters,
                                     wdir = wdir,
    )
    track_reads_task = make_track_reads_task(task_track_reads)

    wf.addTask(track_reads_task)
    wf.refreshTargets() #force refresh now, will put proper dependence later

    ctg_ids = []
    with open('./3-unzip/reads/ctg_list') as f:
        for row in f:
            row = row.strip()
            ctg_ids.append(row)

    aln1_outs = {}

    all_ctg_out = {}

    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
        read_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id = ctg_id))

        # outputs
        wd = os.path.join(os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id = ctg_id))
        #mkdir(wd)
        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(os.path.join(blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id = ctg_id)))
        job_done = makePypeLocalFile(os.path.join(blasr_dir, 'aln_{ctg_id}_done'.format(ctg_id = ctg_id)))

        parameters = {'job_uid':'aln-'+ctg_id, 'wd': blasr_dir, 'config':config, 'ctg_id': ctg_id,
                'sge_option': config['sge_blasr_aln'],
        }
        make_blasr_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'read_fasta': read_fasta},
                                   outputs = {'ctg_aln_out': ctg_aln_out, 'job_done': job_done},
                                   parameters = parameters,
        )
        blasr_task = make_blasr_task(task_run_blasr)
        aln1_outs[ctg_id] = (ctg_aln_out, job_done)
        wf.addTask(blasr_task)
    wf.refreshTargets()

    wf.max_jobs = unzip_phasing_concurrent_jobs
    for ctg_id in ctg_ids:
        # inputs
        ref_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_ref.fa'.format(ctg_id = ctg_id))
        read_fasta = makePypeLocalFile('./3-unzip/reads/{ctg_id}_reads.fa'.format(ctg_id = ctg_id))

        # outputs
        wd = os.path.join(os.getcwd(), './3-unzip/0-phasing/{ctg_id}/'.format(ctg_id = ctg_id))

        blasr_dir = os.path.join(wd, 'blasr')
        ctg_aln_out = makePypeLocalFile(os.path.join(blasr_dir, '{ctg_id}_sorted.bam'.format(ctg_id = ctg_id)))

        phasing_dir = os.path.join(wd, 'phasing')
        job_done = makePypeLocalFile(os.path.join(phasing_dir, 'p_{ctg_id}_done'.format(ctg_id = ctg_id)))
        rid_to_phase_out = makePypeLocalFile(os.path.join(wd, 'rid_to_phase.{ctg_id}'.format(ctg_id = ctg_id))) # TODO: ???
        all_ctg_out[ 'r2p.{ctg_id}'.format(ctg_id = ctg_id) ] = rid_to_phase_out # implicit output?

        parameters = {'job_uid':'ha-'+ctg_id, 'wd': wd, 'config':config, 'ctg_id': ctg_id,
                'sge_option': config['sge_phasing'],
        }
        make_phasing_task = PypeTask(inputs = {'ref_fasta': ref_fasta, 'aln_bam':ctg_aln_out},
                                   outputs = {'job_done': job_done},
                                   parameters = parameters,
        )
        phasing_task = make_phasing_task(task_phasing)
        wf.addTask(phasing_task)
    wf.refreshTargets()

    hasm_wd = os.path.abspath('./3-unzip/1-hasm/')
    #mkdir(hasm_wd)
    rid_to_phase_all = makePypeLocalFile(os.path.join(hasm_wd, 'rid-to-phase-all', 'rid_to_phase.all'))
    task = PypeTask(inputs = all_ctg_out, outputs = {'rid_to_phase_all': rid_to_phase_all},
    ) (get_rid_to_phase_all)
    wf.addTask(task)

    parameters['wd'] = hasm_wd
    parameters['sge_option'] = config['sge_hasm']
    job_done = makePypeLocalFile(os.path.join(hasm_wd, 'hasm_done'))
    make_hasm_task = PypeTask(inputs = {'rid_to_phase_all': rid_to_phase_all},
                              outputs = {'job_done': job_done},
                              parameters = parameters,
    )
    hasm_task = make_hasm_task(task_hasm)

    wf.addTask(hasm_task)

    wf.refreshTargets()