Ejemplo n.º 1
0
def merge_split(
    las_paths_fn,
    dbname="raw_reads",
):
    """Group .las files by their first block number and stage one merge job per group.

    The paths are loaded with ``io_io.deserialize(las_paths_fn)``. Each path
    must end in ``<dbname>.<A>.<dbname>.<B>.las``; paths are grouped by ``A``.

    For each group, in the order the block numbers were first seen, a
    directory ``merge-scripts/m_<index>`` is created containing:

    * ``las-paths.json`` -- the serialized list of paths for that block;
    * ``las_fn`` -- the text ``L<dbname>.<A>.las`` (no trailing newline).

    Args:
        las_paths_fn: File holding the serialized collection of .las paths.
        dbname: Database name expected inside every .las filename.

    Raises:
        Exception: If a path does not match the expected filename pattern.
    """
    all_las_paths = io_io.deserialize(las_paths_fn)

    # Escape dbname so that regex metacharacters in it (e.g. '.') are matched
    # literally instead of loosening the pattern.
    re_las_pair = re.compile(
        r'{db}\.(\d+)\.{db}\.(\d+)\.las$'.format(db=re.escape(dbname)))
    las_map = collections.defaultdict(list)
    for path in all_las_paths:
        mo = re_las_pair.search(path)
        if not mo:
            msg = '{!r} does not match regex {!r}'.format(
                path, re_las_pair.pattern)
            raise Exception(msg)
        # Only the first block number is used for grouping.
        block = int(mo.group(1))
        las_map[block].append(path)

    for i, (block, block_las_paths) in enumerate(las_map.items()):
        job_id = 'm_{:05d}'.format(i)

        # Write the las files for this job.
        input_dir = os.path.join('merge-scripts', job_id)
        block_las_paths_fn = os.path.join('.', input_dir, 'las-paths.json')
        io_io.mkdirs(input_dir)
        io_io.serialize(block_las_paths_fn, block_las_paths)

        # Record the name of the merged .las file for this block.
        las_name = os.path.join('.', input_dir, 'las_fn')
        las_fn = '{}.{}.las'.format("L" + dbname, block)
        with open(las_name, "w") as f:
            f.write(las_fn)
Ejemplo n.º 2
0
def rep_split(las_paths_fn, group_size, coverage_limit):
    """Write one run_REPmask.sh per .las path listed in las_paths_fn.

    For foo.db, HPC.REPmask would produce rep-jobs.05.MASK lines like this:

    # REPmask jobs (n)
    REPmask -v -c30 -nrep1 foo foo.R1.@1-3
    REPmask -v -c30 -nrep1 foo foo.R1.@4-6
    ...

    (That's for level R1.)
    Here each job handles a single block, for simplicity.

    Args:
        las_paths_fn: File holding the serialized collection of .las paths
            (loaded with io_io.deserialize).
        group_size: Value substituted after '-nrep' on the REPmask line.
        coverage_limit: Value substituted after '-c' on the REPmask line.

    Each script is written to ./rep-scripts/rep_<NNN>/run_REPmask.sh, where
    NNN is the position of the path in the deserialized collection.
    The database name on the REPmask line is always "raw_reads".
    """

    def render(las_fn):
        # One block at a time: the same .las name feeds REPmask and the
        # (commented-out) cleanup line.
        return ''.join((
            'set -vex\n',
            'REPmask -v -c{} -nrep{} {} {}\n'.format(
                coverage_limit, group_size, "raw_reads", las_fn),
            '#rm -f {}\n'.format(las_fn),
        ))

    # Render every script before touching the filesystem.
    rendered = [render(las_fn) for las_fn in io_io.deserialize(las_paths_fn)]

    for index, content in enumerate(rendered):
        out_dir = os.path.join('.', 'rep-scripts', 'rep_{:03d}'.format(index))
        out_path = os.path.join(out_dir, 'run_REPmask.sh')
        io_io.mkdirs(out_dir)
        with open(out_path, 'w') as out:
            out.write('{}\n'.format(content))
Ejemplo n.º 3
0
def rep_daligner_split(REPmask_opt, db_fn, group_size, coverage_limit):
    """Write one run_daligner.sh per script, based on HPC.REPmask.

    Similar to daligner_split(), but based on HPC.REPmask instead of
    HPC.daligner. The script texts come from
    _get_rep_daligner_split_scripts(), which receives all four arguments
    unchanged. Script number i is written (followed by a newline) to
    ./rep-scripts/rep_<i, 4 digits>/run_daligner.sh.
    """
    job_scripts = _get_rep_daligner_split_scripts(
        REPmask_opt, db_fn, group_size, coverage_limit)

    for index, content in enumerate(job_scripts):
        out_dir = os.path.join('.', 'rep-scripts', 'rep_{:04d}'.format(index))
        out_path = os.path.join(out_dir, 'run_daligner.sh')
        io_io.mkdirs(out_dir)
        with open(out_path, 'w') as out:
            out.write('{}\n'.format(content))
Ejemplo n.º 4
0
def daligner_split(daligner_opt, db_fn, length_cutoff_fn):
    """Split the daligner work for a DB into one bash script per job.

    Steps:
      1. Build a script with script_HPC_daligner() (prefix 'daligner-jobs'),
         write it to 'split_db.sh' and run it with bash.
      2. Read 'daligner-jobs.01.OVL' from the current directory, dropping
         comment ('#') and blank lines, and pass each remaining line through
         the transformer returned by functional.get_script_xformer().
      3. If exactly one job line remains, rewrite the target of its trailing
         'mv' to '<dbname>.1.<dbname>.1.las'.
      4. Wrap each job in 'set -vex' plus a trailing 'LAcheck' and write it
         to ./daligner-scripts/j_<NNNN>/run_daligner.sh.

    Args:
        daligner_opt: Passed through to script_HPC_daligner().
        db_fn: Path of the DB file; its extension is stripped to get the DB
            path, and the basename of that is the DB name.
        length_cutoff_fn: Passed through to script_HPC_daligner().
    """
    db = os.path.splitext(db_fn)[0]
    dbname = os.path.basename(db)

    tracks = get_tracks(db_fn)

    script = ''.join([
        script_HPC_daligner(daligner_opt,
                            db,
                            length_cutoff_fn,
                            tracks,
                            prefix='daligner-jobs'),
    ])
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))

    # We now have files like daligner-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    with open('daligner-jobs.01.OVL') as ifs:
        lines = ifs.readlines()

    preads_aln = (dbname == 'preads')
    xformer = functional.get_script_xformer(preads_aln)
    LOG.debug('preads_aln={!r} (True => use daligner_p)'.format(preads_aln))

    # Skip comments and blank lines; use daligner_p for preads.
    scripts = [
        xformer(line) for line in lines
        if not line.startswith('#') and line.strip()
    ]

    # Special case:
    #     # Daligner jobs (1)
    #     daligner raw_reads raw_reads && mv raw_reads.raw_reads.las raw_reads.las
    # In that case, the "block" name is empty. (See functional.py)
    # We will rename the file. (LAmerge on a single input is a no-op, which is fine.)
    if len(scripts) == 1:
        script = scripts[0]
        # '$' also matches just before a trailing newline, so the newline
        # (if any) is left in place by the substitution.
        re_script = re.compile(
            r'(mv\b.*\S+\s+)(\S+)$')
        mo = re_script.search(script)
        if not mo:
            msg = 'Only 1 line in daligner-jobs.01.OVL, but\n {!r} did not match\n {!r}.'.format(
                re_script.pattern, script)
            LOG.warning(msg)
        else:
            new_las_fn = '{dbname}.1.{dbname}.1.las'.format(dbname=dbname)
            # A callable replacement keeps dbname literal. With a template
            # string, '\1' followed by a dbname starting with a digit would be
            # read as a different group number, and backslashes in dbname
            # would be interpreted as escapes.
            new_script = re_script.sub(
                lambda m: m.group(1) + new_las_fn, script, 1)
            msg = 'Only 1 line in daligner-jobs.01.OVL:\n {!r} matches\n {!r}. Replacing with\n {!r}.'.format(
                re_script.pattern, script, new_script)
            LOG.warning(msg)
            scripts = [new_script]

    LAcheck = 'LAcheck -vS {} *.las'.format(db)
    for i, script in enumerate(scripts):
        wrapped = "set -vex\n" + script + '\n' + LAcheck + '\n'
        job_id = 'j_{:04d}'.format(i)
        script_dir = os.path.join('.', 'daligner-scripts', job_id)
        script_fn = os.path.join(script_dir, 'run_daligner.sh')
        io_io.mkdirs(script_dir)
        with open(script_fn, 'w') as stream:
            stream.write(wrapped)
Ejemplo n.º 5
0
def tan_split(tanmask_opt, db_fn, uows_fn, bash_template_fn):
    """Split the TANmask work for a DB into one bash script per job line.

    Steps:
      1. Write a fixed command template to bash_template_fn.
      2. Symlink the DB via symlink_db(), build a script with
         script_HPC_TANmask() (prefix 'tan-jobs'), write it to 'split_db.sh'
         and run it with bash.
      3. Read 'tan-jobs.01.OVL' from the current directory. For every line
         that is neither a comment ('#') nor blank, emit the line itself
         followed by LAcheck, TANmask and 'rm -f' on the TAN.*.las files of
         the blocks named on that line.
      4. Write each job, prefixed with 'ln -sf' commands for the DB files,
         to ./tan-scripts/tan_<NNN>/run_datander.sh.

    Args:
        tanmask_opt: Passed through to script_HPC_TANmask().
        db_fn: Path of the DB file; its directory is used for the symlinks.
        uows_fn: Not used by this function.
        bash_template_fn: File that receives the command template.

    Raises:
        AssertionError: If a job line contains no reference to the DB.
    """
    with open(bash_template_fn, 'w') as stream:
        stream.write(
            "python -m falcon_kit.mains.dazzler --config={input.config} --db={input.db}  tan-split --split={output.split} --bash-template={output.bash_template}"
        )
    # TANmask would put track-files in the DB-directory, not '.',
    # so we need to symlink everything first.
    db = symlink_db(db_fn)

    script = ''.join([
        script_HPC_TANmask(tanmask_opt, db, prefix='tan-jobs'),
    ])
    script_fn = 'split_db.sh'
    with open(script_fn, 'w') as ofs:
        bash.write_sub_script(ofs, script)
    io_io.syscall('bash -vex {}'.format(script_fn))

    # We now have files like tan-jobs.01.OVL
    # We need to parse that one. (We ignore the others.)
    with open('tan-jobs.01.OVL') as ifs:
        lines = ifs.readlines()

    # Escape db so that regex metacharacters in it (e.g. '.') match literally.
    re_block = re.compile(r'{}(\.\d+|)'.format(re.escape(db)))

    def get_blocks(line):
        """Return ['.1', '.2', ...]
        """
        return [mo.group(1) for mo in re_block.finditer(line)]

    scripts = list()
    for line in lines:
        if line.startswith('#'):
            continue
        if not line.strip():
            continue
        blocks = get_blocks(line)
        assert blocks, 'No blocks found in {!r} from {!r}'.format(
            line, 'tan-jobs.01.OVL')
        las_files = ' '.join('TAN.{db}{block}.las'.format(db=db, block=block)
                             for block in blocks)
        script_lines = [
            line,
            'LAcheck {} {}\n'.format(db, las_files),
            'TANmask {} {}\n'.format(db, las_files),
            'rm -f {}\n'.format(las_files),
        ]
        if [''] == blocks:
            # special case -- If we have only 1 block, then HPC.TANmask fails to use the block-number.
            # However, if there are multiple blocks, it is still possible for a single line to have
            # only 1 block. So we look for a solitary block that is '', and we rename the .tan
            # track files to pretend that they were named properly in the first place.
            script_lines.append(
                'mv .{db}.tan.data .{db}.1.tan.data\n'.format(db=db))
            script_lines.append(
                'mv .{db}.tan.anno .{db}.1.tan.anno\n'.format(db=db))
        scripts.append(''.join(script_lines))
    db_dir = os.path.dirname(db_fn)

    # Loop-invariant wrapper: link the DB files into the job directory, then
    # run the job. NOTE(review): the link names use a fixed "raw_reads" prefix
    # rather than the name returned by symlink_db() -- confirm this is intended.
    bash_template = """
db_dir={db_dir}
ln -sf ${{db_dir}}/.{db_prefix}.bps .
ln -sf ${{db_dir}}/.{db_prefix}.idx .
ln -sf ${{db_dir}}/{db_prefix}.db .
ln -sf ${{db_dir}}/.{db_prefix}.dust.anno .
ln -sf ${{db_dir}}/.{db_prefix}.dust.data .
{script}
"""

    for i, script in enumerate(scripts):
        bash_script = bash_template.format(
            db_dir=db_dir, db_prefix="raw_reads", script=script)
        job_id = 'tan_{:03d}'.format(i)
        script_dir = os.path.join('.', 'tan-scripts', job_id)
        script_fn = os.path.join(script_dir, 'run_datander.sh')
        io_io.mkdirs(script_dir)
        with open(script_fn, 'w') as stream:
            stream.write('{}\n'.format(bash_script))