Example #1
from cosmos.api import find, out_dir
from genomekey.api import can_stream, get_env
s = get_env().config


def list_to_input(l):
    """Render Picard's repeated INPUT= args: ['a.bam', 'b.bam'] -> 'INPUT=a.bam INPUT=b.bam'."""
    return " ".join('INPUT=%s' % x for x in l)


def picard(time_req=8 * 60, mem_req=3 * 1024, extra_java_args=''):
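    # time_req/mem_req look like resource hints consumed by the pipeline
    # scheduler (an assumption); only ~80% of mem_req is given to the JVM heap.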
    return 'java{extra_java_args} ' \
           '-Xmx{mem_req2}m -Djava.io.tmpdir={s[gk][tmp_dir]} ' \
           '-Dsnappy.loader.verbosity=true ' \
           '-jar {s[opt][picard]}'.format(s=s,
                                          mem_req2=int(mem_req * .8),
                                          **locals())


# @can_stream([''])
def mark_duplicates(
    core_req=4,  # for scratch space
    mem_req=12 * 1024,
    in_bams=find('bam$', n='>=1'),
    in_bais=find('bai$', n='>=1'),
    out_bam=out_dir('deduped.bam'),
    out_bai=out_dir('deduped.bam.bai'),
    out_metrics=out_dir('deduped.metrics')):
    return r"""
        {picard} MarkDuplicates \
        {inputs} \
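
A quick sketch of what picard() renders; the tmp dir and jar path come from the genomekey config, so the values shown are hypothetical:

print(picard(mem_req=4 * 1024))
# java -Xmx3276m -Djava.io.tmpdir=/mnt/scratch/tmp -Dsnappy.loader.verbosity=true -jar /opt/picard.jar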
Example #2
from cosmos.api import find, out_dir
from genomekey.api import get_env
s = get_env().config


def merge(in_bams=find('bam$', n='>0'),
          out_bam=out_dir('merged.bam')):
    if len(in_bams) == 1:
        # Can't merge 1 bam, just copy it
        return r"""
        cp {in_bams[0]} {out_bam}
        """.format(**locals())
    else:
        in_bams = ' '.join(map(str, in_bams))
        return r"""
            {s[opt][samtools]} merge -f {out_bam} {in_bams}
        """.format(s=s, **locals())


def view(f, in_bam=find('bam$'), out_bam=out_dir('reads.bam')):
    # f: bit flags passed to `samtools view -f` (keep only reads with all of these flag bits set)
    return '{s[opt][samtools]} view -f {f} -h {in_bam} > {out_bam}'.format(s=s,
                                                                           **locals())
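
merge() degrades to a plain cp when only one BAM is found, since samtools cannot merge a single file; a quick sketch of both branches (file names are hypothetical, and the samtools path comes from the config):

print(merge(in_bams=['a.bam'], out_bam='merged.bam'))
#   cp a.bam merged.bam
print(merge(in_bams=['a.bam', 'b.bam'], out_bam='merged.bam'))
#   /path/to/samtools merge -f merged.bam a.bam b.bam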
Example #3
from .util import parse_inputs
from genomekey.api import get_env, can_stream
from genomekey.aws.s3 import run as s3run
from genomekey.aws.s3 import cmd as s3cmd
from genomekey.aws.s3.cosmos_utils import make_s3_cmd_fxn_wrapper, shared_fs_cmd_fxn_wrapper

env = get_env()

from ..tools import bwa, picard, gatk, fastqc, bed, fastq, samtools
from . import util
# from genomekey.bin.fastq.split_fastq_file import get_split_paths

from cosmos.api import one2one, many2one, out_dir, group, load_input, Execution, make_dict, bash_call
from cosmos.core.cmd_fxn.signature import default_cmd_fxn_wrapper
import os
import math

opj = os.path.join
import subprocess as sp

FASTQ_MAX_CHUNK_SIZE = 2 ** 30 // 2  # 512 MiB (integer division keeps byte counts integral)
FASTQ_MAX_CHUNK_SIZE = 2 ** 20  # 1 MiB for testing; overrides the production value above


def mkdir(p, root_path=None):
    root_path = root_path if root_path else os.getcwd()
    sp.check_call('cd %s; mkdir -p %s' % (root_path, p), shell=True)
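
# mkdir() shells out with `cd <root_path>; mkdir -p <p>`, so relative paths are
# created under root_path (default: the current working directory), e.g.
# mkdir('analysis/bams', root_path='/scratch') runs: cd /scratch; mkdir -p analysis/bams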


def run_germline(execution, max_cores, max_attempts, target_bed, input_path=None, s3fs=None):
Example #4
# NOTE: this excerpt omits the module's own imports; it relies on funcsigs,
# decorator, jinja2, and os, plus helpers defined elsewhere in genomekey
# (opj, random_str, partition, parallel, s3cmd, Forward, NOOP, get_env).
import os

import decorator
import funcsigs
import jinja2


def make_s3_cmd_fxn_wrapper(s3_path):
    s = get_env().config
    def s3_cmd_fxn_wrapper(task, stage_name, input_map, output_map):
        """
        Create and cd into a tmp dir
        Create the task's output_dir
        Pull inputs from S3
        Run the command
        Push outputs to S3
        delete the tmp_dir
        """

        def wrapped(fxn, *args, **kwargs):
            """
            1) If an input starts with s3:// or s3_path is set, pull or stream it
            2) If s3_path is set, push outputs to S3
            """
            fxn_sig = funcsigs.signature(fxn)

            # for some reason decorator.decorator passes args instead of kwargs;
            # repopulate kwargs manually.
            for i, (k, parameter) in enumerate(fxn_sig.parameters.items()):
                kwargs[k] = args[i]

            # HANDLE INPUTS

            def process_input_map():
                # TODO this function should probably be refactored for readability
                def process_input_value(input_param_name, file_path):
                    """
                    :returns: s3_copy_command, new_input_value
                    """
                    can_stream = input_param_name in getattr(fxn, 'can_stream', [])
                    if file_path.startswith('s3://'):
                        local_path = 'tmp-%s__%s' % (random_str(6), os.path.basename(file_path))
                        s3_pull_path = file_path
                    else:
                        local_path = file_path
                        s3_pull_path = opj(s3_path, file_path)

                    if can_stream:
                        return None, s3cmd.stream_in(s3_pull_path)
                        # return 'mkfifo {fifo_path} && {cp} > {fifo_path}'.format(fifo_path=local_path, cp=s3cmd.cp(s3_pull_path, '-')), local_path
                    else:
                        # pull to cwd as a local file (it'll get deleted when $TMP_DIR is deleted)
                        return s3cmd.cp(s3_pull_path, local_path, chrom=task.tags.get('contig')), local_path

                s3_pull_all_inputs = []

                def skip_bai(input_value):
                    # since we are going to be slicing, we do not want the bai
                    return task.tags.get('contig') and input_value.endswith('.bai')

                for input_name, input_value in input_map.items():
                    if isinstance(input_value, list):
                        if skip_bai(input_value[0]): continue
                        cp_cmds, new_input_value_list = zip(*(process_input_value(input_name, iv) for iv in input_value))
                        s3_pull_all_inputs += cp_cmds
                        kwargs[input_name] = new_input_value_list
                    else:
                        if skip_bai(input_value): continue
                        cp_cmd, new_input_value = process_input_value(input_name, input_value)
                        s3_pull_all_inputs.append(cp_cmd)
                        kwargs[input_name] = new_input_value

                fifo_lines, pull_lines = partition(lambda cmd: cmd.startswith('mkfifo'), filter(bool, s3_pull_all_inputs))

                return list(fifo_lines), list(pull_lines)

            fifo_lines, s3_pull_cmds = process_input_map()
            # s3_pull_all_inputs_cmd = "\n".join('/usr/bin/time -f "s3 pull #{0} %E" {1}  2>&1 &'.format(i, l) for i, l in enumerate(s3_push_cmds)) + '\nwait' if len(
            # s3_push_cmds) else ''
            fifo_cmds = "\n".join('%s &' % l for l in fifo_lines)


            # HANDLE OUTPUTS

            def gen_pushes():
                if s3_path:
                    for output_name, output_vals in output_map.items():
                        if isinstance(fxn_sig.parameters[output_name].default, Forward):
                            # do not s3 push if this is a forwarded input
                            continue
                        else:
                            if not isinstance(output_vals, list):
                                output_vals = [output_vals]

                            for out_val in output_vals:
                                local_path = out_val
                                s3_push_path = opj(s3_path, out_val)
                                if output_name in getattr(fxn, 'can_stream', []):
                                    # yield 'mkfifo {fifo_path} && cat {fifo_path} | {cp}'.format(fifo_path=local_path, cp=s3cmd.cp('-', s3_push_path))
                                    kwargs[output_name] = s3cmd.stream_out(opj(s3_path, out_val))
                                    # do not push since we're streaming
                                else:
                                    yield s3cmd.cp(out_val, s3_push_path)

            fifo_lines, s3_push_cmds = partition(lambda cmd: cmd.startswith('mkfifo'), filter(bool, gen_pushes()))
            fifo_lines, s3_push_cmds = list(fifo_lines), list(s3_push_cmds)
            # s3_push_all_outputs = "\n".join('/usr/bin/time -f "s3 push #{0} %E" {1}  2>&1 &'.format(i, l) for i, l in enumerate(cp_lines)) + '\nwait' if len(
            # cp_lines) else ''
            fifo_cmds += "\n".join('%s &' % l for l in fifo_lines) + "\n" if len(fifo_lines) else ''

            r = fxn(**kwargs)
            # print prepend, r, append
            # if 'bwa' in fxn.__name__:
            # raise

            if r == NOOP:
                return NOOP
            else:
                return jinja2.Template("""#!/bin/bash
set -e
set -o pipefail

TMP_DIR=`mktemp -d --tmpdir={{tmp_dir}} {{stage_name}}_XXXXXXXXX`
trap "rm -rf $TMP_DIR" EXIT

echo "Running on host: `hostname`"
echo "Created temp dir: $TMP_DIR"
echo "Mount space before pull: `df -h |grep scratch`"

cd $TMP_DIR
{{make_output_dir}}

{{s3pull}}

echo "S3 Pulled data size:" `du -hs .`
echo "Mount space before after pull: `df -h |grep scratch`"

{{r}}

{{ s3push }}
""").render(tmp_dir=s['gk']['tmp_dir'],
            s3pull=parallel.render(cmds=s3_pull_cmds),
            s3push=parallel.render(cmds=s3_push_cmds),
            stage_name=stage_name,
            fifo_cmds=fifo_cmds,
            r=r,
            s3_pull_cmds=s3_pull_cmds,
            s3_push_cmds=s3_push_cmds,
            make_output_dir='mkdir -p %s\n' % task.output_dir if task.output_dir else '')

        return decorator.decorator(wrapped)


    return s3_cmd_fxn_wrapper
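
make_s3_cmd_fxn_wrapper() is a factory: given an S3 prefix, it returns the wrapper applied around each task's command function, so inputs are pulled from S3 before the command runs and outputs are pushed afterwards. A hedged sketch of how it might be selected, mirroring the s3fs parameter and the default_cmd_fxn_wrapper import in the earlier examples (the wiring itself is an assumption, not taken from the source):

# Hypothetical wiring -- names are assumptions, not from the original module:
s3fs = 's3://my-bucket/analysis1'
cmd_fxn_wrapper = make_s3_cmd_fxn_wrapper(s3fs) if s3fs else default_cmd_fxn_wrapper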
Example #5
def assert_references_exist():
    s = get_env().config
    for k, v in s['ref'].items():
        assert os.path.exists(v), 'Reference file missing! %s = %s' % (k, v)
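
assert_references_exist() assumes the config exposes a 'ref' section mapping reference names to local file paths; a hypothetical layout (names and paths are illustrative only):

# s['ref'] might look like:
#   {'reference_fasta': '/refs/human_g1k_v37.fasta',
#    'dbsnp': '/refs/dbsnp_138.b37.vcf'}
assert_references_exist()  # raises AssertionError naming the first missing file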