# Helpers that build Picard (java) command strings for the pipeline.
from cosmos.api import find, out_dir
from genomekey.api import can_stream
from genomekey.api import get_env

# Global pipeline configuration (tool paths, tmp dirs, ...).
s = get_env().config


def list_to_input(l):
    # Render a list of paths as space-separated Picard "INPUT=..." arguments.
    return " ".join('INPUT=%s' % x for x in l)


def picard(time_req=8 * 60, mem_req=3 * 1024, extra_java_args=''):
    """Return the java command prefix used to launch Picard.

    :param time_req: requested wall time in minutes; not referenced in the
        command string itself -- presumably consumed by the scheduler, TODO confirm.
    :param mem_req: requested memory in MB; the JVM heap (-Xmx) is capped at
        80% of it to leave headroom for non-heap memory.
    :param extra_java_args: extra text appended directly after "java".
    """
    return 'java{extra_java_args} ' \
           '-Xmx{mem_req2}m -Djava.io.tmpdir={s[gk][tmp_dir]} ' \
           '-Dsnappy.loader.verbosity=true ' \
           '-jar {s[opt][picard]}'.format(s=s, mem_req2=int(mem_req * .8), **locals())


# @can_stream([''])
def mark_duplicates(core_req=4,  # for scratch space
                    mem_req=12 * 1024,
                    in_bams=find('bam$', n='>=1'),
                    in_bais=find('bai$', n='>=1'),
                    out_bam=out_dir('deduped.bam'),
                    out_bai=out_dir('deduped.bam.bai'),
                    out_metrics=out_dir('deduped.metrics')):
    # Build the Picard MarkDuplicates shell command.
    # NOTE(review): SOURCE is truncated inside the raw string below -- the
    # remainder of mark_duplicates is not visible in this chunk.
    return r"""
        {picard} MarkDuplicates \
        {inputs} \
# Shell-command builders around samtools.
from cosmos.api import find, out_dir
from genomekey.api import get_env

s = get_env().config


def merge(in_bams=find('bam$', n='>0'), out_bam=out_dir('merged.bam')):
    """Return a shell command that merges ``in_bams`` into ``out_bam``.

    samtools cannot merge a single bam, so that case degenerates into a
    plain ``cp`` of the lone input.
    """
    if len(in_bams) == 1:
        # Can't merge 1 bam, just copy it
        return r"""
            cp {in_bams[0]} {out_bam}
            """.format(**locals())

    in_bams = ' '.join(str(bam) for bam in in_bams)
    return r"""
            {s[opt][samtools]} merge -f {out_bam} {in_bams}
            """.format(s=s, **locals())


def view(f, in_bam=find('bam$'), out_bam=out_dir('reads.bam')):
    """Return a ``samtools view`` command filtering ``in_bam`` by flag ``f``."""
    template = '{s[opt][samtools]} view -f {f} -h {in_bam} > {out_bam}'
    return template.format(s=s, **locals())
# Germline variant-calling workflow wiring (cosmos Execution driver).
from .util import parse_inputs
from genomekey.api import get_env, make_s3_cmd_fxn_wrapper, s3cmd, s3run, shared_fs_cmd_fxn_wrapper

env = get_env()

from ..tools import bwa, picard, gatk, fastqc, bed, fastq, samtools
from . import util
# from genomekey.bin.fastq.split_fastq_file import get_split_paths
from genomekey.aws.s3 import cmd as s3cmd
from cosmos.api import one2one, many2one, out_dir, group, load_input, Execution, make_dict, bash_call
from cosmos.core.cmd_fxn.signature import default_cmd_fxn_wrapper
import os
import math

opj = os.path.join
import subprocess as sp

FASTQ_MAX_CHUNK_SIZE = 2**30 / 2
# NOTE(review): the next line overrides the 512MB value above with 1MB --
# looks like a leftover testing override; confirm before production use.
FASTQ_MAX_CHUNK_SIZE = 2**20  # 1Mb for testing


def mkdir(p, root_path=None):
    # Create directory `p` (and parents) under root_path (cwd by default).
    # NOTE(review): p and root_path are interpolated into a shell=True
    # command -- unsafe for untrusted paths; os.makedirs would be safer.
    root_path = root_path if root_path else os.getcwd()
    sp.check_call('cd %s; mkdir -p %s' % (root_path, p), shell=True)


def run_germline(execution, max_cores, max_attempts, target_bed,
    # NOTE(review): SOURCE is truncated here -- the rest of run_germline's
    # signature and its entire body are not visible in this chunk.
# Germline variant-calling workflow wiring (S3-aware variant of this module).
from .util import parse_inputs
from genomekey.api import get_env, can_stream
from genomekey.aws.s3 import run as s3run
from genomekey.aws.s3 import cmd as s3cmd
from genomekey.aws.s3.cosmos_utils import make_s3_cmd_fxn_wrapper, shared_fs_cmd_fxn_wrapper

env = get_env()

from ..tools import bwa, picard, gatk, fastqc, bed, fastq, samtools
from . import util
# from genomekey.bin.fastq.split_fastq_file import get_split_paths
from genomekey.aws.s3 import cmd as s3cmd
from cosmos.api import one2one, many2one, out_dir, group, load_input, Execution, make_dict, bash_call
from cosmos.core.cmd_fxn.signature import default_cmd_fxn_wrapper
import os
import math

opj = os.path.join
import subprocess as sp

FASTQ_MAX_CHUNK_SIZE = 2 ** 30 / 2
# NOTE(review): the next line overrides the 512MB value above with 1MB --
# looks like a leftover testing override; confirm before production use.
FASTQ_MAX_CHUNK_SIZE = 2 ** 20  # 1Mb for testing


def mkdir(p, root_path=None):
    # Create directory `p` (and parents) under root_path (cwd by default).
    # NOTE(review): p and root_path are interpolated into a shell=True
    # command -- unsafe for untrusted paths; os.makedirs would be safer.
    root_path = root_path if root_path else os.getcwd()
    sp.check_call('cd %s; mkdir -p %s' % (root_path, p), shell=True)


def run_germline(execution, max_cores, max_attempts, target_bed,
                 input_path=None, s3fs=None):
    # NOTE(review): SOURCE is truncated here -- run_germline's body is not
    # visible in this chunk.
def make_s3_cmd_fxn_wrapper(s3_path):
    """Build a cosmos cmd_fxn wrapper that stages a task's I/O through S3.

    The returned wrapper converts a tool function's command string into a
    bash script that creates a temp dir, pulls (or streams) inputs rooted at
    ``s3_path``, runs the command, and pushes outputs back to ``s3_path``.
    """
    s = get_env().config

    def s3_cmd_fxn_wrapper(task, stage_name, input_map, output_map):
        """
        Create and cd into a tmp dir
        Create the task's output_dir
        Pull inputs from S3
        Run the command
        Push outputs to S3
        delete the tmp_dir
        """

        def wrapped(fxn, *args, **kwargs):
            """
            1) If input starts with s3:// or s3_path is set, pull or stream
            3) If s3_path is set, push outputs to S3
            """
            fxn_sig = funcsigs.signature(fxn)

            # for some reason decorator.decorator is using args instead of kwargs..
            # repopulate kwargs manually.
            for i, (k, parameter) in enumerate(fxn_sig.parameters.items()):
                kwargs[k] = args[i]

            # HANDLE INPUTS
            def process_input_map():
                # TODO this function should probably be refactored for readability
                def process_input_value(input_param_name, file_path):
                    """
                    :returns: s3_copy_command, new_input_value
                    """
                    # Streaming is opt-in per parameter via fxn.can_stream.
                    can_stream = input_param_name in getattr(fxn, 'can_stream', [])
                    if file_path.startswith('s3://'):
                        # Absolute S3 URI: pull to a uniquely named local file.
                        local_path = 'tmp-%s__%s' % (random_str(6), os.path.basename(file_path))
                        s3_pull_path = file_path
                    else:
                        # Relative path: resolve against the s3_path root.
                        local_path = file_path
                        s3_pull_path = opj(s3_path, file_path)
                    if can_stream:
                        # No pull command needed; the tool reads straight from S3.
                        return None, s3cmd.stream_in(s3_pull_path)
                        # return 'mkfifo {fifo_path} && {cp} > {fifo_path}'.format(fifo_path=local_path, cp=s3cmd.cp(s3_pull_path, '-')), local_path
                    else:
                        # pull to cwd as a s3_pull_path file (it'll get deleted when $TMP_DIR is deleted)
                        return s3cmd.cp(s3_pull_path, local_path, chrom=task.tags.get('contig')), local_path

                s3_pull_all_inputs = []

                def skip_bai(input_value):
                    # since we are going to be slicing, we do not want the bai
                    return task.tags.get('contig') and input_value.endswith('.bai')

                for input_name, input_value in input_map.items():
                    if isinstance(input_value, list):
                        if skip_bai(input_value[0]):
                            continue
                        cp_cmds, new_input_value_list = zip(*(process_input_value(input_name, iv) for iv in input_value))
                        s3_pull_all_inputs += cp_cmds
                        kwargs[input_name] = new_input_value_list
                    else:
                        if skip_bai(input_value):
                            continue
                        cp_cmd, new_input_value = process_input_value(input_name, input_value)
                        s3_pull_all_inputs.append(cp_cmd)
                        kwargs[input_name] = new_input_value

                # Separate fifo-setup lines from plain pull commands.
                fifo_lines, pull_lines = partition(lambda cmd: cmd.startswith('mkfifo'), filter(bool, s3_pull_all_inputs))
                return list(fifo_lines), list(pull_lines)

            fifo_lines, s3_pull_cmds = process_input_map()
            # s3_pull_all_inputs_cmd = "\n".join('/usr/bin/time -f "s3 pull #{0} %E" {1} 2>&1 &'.format(i, l) for i, l in enumerate(s3_push_cmds)) + '\nwait' if len(
            # s3_push_cmds) else ''
            fifo_cmds = "\n".join('%s &' % l for l in fifo_lines)
            # NOTE(review): fifo_cmds is built and passed to render() below, but
            # the template text never references it -- confirm whether the fifo
            # path is dead code.

            # HANDLE OUTPUTS
            def gen_pushes():
                if s3_path:
                    for output_name, output_vals in output_map.items():
                        if isinstance(fxn_sig.parameters[output_name].default, Forward):
                            # do not s3 push if this is a forwarded input
                            continue
                        else:
                            if not isinstance(output_vals, list):
                                output_vals = [output_vals]
                            for out_val in output_vals:
                                local_path = out_val
                                s3_push_path = opj(s3_path, out_val)
                                if output_name in getattr(fxn, 'can_stream', []):
                                    # yield 'mkfifo {fifo_path} && cat {fifo_path} | {cp}'.format(fifo_path=local_path, cp=s3cmd.cp('-', s3_push_path))
                                    kwargs[output_name] = s3cmd.stream_out(opj(s3_path, out_val))  # do not push since we're streaming
                                else:
                                    yield s3cmd.cp(out_val, s3_push_path)

            fifo_lines, s3_push_cmds = partition(lambda cmd: cmd.startswith('mkfifo'), filter(bool, gen_pushes()))
            fifo_lines, s3_push_cmds = list(fifo_lines), list(s3_push_cmds)

            # s3_push_all_outputs = "\n".join('/usr/bin/time -f "s3 push #{0} %E" {1} 2>&1 &'.format(i, l) for i, l in enumerate(cp_lines)) + '\nwait' if len(
            # cp_lines) else ''
            fifo_cmds += "\n".join('%s &' % l for l in fifo_lines) + "\n" if len(fifo_lines) else ''

            # Call the tool function with the rewritten (localized/streamed) paths.
            r = fxn(**kwargs)
            # print prepend, r, append
            # if 'bwa' in fxn.__name__:
            # raise
            if r == NOOP:
                return NOOP
            else:
                # Wrap the command in a bash script that stages I/O and cleans up
                # its temp dir on EXIT.
                return jinja2.Template("""#!/bin/bash
set -e
set -o pipefail
TMP_DIR=`mktemp -d --tmpdir={{tmp_dir}} {{stage_name}}_XXXXXXXXX`
trap "rm -rf $TMP_DIR" EXIT
echo "Running on host: `hostname`"
echo "Created temp dir: $TMP_DIR"
echo "Mount space before pull: `df -h |grep scratch`"
cd $TMP_DIR
{{make_output_dir}}
{{s3pull}}
echo "S3 Pulled data size:" `du -hs .`
echo "Mount space before after pull: `df -h |grep scratch`"
{{r}}
{{ s3push }}
""").render(tmp_dir=s['gk']['tmp_dir'],
                            s3pull=parallel.render(cmds=s3_pull_cmds),
                            s3push=parallel.render(cmds=s3_push_cmds),
                            stage_name=stage_name,
                            fifo_cmds=fifo_cmds,
                            r=r,
                            s3_pull_cmds=s3_pull_cmds,
                            s3_push_cmds=s3_push_cmds,
                            make_output_dir='mkdir -p %s\n' % task.output_dir if task.output_dir and task.output_dir != '' else '')

        return decorator.decorator(wrapped)

    return s3_cmd_fxn_wrapper
def assert_references_exist():
    """Fail fast (AssertionError) if any configured reference file is absent."""
    config = get_env().config
    for name, path in config['ref'].items():
        assert os.path.exists(path), 'Reference file missing! %s = %s' % (name, path)
def make_s3_cmd_fxn_wrapper(s3_path):
    """Build a cosmos cmd_fxn wrapper that stages a task's I/O through S3.

    The returned wrapper converts a tool function's command string into a
    bash script that creates a temp dir, pulls (or streams) inputs rooted at
    ``s3_path``, runs the command, and pushes outputs back to ``s3_path``.
    """
    s = get_env().config

    def s3_cmd_fxn_wrapper(task, stage_name, input_map, output_map):
        """
        Create and cd into a tmp dir
        Create the task's output_dir
        Pull inputs from S3
        Run the command
        Push outputs to S3
        delete the tmp_dir
        """

        def wrapped(fxn, *args, **kwargs):
            """
            1) If input starts with s3:// or s3_path is set, pull or stream
            3) If s3_path is set, push outputs to S3
            """
            fxn_sig = funcsigs.signature(fxn)

            # for some reason decorator.decorator is using args instead of kwargs..
            # repopulate kwargs manually.
            for i, (k, parameter) in enumerate(fxn_sig.parameters.items()):
                kwargs[k] = args[i]

            # HANDLE INPUTS
            def process_input_map():
                # TODO this function should probably be refactored for readability
                def process_input_value(input_param_name, file_path):
                    """
                    :returns: s3_copy_command, new_input_value
                    """
                    # Streaming is opt-in per parameter via fxn.can_stream.
                    can_stream = input_param_name in getattr(
                        fxn, 'can_stream', [])
                    if file_path.startswith('s3://'):
                        # Absolute S3 URI: pull to a uniquely named local file.
                        local_path = 'tmp-%s__%s' % (
                            random_str(6), os.path.basename(file_path))
                        s3_pull_path = file_path
                    else:
                        # Relative path: resolve against the s3_path root.
                        local_path = file_path
                        s3_pull_path = opj(s3_path, file_path)
                    if can_stream:
                        # No pull command needed; the tool reads straight from S3.
                        return None, s3cmd.stream_in(s3_pull_path)
                        # return 'mkfifo {fifo_path} && {cp} > {fifo_path}'.format(fifo_path=local_path, cp=s3cmd.cp(s3_pull_path, '-')), local_path
                    else:
                        # pull to cwd as a s3_pull_path file (it'll get deleted when $TMP_DIR is deleted)
                        return s3cmd.cp(
                            s3_pull_path, local_path,
                            chrom=task.tags.get('contig')), local_path

                s3_pull_all_inputs = []

                def skip_bai(input_value):
                    # since we are going to be slicing, we do not want the bai
                    return task.tags.get('contig') and input_value.endswith(
                        '.bai')

                for input_name, input_value in input_map.items():
                    if isinstance(input_value, list):
                        if skip_bai(input_value[0]):
                            continue
                        cp_cmds, new_input_value_list = zip(
                            *(process_input_value(input_name, iv) for iv in input_value))
                        s3_pull_all_inputs += cp_cmds
                        kwargs[input_name] = new_input_value_list
                    else:
                        if skip_bai(input_value):
                            continue
                        cp_cmd, new_input_value = process_input_value(
                            input_name, input_value)
                        s3_pull_all_inputs.append(cp_cmd)
                        kwargs[input_name] = new_input_value

                # Separate fifo-setup lines from plain pull commands.
                fifo_lines, pull_lines = partition(
                    lambda cmd: cmd.startswith('mkfifo'),
                    filter(bool, s3_pull_all_inputs))
                return list(fifo_lines), list(pull_lines)

            fifo_lines, s3_pull_cmds = process_input_map()
            # s3_pull_all_inputs_cmd = "\n".join('/usr/bin/time -f "s3 pull #{0} %E" {1} 2>&1 &'.format(i, l) for i, l in enumerate(s3_push_cmds)) + '\nwait' if len(
            # s3_push_cmds) else ''
            fifo_cmds = "\n".join('%s &' % l for l in fifo_lines)
            # NOTE(review): fifo_cmds is built and passed to render() below, but
            # the template text never references it -- confirm whether the fifo
            # path is dead code.

            # HANDLE OUTPUTS
            def gen_pushes():
                if s3_path:
                    for output_name, output_vals in output_map.items():
                        if isinstance(fxn_sig.parameters[output_name].default, Forward):
                            # do not s3 push if this is a forwarded input
                            continue
                        else:
                            if not isinstance(output_vals, list):
                                output_vals = [output_vals]
                            for out_val in output_vals:
                                local_path = out_val
                                s3_push_path = opj(s3_path, out_val)
                                if output_name in getattr(
                                        fxn, 'can_stream', []):
                                    # yield 'mkfifo {fifo_path} && cat {fifo_path} | {cp}'.format(fifo_path=local_path, cp=s3cmd.cp('-', s3_push_path))
                                    kwargs[output_name] = s3cmd.stream_out(
                                        opj(s3_path, out_val))  # do not push since we're streaming
                                else:
                                    yield s3cmd.cp(out_val, s3_push_path)

            fifo_lines, s3_push_cmds = partition(
                lambda cmd: cmd.startswith('mkfifo'),
                filter(bool, gen_pushes()))
            fifo_lines, s3_push_cmds = list(fifo_lines), list(s3_push_cmds)

            # s3_push_all_outputs = "\n".join('/usr/bin/time -f "s3 push #{0} %E" {1} 2>&1 &'.format(i, l) for i, l in enumerate(cp_lines)) + '\nwait' if len(
            # cp_lines) else ''
            fifo_cmds += "\n".join(
                '%s &' % l for l in fifo_lines) + "\n" if len(fifo_lines) else ''

            # Call the tool function with the rewritten (localized/streamed) paths.
            r = fxn(**kwargs)
            # print prepend, r, append
            # if 'bwa' in fxn.__name__:
            # raise
            if r == NOOP:
                return NOOP
            else:
                # Wrap the command in a bash script that stages I/O and cleans up
                # its temp dir on EXIT.
                return jinja2.Template("""#!/bin/bash
set -e
set -o pipefail
TMP_DIR=`mktemp -d --tmpdir={{tmp_dir}} {{stage_name}}_XXXXXXXXX`
trap "rm -rf $TMP_DIR" EXIT
echo "Running on host: `hostname`"
echo "Created temp dir: $TMP_DIR"
echo "Mount space before pull: `df -h |grep scratch`"
cd $TMP_DIR
{{make_output_dir}}
{{s3pull}}
echo "S3 Pulled data size:" `du -hs .`
echo "Mount space before after pull: `df -h |grep scratch`"
{{r}}
{{ s3push }}
""").render(tmp_dir=s['gk']['tmp_dir'],
                            s3pull=parallel.render(cmds=s3_pull_cmds),
                            s3push=parallel.render(cmds=s3_push_cmds),
                            stage_name=stage_name,
                            fifo_cmds=fifo_cmds,
                            r=r,
                            s3_pull_cmds=s3_pull_cmds,
                            s3_push_cmds=s3_push_cmds,
                            make_output_dir='mkdir -p %s\n' % task.output_dir if task.output_dir and task.output_dir != '' else '')

        return decorator.decorator(wrapped)

    return s3_cmd_fxn_wrapper
def assert_references_exist():
    """Verify every path configured under ``config['ref']`` exists on disk.

    Raises AssertionError naming the first missing reference file.
    """
    refs = get_env().config['ref']
    for key in refs:
        value = refs[key]
        assert os.path.exists(value), 'Reference file missing! %s = %s' % (key, value)