def process_input_map():  # TODO this function should probably be refactored for readability
    """Translate every entry of the enclosing ``input_map`` into s3 pull commands.

    Side effect: rewrites the enclosing ``kwargs`` so each input parameter
    points at the local (or streamed) location the generated script provides.

    :returns: (fifo_lines, pull_lines) -- mkfifo setup commands and plain
        s3 copy commands, each as a list of shell command strings.
    """
    def process_input_value(input_param_name, file_path):
        """
        :returns: s3_copy_command, new_input_value
        """
        # Parameters listed in fxn.can_stream are streamed instead of
        # being copied to local disk first.
        can_stream = input_param_name in getattr(fxn, 'can_stream', [])
        if file_path.startswith('s3://'):
            # Absolute s3 url: pull into a uniquely named temp file in cwd.
            local_path = 'tmp-%s__%s' % (random_str(6), os.path.basename(file_path))
            s3_pull_path = file_path
        else:
            # Relative path: resolve against the job-wide s3_path prefix.
            local_path = file_path
            s3_pull_path = opj(s3_path, file_path)
        if can_stream:
            # None => no copy command; the new value is a streaming expression.
            return None, s3cmd.stream_in(s3_pull_path)
            # return 'mkfifo {fifo_path} && {cp} > {fifo_path}'.format(fifo_path=local_path, cp=s3cmd.cp(s3_pull_path, '-')), local_path
        else:
            # pull to cwd as a s3_pull_path file (it'll get deleted when $TMP_DIR is deleted)
            return s3cmd.cp(s3_pull_path, local_path, chrom=task.tags.get('contig')), local_path

    s3_pull_all_inputs = []

    def skip_bai(input_value):
        # since we are going to be slicing, we do not want the bai
        return task.tags.get('contig') and input_value.endswith('.bai')

    for input_name, input_value in input_map.items():
        if isinstance(input_value, list):
            if skip_bai(input_value[0]):
                continue
            # zip(*...) transposes [(cmd, path), ...] into (cmds, paths).
            cp_cmds, new_input_value_list = zip(*(process_input_value(input_name, iv) for iv in input_value))
            s3_pull_all_inputs += cp_cmds
            kwargs[input_name] = new_input_value_list
        else:
            if skip_bai(input_value):
                continue
            cp_cmd, new_input_value = process_input_value(input_name, input_value)
            s3_pull_all_inputs.append(cp_cmd)
            kwargs[input_name] = new_input_value

    # filter(bool) drops the None commands contributed by streamed inputs,
    # then mkfifo setup lines are separated from plain pull commands.
    fifo_lines, pull_lines = partition(lambda cmd: cmd.startswith('mkfifo'), filter(bool, s3_pull_all_inputs))
    return list(fifo_lines), list(pull_lines)
def process_input_map():  # TODO this function should probably be refactored for readability
    """Build the s3 pull commands for every input in ``input_map``.

    Side effect: repoints each input entry in the enclosing ``kwargs`` at
    the local path (or streaming expression) the generated script will use.

    :returns: (fifo_lines, pull_lines) as two lists of shell command strings.
    """
    def localize(param_name, remote):
        """:returns: s3_copy_command, new_input_value"""
        if remote.startswith('s3://'):
            # Absolute s3 url: materialize under a collision-free temp name.
            target = 'tmp-%s__%s' % (random_str(6), os.path.basename(remote))
            source = remote
        else:
            # Relative path: anchor at the job-wide s3_path prefix.
            target = remote
            source = opj(s3_path, remote)
        if param_name in getattr(fxn, 'can_stream', []):
            # Streamed input: no copy command, the value is a stream expression.
            return None, s3cmd.stream_in(source)
            # return 'mkfifo {fifo_path} && {cp} > {fifo_path}'.format(fifo_path=target, cp=s3cmd.cp(source, '-')), target
        # pull to cwd as a s3_pull_path file (it'll get deleted when $TMP_DIR is deleted)
        return s3cmd.cp(source, target, chrom=task.tags.get('contig')), target

    def unwanted_bai(value):
        # since we are going to be slicing, we do not want the bai
        return task.tags.get('contig') and value.endswith('.bai')

    copy_cmds = []
    for name, value in input_map.items():
        if isinstance(value, list):
            if unwanted_bai(value[0]):
                continue
            pairs = [localize(name, item) for item in value]
            copy_cmds.extend(cmd for cmd, _ in pairs)
            kwargs[name] = tuple(path for _, path in pairs)
        else:
            if unwanted_bai(value):
                continue
            cmd, local_value = localize(name, value)
            copy_cmds.append(cmd)
            kwargs[name] = local_value

    # Streamed inputs contribute None commands; drop them, then split the
    # mkfifo setup lines from the plain pull commands.
    fifos, pulls = partition(lambda c: c.startswith('mkfifo'), filter(bool, copy_cmds))
    return list(fifos), list(pulls)
def wrapped(fxn, *args, **kwargs):
    """Wrap *fxn* so that its S3 file inputs/outputs are handled.

    1) If input starts with s3:// or s3_path is set, pull or stream
    2) If s3_path is set, push outputs to S3

    :returns: NOOP when fxn returns NOOP; otherwise a rendered bash script
        (string) that pulls inputs, runs the command fxn produced, and
        pushes outputs.
    """
    fxn_sig = funcsigs.signature(fxn)
    # for some reason decorator.decorator is using args instead of kwargs..
    # repopulate kwargs manually.
    for i, (k, parameter) in enumerate(fxn_sig.parameters.items()):
        kwargs[k] = args[i]

    # HANDLE INPUTS
    def process_input_map():  # TODO this function should probably be refactored for readability
        """Turn input_map entries into s3 pull commands; rewrites kwargs so
        each input parameter points at its local/streamed location.

        :returns: (fifo_lines, pull_lines) lists of shell command strings.
        """
        def process_input_value(input_param_name, file_path):
            """
            :returns: s3_copy_command, new_input_value
            """
            # Inputs named in fxn.can_stream are streamed rather than copied.
            can_stream = input_param_name in getattr(fxn, 'can_stream', [])
            if file_path.startswith('s3://'):
                # Absolute s3 url: pull into a uniquely named file in cwd.
                local_path = 'tmp-%s__%s' % (random_str(6), os.path.basename(file_path))
                s3_pull_path = file_path
            else:
                # Relative path: resolve against the job-wide s3_path prefix.
                local_path = file_path
                s3_pull_path = opj(s3_path, file_path)
            if can_stream:
                # None => no copy command; the value is a streaming expression.
                return None, s3cmd.stream_in(s3_pull_path)
                # return 'mkfifo {fifo_path} && {cp} > {fifo_path}'.format(fifo_path=local_path, cp=s3cmd.cp(s3_pull_path, '-')), local_path
            else:
                # pull to cwd as a s3_pull_path file (it'll get deleted when $TMP_DIR is deleted)
                return s3cmd.cp(s3_pull_path, local_path, chrom=task.tags.get('contig')), local_path

        s3_pull_all_inputs = []

        def skip_bai(input_value):
            # since we are going to be slicing, we do not want the bai
            return task.tags.get('contig') and input_value.endswith('.bai')

        for input_name, input_value in input_map.items():
            if isinstance(input_value, list):
                if skip_bai(input_value[0]):
                    continue
                # zip(*...) transposes [(cmd, path), ...] into (cmds, paths).
                cp_cmds, new_input_value_list = zip(*(process_input_value(input_name, iv) for iv in input_value))
                s3_pull_all_inputs += cp_cmds
                kwargs[input_name] = new_input_value_list
            else:
                if skip_bai(input_value):
                    continue
                cp_cmd, new_input_value = process_input_value(input_name, input_value)
                s3_pull_all_inputs.append(cp_cmd)
                kwargs[input_name] = new_input_value

        # Drop None entries (streamed inputs), then split mkfifo setup lines
        # from plain pull commands.
        fifo_lines, pull_lines = partition(lambda cmd: cmd.startswith('mkfifo'), filter(bool, s3_pull_all_inputs))
        return list(fifo_lines), list(pull_lines)

    fifo_lines, s3_pull_cmds = process_input_map()
    # s3_pull_all_inputs_cmd = "\n".join('/usr/bin/time -f "s3 pull #{0} %E" {1} 2>&1 &'.format(i, l) for i, l in enumerate(s3_push_cmds)) + '\nwait' if len(
    #     s3_push_cmds) else ''
    fifo_cmds = "\n".join('%s &' % l for l in fifo_lines)

    # HANDLE OUTPUTS
    def gen_pushes():
        """Yield one s3 push command per non-forwarded, non-streamed output.

        Side effect: for streamable outputs, rewrites kwargs to a streaming
        expression instead of yielding a push command.
        """
        if s3_path:
            for output_name, output_vals in output_map.items():
                if isinstance(fxn_sig.parameters[output_name].default, Forward):
                    # do not s3 push if this is a forwarded input
                    continue
                else:
                    if not isinstance(output_vals, list):
                        output_vals = [output_vals]
                    for out_val in output_vals:
                        local_path = out_val
                        s3_push_path = opj(s3_path, out_val)
                        if output_name in getattr(fxn, 'can_stream', []):
                            # yield 'mkfifo {fifo_path} && cat {fifo_path} | {cp}'.format(fifo_path=local_path, cp=s3cmd.cp('-', s3_push_path))
                            kwargs[output_name] = s3cmd.stream_out(opj(s3_path, out_val))  # do not push since we're streaming
                        else:
                            yield s3cmd.cp(out_val, s3_push_path)

    fifo_lines, s3_push_cmds = partition(lambda cmd: cmd.startswith('mkfifo'), filter(bool, gen_pushes()))
    fifo_lines, s3_push_cmds = list(fifo_lines), list(s3_push_cmds)
    # s3_push_all_outputs = "\n".join('/usr/bin/time -f "s3 push #{0} %E" {1} 2>&1 &'.format(i, l) for i, l in enumerate(cp_lines)) + '\nwait' if len(
    #     cp_lines) else ''
    fifo_cmds += "\n".join('%s &' % l for l in fifo_lines) + "\n" if len(fifo_lines) else ''

    # Run the wrapped function with the rewritten kwargs to get the core
    # command string (or NOOP).
    r = fxn(**kwargs)
    # print prepend, r, append
    # if 'bwa' in fxn.__name__:
    #     raise
    if r == NOOP:
        return NOOP
    else:
        # NOTE(review): fifo_cmds is passed to render() below but the template
        # contains no {{fifo_cmds}} placeholder -- confirm whether fifo setup
        # is intentionally disabled.
        return jinja2.Template("""#!/bin/bash
set -e
set -o pipefail
TMP_DIR=`mktemp -d --tmpdir={{tmp_dir}} {{stage_name}}_XXXXXXXXX`
trap "rm -rf $TMP_DIR" EXIT
echo "Running on host: `hostname`"
echo "Created temp dir: $TMP_DIR"
echo "Mount space before pull: `df -h |grep scratch`"
cd $TMP_DIR
{{make_output_dir}}
{{s3pull}}
echo "S3 Pulled data size:" `du -hs .`
echo "Mount space before after pull: `df -h |grep scratch`"
{{r}}
{{ s3push }}
""").render(tmp_dir=s['gk']['tmp_dir'],
            s3pull=parallel.render(cmds=s3_pull_cmds),
            s3push=parallel.render(cmds=s3_push_cmds),
            stage_name=stage_name,
            fifo_cmds=fifo_cmds,
            r=r,
            s3_pull_cmds=s3_pull_cmds,
            s3_push_cmds=s3_push_cmds,
            make_output_dir='mkdir -p %s\n' % task.output_dir if task.output_dir and task.output_dir != '' else '')
def wrapped(fxn, *args, **kwargs):
    """Wrap *fxn* with S3 input/output handling.

    1) If input starts with s3:// or s3_path is set, pull or stream
    2) If s3_path is set, push outputs to S3

    :returns: NOOP unchanged when fxn returns NOOP; otherwise a rendered
        bash script (string) that pulls inputs, runs the command fxn
        produced, and pushes outputs.
    """
    fxn_sig = funcsigs.signature(fxn)
    # decorator.decorator delivers everything positionally; rebuild kwargs
    # by pairing parameter names with positional args.
    for idx, param_name in enumerate(fxn_sig.parameters):
        kwargs[param_name] = args[idx]

    streamable = getattr(fxn, 'can_stream', [])

    # HANDLE INPUTS
    def process_input_map():  # TODO this function should probably be refactored for readability
        """Collect the s3 pull commands for every input and repoint kwargs
        at the local/streamed locations.

        :returns: (fifo_lines, pull_lines) lists of shell command strings.
        """
        def localize(param_name, remote):
            """:returns: s3_copy_command, new_input_value"""
            if remote.startswith('s3://'):
                # Absolute s3 url: materialize under a collision-free name.
                target = 'tmp-%s__%s' % (random_str(6), os.path.basename(remote))
                source = remote
            else:
                # Relative path: anchor at the job-wide s3_path prefix.
                target = remote
                source = opj(s3_path, remote)
            if param_name in streamable:
                # Streamed: no copy command, the value is a stream expression.
                return None, s3cmd.stream_in(source)
            # pull to cwd as a plain file (deleted along with $TMP_DIR)
            return s3cmd.cp(source, target, chrom=task.tags.get('contig')), target

        def unwanted_bai(value):
            # since we are going to be slicing, we do not want the bai
            return task.tags.get('contig') and value.endswith('.bai')

        copy_cmds = []
        for name, value in input_map.items():
            if isinstance(value, list):
                if unwanted_bai(value[0]):
                    continue
                pairs = [localize(name, item) for item in value]
                copy_cmds.extend(cmd for cmd, _ in pairs)
                kwargs[name] = tuple(path for _, path in pairs)
            else:
                if unwanted_bai(value):
                    continue
                cmd, local_value = localize(name, value)
                copy_cmds.append(cmd)
                kwargs[name] = local_value

        # Drop the None commands from streamed inputs, then split the
        # mkfifo setup lines from the plain pull commands.
        fifos, pulls = partition(lambda c: c.startswith('mkfifo'), filter(bool, copy_cmds))
        return list(fifos), list(pulls)

    fifo_lines, s3_pull_cmds = process_input_map()
    fifo_cmds = "\n".join('%s &' % line for line in fifo_lines)

    # HANDLE OUTPUTS
    def collect_pushes():
        """Return the list of s3 push commands for non-forwarded outputs.

        Side effect: streamable outputs are rewritten in kwargs to a
        streaming expression and therefore not pushed.
        """
        pushes = []
        if not s3_path:
            return pushes
        for out_name, out_vals in output_map.items():
            if isinstance(fxn_sig.parameters[out_name].default, Forward):
                # do not s3 push if this is a forwarded input
                continue
            values = out_vals if isinstance(out_vals, list) else [out_vals]
            for val in values:
                if out_name in streamable:
                    # do not push since we're streaming
                    kwargs[out_name] = s3cmd.stream_out(opj(s3_path, val))
                else:
                    pushes.append(s3cmd.cp(val, opj(s3_path, val)))
        return pushes

    fifo_iter, push_iter = partition(lambda c: c.startswith('mkfifo'), filter(bool, collect_pushes()))
    fifo_lines, s3_push_cmds = list(fifo_iter), list(push_iter)
    if fifo_lines:
        fifo_cmds += "\n".join('%s &' % line for line in fifo_lines) + "\n"

    # Run the wrapped function against the rewritten kwargs.
    r = fxn(**kwargs)
    if r == NOOP:
        return NOOP

    make_output_dir = 'mkdir -p %s\n' % task.output_dir if task.output_dir and task.output_dir != '' else ''
    script = jinja2.Template("""#!/bin/bash
set -e
set -o pipefail
TMP_DIR=`mktemp -d --tmpdir={{tmp_dir}} {{stage_name}}_XXXXXXXXX`
trap "rm -rf $TMP_DIR" EXIT
echo "Running on host: `hostname`"
echo "Created temp dir: $TMP_DIR"
echo "Mount space before pull: `df -h |grep scratch`"
cd $TMP_DIR
{{make_output_dir}}
{{s3pull}}
echo "S3 Pulled data size:" `du -hs .`
echo "Mount space before after pull: `df -h |grep scratch`"
{{r}}
{{ s3push }}
""")
    return script.render(tmp_dir=s['gk']['tmp_dir'],
                         s3pull=parallel.render(cmds=s3_pull_cmds),
                         s3push=parallel.render(cmds=s3_push_cmds),
                         stage_name=stage_name,
                         fifo_cmds=fifo_cmds,
                         r=r,
                         s3_pull_cmds=s3_pull_cmds,
                         s3_push_cmds=s3_push_cmds,
                         make_output_dir=make_output_dir)