import errno
import logging
import os
import platform
import select
import socket
import sys
import tempfile
import time
import timeit
import traceback
import uuid

from toil.lib.docker import apiDockerCall
from toil.realtimeLogger import RealtimeLogger


def download(filename):
    """
    Download each file
    """
    try:
        if (not options.overwrite) and out_store.exists(filename):
            # File exists. But make sure its size is correct.

            if not options.check_size:
                # Skip existing file. No need to check the length.
                RealtimeLogger.info("Skipped {}".format(filename))
                return

            out_size = out_store.get_size(filename)
            in_size = in_store.get_size(filename)
            if out_size != in_size:
                # Complain about size mismatch and copy
                RealtimeLogger.warning(
                    "Redownloading {}! Size was {} and not {}!".format(
                        filename, out_size, in_size))
            else:
                # Skip existing file
                RealtimeLogger.info("Skipped {}".format(filename))
                return

        # Make a temp file
        (handle, path) = tempfile.mkstemp(dir=job.fileStore.getLocalTempDir())
        os.close(handle)

        RealtimeLogger.debug("Download {}".format(filename))

        # Download
        in_store.read_input_file(filename, path)
        # Store
        out_store.write_output_file(path, filename)
        # Clean up
        os.unlink(path)

    except:
        # Put all exception text into an exception and raise that
        raise Exception("".join(
            traceback.format_exception(*sys.exc_info())))

    RealtimeLogger.info("Copied {}".format(filename))
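
# For illustration only: a minimal, self-contained sketch of the same
# skip-or-redownload policy download() applies, using plain local files.
# copy_if_needed and its parameters are hypothetical stand-ins for the
# in_store/out_store interface, not part of the pipeline.
def copy_if_needed(src, dst, overwrite=False, check_size=True):
    import shutil
    if not overwrite and os.path.exists(dst):
        if not check_size:
            return False  # Skip: destination exists; length not checked
        if os.path.getsize(dst) == os.path.getsize(src):
            return False  # Skip: destination exists with the right size
        # Size mismatch: fall through and copy again
    shutil.copyfile(src, dst)
    return True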
def call_with_docker(self, job, args, work_dir, outfile, errfile, check_output, tool_name):
    """
    Thin wrapper for docker_call that will use internal lookup to
    figure out the location of the docker file. Only exposes docker_call
    parameters used so far.

    Expects args as a list of lists. If the top-level list has more than
    one entry, the piping interface is used.

    Supports redirecting output to outfile, unless check_output is used,
    in which case output is captured and returned.
    """

    RealtimeLogger.info(
        truncate_msg("Docker Run: {}".format(" | ".join(" ".join(x) for x in args))))
    start_time = timeit.default_timer()

    # We use the first argument to look up the tool in the docker map,
    # but allow overriding of this with the tool_name parameter.
    name = tool_name if tool_name is not None else args[0][0]
    tool = self.docker_tool_map[name]

    # We keep an environment dict
    environment = {}

    # And an entry point override
    entrypoint = None

    # And a volumes dict for mounting
    volumes = {}

    # And a working directory override
    working_dir = None

    # Setting TMPDIR breaks Rscript. TODO: investigate how general this actually is.
    if name != 'Rscript':
        # vg uses TMPDIR for temporary files.
        # This is particularly important for gcsa, which makes massive files.
        # We will default to keeping these in our working directory.
        environment['TMPDIR'] = '.'

    if name == 'Rscript':
        # The R dockers sometimes default to installing packages in
        # non-writable directories. Make sure a writable directory which
        # exists is used.
        environment['R_LIBS'] = '/tmp'

    if name == 'vg':
        environment['VG_FULL_TRACEBACK'] = '1'

    # Ugly hack for Platypus, as the default container doesn't have the
    # executable in its PATH.
    if tool == 'quay.io/biocontainers/platypus-variant:0.8.1.1--htslib1.7_1' and \
       args[0][0] == 'Platypus.py':
        args[0][0] = '/usr/local/share/platypus-variant-0.8.1.1-1/Platypus.py'

    # Force all dockers to run sort in a consistent way
    environment['LC_ALL'] = 'C'

    # Set our working directory map
    if work_dir is not None:
        volumes[os.path.abspath(work_dir)] = {
            'bind': '/data',
            'mode': 'rw'
        }
        working_dir = '/data'

    if outfile is not None:
        # We need to send output to a file object
        assert (not check_output)

        # We can't just redirect stdout of the container from the API, so
        # we do something more complicated.

        # Now we need to populate an FD that spits out the container output.
        output_fd = None

        # We may be able to use a FIFO, or we may need a network connection.
        # FIFO sharing between host and container only works on Linux.
        use_fifo = (platform.system() == 'Linux')

        if use_fifo:
            # On a Linux host we can just use a FIFO from the container to the host.

            # Set up a FIFO to receive it
            fifo_dir = tempfile.mkdtemp()
            fifo_host_path = os.path.join(fifo_dir, 'stdout.fifo')
            os.mkfifo(fifo_host_path)

            # Mount the FIFO in the container.
            # The container doesn't actually have to have the mountpoint
            # directory in its filesystem.
            volumes[fifo_dir] = {'bind': '/control', 'mode': 'rw'}

            # Redirect the command output by tacking on another pipeline stage
            parameters = args + [['dd', 'of=/control/stdout.fifo']]

            # Open the FIFO in nonblocking mode. See
            # <https://stackoverflow.com/a/5749687> and
            # <http://shallowsky.com/blog/programming/python-read-characters.html>
            output_fd = os.open(fifo_host_path, os.O_RDONLY | os.O_NONBLOCK)
        else:
            # On a Mac host we can't, because of
            # https://github.com/docker/for-mac/issues/483
            # We need to go over the network instead.
            # Open an IPv4 TCP socket, since we know Docker uses IPv4 only
            listen_sock = socket.socket(socket.AF_INET)
            # Bind it to an OS-selected port on all interfaces, since we
            # can't determine the Docker interface.
            # TODO: socket.INADDR_ANY ought to work here but is rejected
            # for being an int.
            listen_sock.bind(('', 0))

            # Start listening
            listen_sock.listen(1)

            # Get the port we were given
            listen_port = listen_sock.getsockname()[1]

            # Generate a random security cookie. Since we can't really stop
            # Internet randos from connecting to our socket, we bail out on
            # any connection that doesn't start with this cookie and a newline.
            security_cookie = str(uuid.uuid4())

            # Redirect the command output to that port using Bash networking.
            # Your Docker needs to be 18.03+ to support host.docker.internal.
            # Your container needs to have bash with networking support.
            parameters = args + [[
                'bash', '-c',
                'exec 3<>/dev/tcp/host.docker.internal/{}; cat <(echo {}) - >&3'
                .format(listen_port, security_cookie)
            ]]

            RealtimeLogger.debug(
                "Listening on port {} for output from Docker container".format(
                    listen_port))

            # We can't populate the FD until we accept, which we can't do
            # until the Docker comes up and is trying to connect.

        RealtimeLogger.debug("Final Docker command: {}".format(
            " | ".join(" ".join(x) for x in parameters)))

        # Start the container detached so we don't wait on it
        container = apiDockerCall(job, tool, parameters,
                                  volumes=volumes,
                                  working_dir=working_dir,
                                  entrypoint=entrypoint,
                                  environment=environment,
                                  detach=True)

        RealtimeLogger.debug("Asked for container {}".format(container.id))

        if not use_fifo:
            # Try and accept a connection from the container.
            # Make sure there's a timeout so we don't accept forever.
            listen_sock.settimeout(10)
            for attempt in range(3):
                connection_sock, remote_address = listen_sock.accept()

                RealtimeLogger.info(
                    "Got connection from {}".format(remote_address))

                # Set a 10 second timeout for the cookie
                connection_sock.settimeout(10)

                # Check the security cookie. recv gives bytes, so encode
                # the expected cookie for the comparison.
                received_cookie_and_newline = connection_sock.recv(
                    len(security_cookie) + 1)

                if received_cookie_and_newline != (security_cookie + "\n").encode('utf8'):
                    # Incorrect security cookie.
                    RealtimeLogger.warning(
                        "Received incorrect security cookie message from {}"
                        .format(remote_address))
                    continue
                else:
                    # This is the container we are looking for.
                    # Go into nonblocking mode, which our read code expects.
                    connection_sock.setblocking(False)

                    # Set the FD
                    output_fd = connection_sock.fileno()
                    break

            if output_fd is None:
                # We can't get ahold of the Docker in time
                raise RuntimeError(
                    "Could not establish network connection for Docker output!")

        # If the Docker container goes badly enough, it may not even open
        # the other end of the connection. So we can't just wait for it to
        # EOF before checking on the Docker.

        # Now read ought to throw if there is no data. But
        # <https://stackoverflow.com/q/38843278> and some testing suggest
        # that this doesn't happen, and it just looks like EOF. So we will
        # watch out for that.

        try:
            # Prevent leaking FDs

            # If this is set, and there is no data in the pipe, decide that
            # no data is coming
            last_chance = False
            # If this is set, we have seen data in the pipe, so the other
            # end must have opened it and will eventually close it if it
            # doesn't run forever.
            saw_data = False

            while True:
                # While there still might be data in the pipe

                if output_fd is not None:
                    # Select on the pipe with a timeout, so we don't spin
                    # constantly waiting for data.
                    can_read, can_write, had_error = select.select(
                        [output_fd], [], [output_fd], 10)

                    if len(can_read) > 0 or len(had_error) > 0:
                        # There is data available or something else weird
                        # about our FIFO.

                        try:
                            # Do a nonblocking read. Since we checked with
                            # select we never should get b"" unless there's
                            # an EOF.
                            data = os.read(output_fd, 4096)

                            if data == b"":
                                # We didn't throw and we got nothing, so it
                                # must be EOF.
                                RealtimeLogger.debug("Got EOF")
                                break

                        except OSError as err:
                            if err.errno in [errno.EAGAIN, errno.EWOULDBLOCK]:
                                # There is no data right now
                                data = None
                            else:
                                # Something else has gone wrong
                                raise err

                    else:
                        # There is no data available. Don't even try to
                        # read. Treat it as if a read refused to block.
                        data = None

                    if data is not None:
                        # Send our data to the outfile
                        outfile.write(data)
                        saw_data = True
                    elif not saw_data:
                        # We timed out and there has never been any data.
                        # Maybe the container has died/never started?

                        if last_chance:
                            # The container has been dead for a while and
                            # nothing has arrived yet. Assume no data is
                            # coming.
                            RealtimeLogger.warning(
                                "Giving up on output from container {}".format(
                                    container.id))
                            break

                        # Otherwise, check on it
                        container.reload()

                        if container.status not in [
                                'created', 'restarting', 'running', 'removing'
                        ]:
                            # The container has stopped. So what are we
                            # doing waiting around for it?

                            # Wait one last time for any lingering data to
                            # percolate through the FIFO
                            time.sleep(10)
                            last_chance = True
                            continue
        finally:
            # No matter what happens, close our end of the connection
            os.close(output_fd)
            if not use_fifo:
                # Also close the listening socket
                listen_sock.close()

        # Now our data is all sent.
        # Wait on the container and get its return code.
        return_code = container.wait()

        if use_fifo:
            # Clean up the FIFO files
            os.unlink(fifo_host_path)
            os.rmdir(fifo_dir)

    else:
        # No piping needed.
        if len(args) == 1:
            # Split off the first argument as the entrypoint (so we can be
            # oblivious as to whether that happens by default).
            parameters = [] if len(args[0]) == 1 else args[0][1:]
            entrypoint = args[0][0]
        else:
            # Can leave as is for the piped interface, which takes a list
            # of args lists and doesn't worry about entrypoints, since
            # everything goes through bash -c.
            # TODO: check we have a bash entrypoint!
            parameters = args

        # Run the container and dump the logs if it fails.
        container = apiDockerCall(job, tool, parameters,
                                  volumes=volumes,
                                  working_dir=working_dir,
                                  entrypoint=entrypoint,
                                  environment=environment,
                                  detach=True)

        # Wait on the container and get its return code.
        return_code = container.wait()

    # When we get here, the container has been run, and stdout is either
    # in the file object we sent it to or in the Docker logs.
    # stderr is always in the Docker logs.

    if return_code != 0:
        # What were we doing?
        command = " | ".join(" ".join(x) for x in args)

        # Dump logs
        RealtimeLogger.error(
            "Docker container for command {} failed with code {}".format(
                command, return_code))
        RealtimeLogger.error("Dumping stderr...")
        for line in container.logs(stderr=True, stdout=False, stream=True):
            # Trim trailing \n
            RealtimeLogger.error(line[:-1])

        if not check_output and outfile is None:
            # Dump stdout as well, since it's not something the caller
            # wanted as data.
            RealtimeLogger.error("Dumping stdout...")
            for line in container.logs(stderr=False, stdout=True, stream=True):
                # Trim trailing \n
                RealtimeLogger.error(line[:-1])

        # Raise an error since it's not success
        raise RuntimeError(
            "Docker container for command {} failed with code {}".format(
                command, return_code))
    elif errfile:
        # The user wants stderr even if there was no crash
        for line in container.logs(stderr=True, stdout=False, stream=True):
            errfile.write(line)

    if check_output:
        # We need to collect the output. We grab it from Docker's handy
        # on-disk buffer.
        # TODO: Bad Things can happen if the container logs too much.
        captured_stdout = container.logs(stderr=False, stdout=True)

    end_time = timeit.default_timer()
    run_time = end_time - start_time
    RealtimeLogger.info("Successfully ran {} in Docker in {} seconds.".format(
        " | ".join(" ".join(x) for x in args), run_time))

    if outfile:
        outfile.flush()
        os.fsync(outfile.fileno())

    if check_output is True:
        return captured_stdout
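
# For illustration only: a minimal, self-contained sketch of the
# nonblocking-FIFO capture pattern call_with_docker() uses on Linux hosts,
# with a plain subprocess standing in for the Docker container. The
# saw_data flag mirrors the logic above: an empty read from a FIFO can
# mean "no writer has opened it yet" as well as real EOF, so we only
# treat it as EOF once data has been seen or the writer has exited.
def fifo_capture_sketch():
    import subprocess

    fifo_dir = tempfile.mkdtemp()
    fifo_path = os.path.join(fifo_dir, 'stdout.fifo')
    os.mkfifo(fifo_path)

    # Stand-in for the container: any process that writes to the FIFO.
    writer = subprocess.Popen(['bash', '-c', 'echo hello > ' + fifo_path])

    # Open our end in nonblocking mode so open() doesn't wait for a writer.
    fd = os.open(fifo_path, os.O_RDONLY | os.O_NONBLOCK)
    saw_data = False
    try:
        while True:
            can_read, _, _ = select.select([fd], [], [fd], 10)
            if not can_read:
                continue
            try:
                data = os.read(fd, 4096)
            except OSError as err:
                if err.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
                    continue  # No data right now
                raise
            if data == b"":
                # Real EOF, or no writer has opened the FIFO yet.
                if saw_data or writer.poll() is not None:
                    break
                time.sleep(0.1)
                continue
            saw_data = True
            sys.stdout.write(data.decode())
    finally:
        os.close(fd)
        writer.wait()
        os.unlink(fifo_path)
        os.rmdir(fifo_dir)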
def run_chunk_alignment(job, context, gam_input_reads, bam_input_reads, sample_name,
                        interleaved, mapper, chunk_filename_ids, chunk_id,
                        indexes, bam_output=False, gbwt_penalty=None,
                        always_check_population=True, validate=False,
                        fasta_dict_id=None):
    """
    Align a chunk of reads.

    Takes a dict from index type to index file ID. Some indexes are extra
    and specifying them will change mapping behavior.
    """

    RealtimeLogger.info("Starting {} alignment on {} chunk {}".format(
        mapper, sample_name, chunk_id))

    # How long did the alignment take to run, in seconds?
    run_time = None

    # Define work directory for docker calls
    work_dir = job.fileStore.getLocalTempDir()

    # Download local input files from the remote storage container
    graph_file = os.path.join(work_dir, "graph.vg")

    # Work out what index files we need
    index_files = {}
    index_files['xg'] = graph_file + ".xg"
    if mapper == 'map' or mapper == 'mpmap':
        index_files['gcsa'] = graph_file + ".gcsa"
        index_files['lcp'] = index_files['gcsa'] + ".lcp"

        if 'gbwt' in indexes:
            # We have a GBWT haplotype index available.
            index_files['gbwt'] = graph_file + ".gbwt"

    if mapper == 'mpmap':
        if 'snarls' in indexes:
            # mpmap knows how to use the snarls, and we have them, so we
            # should use them.

            # Note that passing them will affect mapping, if using multiple
            # tracebacks. Since we only run single path mode, if multiple
            # tracebacks aren't used, mpmap will ignore the snarls.
            index_files['snarls'] = graph_file + ".snarls"

    if mapper == 'giraffe':
        index_files['minimizer'] = graph_file + ".min"
        index_files['distance'] = graph_file + ".dist"
        index_files['gbwt'] = graph_file + ".gbwt"
        if 'ggbwt' in indexes:
            index_files['ggbwt'] = graph_file + ".gg"

    for index_type in list(index_files.keys()):
        # Download each index file
        job.fileStore.readGlobalFile(indexes[index_type],
                                     index_files[index_type])

    # We need the sample reads (fastq(s) or gam) for alignment
    reads_files = []
    reads_ext = 'gam' if gam_input_reads else 'bam' if bam_input_reads else 'fq.gz'
    for j, chunk_filename_id in enumerate(chunk_filename_ids):
        reads_file = os.path.join(work_dir, 'reads_chunk_{}_{}.{}'.format(
            chunk_id, j, reads_ext))
        job.fileStore.readGlobalFile(chunk_filename_id, reads_file)
        reads_files.append(reads_file)

    # And a temp file for our aligner output
    if bam_output is False:
        output_file = os.path.join(work_dir, "{}_{}.gam".format(sample_name, chunk_id))
    else:
        output_file = os.path.join(work_dir, "{}_{}.bam".format(sample_name, chunk_id))

    # Open the file stream for writing
    with open(output_file, 'wb') as alignment_file:

        # Start the aligner and have it write to the file

        # Plan out what to run
        vg_parts = []

        if mapper == 'mpmap':
            vg_parts += ['vg', 'mpmap']
            vg_parts += context.config.mpmap_opts
            if ('-F' not in vg_parts and '--output-fmt' not in vg_parts) or \
               'GAM' not in vg_parts:
                RealtimeLogger.warning(
                    'Adding --output-fmt GAM to mpmap options as only GAM '
                    'output is supported')
                vg_parts += ['--output-fmt', 'GAM']
        elif mapper == 'map':
            vg_parts += ['vg', 'map']
            vg_parts += context.config.map_opts
        elif mapper == 'giraffe':
            vg_parts += ['vg', 'giraffe']
            vg_parts += context.config.giraffe_opts
        else:
            raise RuntimeError('Unimplemented mapper "{}"'.format(mapper))

        for reads_file in reads_files:
            input_flag = '-G' if gam_input_reads else '-b' if bam_input_reads else '-f'
            vg_parts += [input_flag, os.path.basename(reads_file)]

        vg_parts += ['-t', str(context.config.alignment_cores)]
        vg_parts += ['-R', 'SM:{}'.format(sample_name)]

        # Override the -i flag in args with the --interleaved command-line flag
        if (interleaved is True and '-i' not in vg_parts
                and '--interleaved' not in vg_parts):
            vg_parts += ['-i']
        elif interleaved is False and '-i' in vg_parts:
            del vg_parts[vg_parts.index('-i')]
        if interleaved is False and '--interleaved' in vg_parts:
            del vg_parts[vg_parts.index('--interleaved')]

        # Override the --surject-to option
        if bam_output is True and '--surject-to' not in vg_parts and mapper != 'giraffe':
            vg_parts += ['--surject-to', 'bam']
        elif bam_output is True and '--output-format' not in vg_parts and mapper == 'giraffe':
            vg_parts += ['--output-format', 'BAM']
        elif bam_output is False and '--surject-to' in vg_parts:
            # Remove the flag and its value
            sidx = vg_parts.index('--surject-to')
            del vg_parts[sidx]
            del vg_parts[sidx]

        # Turn indexes into options
        type_to_option = {
            'gbwt': '--gbwt-name',
            'xg': '-x',
            'gcsa': '-g',
            'lcp': None,
            'distance': '-d',
            'minimizer': '-m',
            'ggbwt': '--graph-name',
            'snarls': '--snarls'
        }
        for index_type, index_file in list(index_files.items()):
            if type_to_option[index_type] is not None:
                vg_parts += [type_to_option[index_type],
                             os.path.basename(index_file)]

        if 'gbwt' in index_files:
            # We may have a GBWT recombination rate/penalty override
            if gbwt_penalty is not None:
                # We have a recombination penalty value to apply
                if '--recombination-penalty' in vg_parts:
                    # Make sure to strip out the penalty if it is in args already
                    sidx = vg_parts.index('--recombination-penalty')
                    del vg_parts[sidx]
                    del vg_parts[sidx]

                # Both map and mpmap take this option
                vg_parts += ['--recombination-penalty', str(gbwt_penalty)]

            if mapper == 'mpmap' and always_check_population:
                # Always try to population-score even unambiguous reads.
                # mpmap can do this.
                vg_parts += ['--always-check-population']

        if fasta_dict_id is not None and bam_output is True:
            fasta_dict_file = os.path.join(work_dir, 'fasta.dict')
            job.fileStore.readGlobalFile(fasta_dict_id, fasta_dict_file)
            vg_parts += ['--ref-paths', os.path.basename(fasta_dict_file)]

        RealtimeLogger.info(
            "Running VG for {} against {}: {}".format(
                sample_name, graph_file, " ".join(vg_parts)))

        # Mark when we start the alignment
        start_time = timeit.default_timer()
        command = vg_parts
        try:
            context.runner.call(job, command, work_dir=work_dir,
                                outfile=alignment_file)
            end_time = timeit.default_timer()
            if validate:
                alignment_file.flush()
                context.runner.call(job, ['vg', 'validate',
                                          os.path.basename(index_files['xg']),
                                          '--gam', os.path.basename(output_file)],
                                    work_dir=work_dir)
        except:
            # Dump everything we need to replicate the alignment
            end_time = timeit.default_timer()
            logging.error("Mapping failed. Dumping files.")

            for index_file in list(index_files.values()):
                context.write_output_file(job, index_file)

            for reads_file in reads_files:
                context.write_output_file(job, reads_file)

            raise

    # Mark when it's done
    run_time = end_time - start_time

    paired_end = '-i' in vg_parts or '--interleaved' in vg_parts or \
        len(chunk_filename_ids) > 1
    RealtimeLogger.info("Aligned {}. Process took {} seconds with {} vg-{}".format(
        output_file, run_time, 'paired-end' if paired_end else 'single-end', mapper))

    if 'id_ranges' in indexes and bam_output is False:
        # Break GAM into multiple chunks at the end. So we need the file
        # defining those chunks.
        id_ranges_file = os.path.join(work_dir, 'id_ranges.tsv')
        job.fileStore.readGlobalFile(indexes['id_ranges'], id_ranges_file)

        # Chunk the gam up by chromosome
        gam_chunks = split_gam_into_chroms(job, work_dir, context,
                                           index_files['xg'], id_ranges_file,
                                           output_file)

        # Write gam_chunks to store
        gam_chunk_ids = []
        for gam_chunk in gam_chunks:
            gam_chunk_ids.append(context.write_intermediate_file(job, gam_chunk))

        return gam_chunk_ids, run_time
    else:
        # We can just report one chunk of everything
        return [context.write_intermediate_file(job, output_file)], run_time
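
# For illustration only: a hedged sketch of how a parent Toil job might fan
# run_chunk_alignment() out over read chunks and collect its promised
# ([chunk file IDs], run_time) results. run_fanout, chunk_id_lists, and the
# argument values are hypothetical; the project's real wiring may differ.
def run_fanout(job, context, chunk_id_lists, indexes, sample_name):
    chunk_results = []
    for chunk_id, chunk_filename_ids in enumerate(chunk_id_lists):
        child = job.addChildJobFn(run_chunk_alignment, context,
                                  False,        # gam_input_reads
                                  False,        # bam_input_reads
                                  sample_name,
                                  True,         # interleaved
                                  'giraffe',    # mapper
                                  chunk_filename_ids, chunk_id, indexes,
                                  cores=context.config.alignment_cores)
        # Each .rv() is a promise resolved when the child job finishes
        chunk_results.append(child.rv())
    return chunk_results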