Example 1
    def __connect(self):
        """
        Make sure we have an S3 Bucket connection, and set one up if we don't.
        Creates the S3 bucket if it doesn't exist.
        """

        if self.s3 is None:
            RealtimeLogger.debug("Connecting to bucket {} in region".format(
                self.bucket_name, self.region))

            # Configure boto3 for caching assumed role credentials with the same cache Toil uses
            botocore_session = botocore.session.get_session()
            botocore_session.get_component('credential_provider').get_provider(
                'assume-role').cache = botocore.credentials.JSONFileCache()
            boto3_session = boto3.Session(botocore_session=botocore_session)

            # Connect to the s3 bucket service where we keep everything
            self.s3 = boto3_session.client('s3')
            try:
                self.s3.head_bucket(Bucket=self.bucket_name)
            except botocore.exceptions.ClientError:
                self.s3.create_bucket(Bucket=self.bucket_name,
                                      CreateBucketConfiguration={
                                          'LocationConstraint': self.region
                                      })
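One caveat on the exception handling here: head_bucket raises botocore.exceptions.ClientError for a missing bucket but also for permission problems, so a stricter version would inspect the error code before creating. A minimal sketch, with ensure_bucket as a hypothetical helper name:

import botocore.exceptions

def ensure_bucket(s3, bucket_name):
    # head_bucket raises ClientError for both a missing bucket and a
    # permissions problem; only a 404 should trigger creation
    try:
        s3.head_bucket(Bucket=bucket_name)
    except botocore.exceptions.ClientError as e:
        if e.response['Error']['Code'] == '404':
            s3.create_bucket(Bucket=bucket_name)
        else:
            raise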
Example 2
    def write_output_file(self, local_path, output_path):
        """
        Write output to the filesystem
        """

        RealtimeLogger.debug("Saving {} to FileIOStore in {}".format(
            output_path, self.path_prefix))

        # What's the real output path to write to?
        real_output_path = os.path.join(self.path_prefix, output_path)

        # What directory should this go in?
        parent_dir = os.path.split(real_output_path)[0]

        if parent_dir != "":
            # Make sure the directory it goes in exists.
            robust_makedirs(parent_dir)

        # Make a temporary file
        temp_handle, temp_path = tempfile.mkstemp(dir=self.path_prefix)
        os.close(temp_handle)

        # Copy to the temp file
        shutil.copy2(local_path, temp_path)

        if os.path.exists(real_output_path):
            # At least try to get existing files out of the way first.
            try:
                os.unlink(real_output_path)
            except OSError:
                pass

        # Rename the temp file to the right place, atomically
        os.rename(temp_path, real_output_path)
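The temp-file-plus-rename dance above is the standard trick for atomic writes: readers see either the old file or the complete new one, never a partial copy. A minimal standalone sketch of the pattern, with atomic_copy as a hypothetical helper name:

import os
import shutil
import tempfile

def atomic_copy(src, dest):
    # Stage the copy in the destination's own directory, so the final
    # os.rename stays within one filesystem, where POSIX makes it atomic
    handle, staging_path = tempfile.mkstemp(dir=os.path.dirname(dest) or '.')
    os.close(handle)
    shutil.copy2(src, staging_path)
    os.rename(staging_path, dest)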
Example 3
    def __connect(self):
        """
        Make sure we have an S3 Bucket connection, and set one up if we don't.
        Creates the S3 bucket if it doesn't exist.
        """

        if self.s3 is None:
            RealtimeLogger.debug("Connecting to bucket {} in region {}".format(
                self.bucket_name, self.region))
            print "Connecting to bucket {} in region {}".format(
                self.bucket_name, self.region)

            # Connect to the s3 bucket service where we keep everything
            self.s3 = boto3.client('s3',
                                   self.region,
                                   config=botocore.client.Config(
                                       signature_version='s3v4',
                                       retries={"max_attempts": 20}))
            self.s3r = boto3.resource('s3',
                                      self.region,
                                      config=botocore.client.Config(
                                          signature_version='s3v4',
                                          retries={"max_attempts": 20}))
            try:
                self.s3.head_bucket(Bucket=self.bucket_name)
            except botocore.exceptions.ClientError:
                if self.region == 'us-east-1':
                    # us-east-1 is the default region and rejects an explicit
                    # LocationConstraint
                    self.s3.create_bucket(Bucket=self.bucket_name)
                else:
                    self.s3.create_bucket(
                        Bucket=self.bucket_name,
                        CreateBucketConfiguration={
                            'LocationConstraint': self.region
                        },
                    )
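The branch on us-east-1 is needed because that region is S3's default and create_bucket rejects an explicit LocationConstraint for it. A minimal sketch of the same logic as a free function, with create_bucket_in_region as a hypothetical name:

def create_bucket_in_region(s3, bucket_name, region):
    # us-east-1 is the default location and must be created without a
    # CreateBucketConfiguration; every other region requires one
    if region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region})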
Example 4
    def read_input_file(self, input_path, local_path):
        """
        Get input from the filesystem.
        """

        RealtimeLogger.debug("Loading {} from FileIOStore in {} to {}".format(
            input_path, self.path_prefix, local_path))

        if os.path.exists(local_path):
            # Try deleting the existing item if it already exists
            try:
                os.unlink(local_path)
            except OSError:
                # Don't fail here, fail complaining about the assertion, which
                # will be more informative.
                pass

        # Make sure the path is clear for copying
        assert not os.path.exists(local_path)

        # Where is the file actually?
        real_path = os.path.abspath(os.path.join(self.path_prefix, input_path))

        if not os.path.exists(real_path):
            RealtimeLogger.error(
                "Can't find {} from FileIOStore in {}!".format(
                    input_path, self.path_prefix))
            raise RuntimeError("File {} missing!".format(real_path))

        # Make a temporary file
        temp_handle, temp_path = tempfile.mkstemp(
            dir=os.path.dirname(local_path))
        os.close(temp_handle)

        # Copy to the temp file
        shutil.copy2(real_path, temp_path)

        # Rename the temp file to the right place, atomically
        RealtimeLogger.info("rename {} -> {}".format(temp_path, local_path))
        os.rename(temp_path, local_path)

        # Look at the file stats
        file_stats = os.stat(real_path)

        if (file_stats.st_uid == os.getuid()
                and file_stats.st_mode & stat.S_IWUSR):
            # We own this file and can write to it. We don't want the user
            # script messing it up through the symlink.

            try:
                # Clear the user write bit, so the user can't accidentally
                # clobber the file in the actual store through the symlink.
                os.chmod(real_path, file_stats.st_mode ^ stat.S_IWUSR)
            except OSError:
                # If something goes wrong here (like us not having permission to
                # change permissions), ignore it.
                pass
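The final chmod clears the user write bit with XOR, which is only safe because the code has just checked that the bit is set; masking with AND NOT works unconditionally. A minimal sketch of the safer form:

import os
import stat

def clear_user_write(path):
    mode = os.stat(path).st_mode
    # mode & ~stat.S_IWUSR clears the bit whether or not it was set;
    # mode ^ stat.S_IWUSR would instead *set* it if it happened to be clear
    os.chmod(path, mode & ~stat.S_IWUSR)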
Example 5
    def write_output_file(self, local_path, output_path):
        """
        Write output to S3.
        """

        self.__connect()

        RealtimeLogger.debug("Saving {} to S3IOStore".format(output_path))

        # Upload the file contents.
        self.s3.upload_file(local_path, self.bucket_name,
                            os.path.join(self.name_prefix, output_path))
Example 6
    def read_input_file(self, input_path, local_path):
        """
        Get input from S3.
        """

        self.__connect()

        RealtimeLogger.debug("Loading {} from S3IOStore".format(input_path))

        # Download the file contents.
        self.s3.download_file(self.bucket_name,
                              os.path.join(self.name_prefix, input_path),
                              local_path)
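Both S3 methods lean on boto3's managed transfer helpers, which handle multipart transfers and retries internally. A minimal usage sketch; the bucket and key names are hypothetical and credentials are assumed to be configured in the environment:

import boto3

s3 = boto3.client('s3')
s3.upload_file('results.vcf', 'my-bucket', 'runs/1/results.vcf')
s3.download_file('my-bucket', 'runs/1/results.vcf', 'local-results.vcf')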
Example 7
    def read_input_file(self, input_path, local_path):
        """
        Get input from Azure.
        """

        self.__connect()

        RealtimeLogger.debug("Loading {} from AzureIOStore".format(input_path))

        # Download the blob. This is known to be synchronous, although it can
        # call a callback during the process.
        self.connection.get_blob_to_path(self.container_name,
                                         self.name_prefix + input_path,
                                         local_path)
Example 8
    def __connect(self):
        """
        Make sure we have an Azure connection, and set one up if we don't.
        """

        if self.connection is None:
            RealtimeLogger.debug("Connecting to account {}, using "
                                 "container {} and prefix {}".format(
                                     self.account_name, self.container_name,
                                     self.name_prefix))

            # Connect to the blob service where we keep everything
            self.connection = BlobService(account_name=self.account_name,
                                          account_key=self.account_key)
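BlobService comes from the legacy azure-storage SDK (newer releases replace it with BlobServiceClient). A minimal usage sketch against the same legacy calls these examples use; the account name, key, and container are placeholders, and the exact import path varies across legacy SDK versions:

from azure.storage.blob import BlobService  # legacy SDK; path varies by version

connection = BlobService(account_name='myaccount', account_key='<key>')
connection.create_container('mycontainer')
connection.put_block_blob_from_path('mycontainer', 'prefix/out.txt', 'out.txt')
connection.get_blob_to_path('mycontainer', 'prefix/out.txt', 'copy.txt')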
Example 9
    def download(filename):
        """
        Download each file
        """

        try:

            if (not options.overwrite) and out_store.exists(filename):
                # File exists. But make sure its size is correct.

                if not options.check_size:
                    # Skip existing file. No need to check the length.
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return

                out_size = out_store.get_size(filename)
                in_size = in_store.get_size(filename)
                if out_size != in_size:
                    # Complain about size mismatch and copy
                    RealtimeLogger.warning(
                        "Redownloading {}! Size was {} and not {}!".format(
                            filename, out_size, in_size))
                else:
                    # Skip existing file
                    RealtimeLogger.info("Skipped {}".format(filename))
                    return

            # Make a temp file
            (handle,
             path) = tempfile.mkstemp(dir=job.fileStore.getLocalTempDir())
            os.close(handle)

            RealtimeLogger.debug("Download {}".format(filename))

            # Download
            in_store.read_input_file(filename, path)
            # Store
            out_store.write_output_file(path, filename)

            # Clean up
            os.unlink(path)

        except Exception:
            # Put all exception text into an exception and raise that
            raise Exception("".join(
                traceback.format_exception(*sys.exc_info())))

        RealtimeLogger.info("Copied {}".format(filename))
Example 10
    def write_output_file(self, local_path, output_path):
        """
        Write output to Azure. Will create the container if necessary.
        """

        self.__connect()

        RealtimeLogger.debug("Saving {} to AzureIOStore".format(output_path))

        try:
            # Make the container
            self.connection.create_container(self.container_name)
        except azure.WindowsAzureConflictError:
            # The container probably already exists
            pass

        # Upload the blob (synchronously)
        # TODO: catch no container error here, make the container, and retry
        self.connection.put_block_blob_from_path(
            self.container_name, self.name_prefix + output_path, local_path)
Example 11
    def run(self, fileStore):
        RealtimeLogger.info('This should be logged at info level')
        RealtimeLogger.debug('This should be logged at debug level')
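For context, a minimal sketch of how a job like this might be wired into a Toil workflow, assuming the toil package (the realTimeLogging option name is an assumption here, and whether the debug line appears depends on the workflow's log level, e.g. --logDebug):

from toil.common import Toil
from toil.job import Job
from toil.realtimeLogger import RealtimeLogger

class LoggingJob(Job):
    def run(self, fileStore):
        RealtimeLogger.info('This should be logged at info level')
        RealtimeLogger.debug('This should be logged at debug level')

if __name__ == '__main__':
    options = Job.Runner.getDefaultOptions('./jobstore')
    options.realTimeLogging = True  # assumption: enables real-time log relay
    with Toil(options) as workflow:
        workflow.start(LoggingJob())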
Example 12
    def call_with_docker(self, job, args, work_dir, outfile, errfile,
                         check_output, tool_name):
        """
        
        Thin wrapper for docker_call that will use internal lookup to
        figure out the location of the docker file.  Only exposes docker_call
        parameters used so far.  expect args as list of lists.  if (toplevel)
        list has size > 1, then piping interface used
        
        Does support redirecting output to outfile, unless check_output is
        used, in which case output is captured.
        
        """

        RealtimeLogger.info(
            truncate_msg("Docker Run: {}".format(" | ".join(" ".join(x)
                                                            for x in args))))
        start_time = timeit.default_timer()

        # we use the first argument to look up the tool in the docker map
        # but allow overriding of this with the tool_name parameter
        name = tool_name if tool_name is not None else args[0][0]
        tool = self.docker_tool_map[name]

        # We keep an environment dict
        environment = {}

        # And an entry point override
        entrypoint = None

        # And a volumes dict for mounting
        volumes = {}

        # And a working directory override
        working_dir = None

        # Setting TMPDIR breaks Rscript. TODO: investigate how general this actually is.
        if name != 'Rscript':
            # vg uses TMPDIR for temporary files
            # this is particularly important for gcsa, which makes massive files.
            # we will default to keeping these in our working directory
            environment['TMPDIR'] = '.'

        if name == 'Rscript':
            # The R Docker images sometimes default to installing packages
            # into non-writable directories.
            # Make sure a writable directory that exists is used.
            environment['R_LIBS'] = '/tmp'

        if name == 'vg':
            environment['VG_FULL_TRACEBACK'] = '1'

        # Ugly hack for Platypus, as the default container doesn't have the
        # executable on its PATH
        if tool == 'quay.io/biocontainers/platypus-variant:0.8.1.1--htslib1.7_1' and \
           args[0][0] == 'Platypus.py':
            args[0][0] = '/usr/local/share/platypus-variant-0.8.1.1-1/Platypus.py'

        # Force all dockers to run sort in a consistent way
        environment['LC_ALL'] = 'C'

        # set our working directory map
        if work_dir is not None:
            volumes[os.path.abspath(work_dir)] = {
                'bind': '/data',
                'mode': 'rw'
            }
            working_dir = '/data'

        if outfile is not None:
            # We need to send output to a file object

            assert not check_output

            # We can't just redirect stdout of the container from the API, so
            # we do something more complicated.

            # Now we need to populate an FD that spits out the container output.
            output_fd = None

            # We may be able to use a FIFO, or we may need a network connection.
            # FIFO sharing between host and container only works on Linux.
            use_fifo = (platform.system() == 'Linux')

            if use_fifo:
                # On a Linux host we can just use a FIFO from the container to the host

                # Set up a FIFO to receive it
                fifo_dir = tempfile.mkdtemp()
                fifo_host_path = os.path.join(fifo_dir, 'stdout.fifo')
                os.mkfifo(fifo_host_path)

                # Mount the FIFO in the container.
                # The container doesn't actually have to have the mountpoint directory in its filesystem.
                volumes[fifo_dir] = {'bind': '/control', 'mode': 'rw'}

                # Redirect the command output by tacking on another pipeline stage
                parameters = args + [['dd', 'of=/control/stdout.fifo']]

                # Open the FIFO into nonblocking mode. See
                # <https://stackoverflow.com/a/5749687> and
                # <http://shallowsky.com/blog/programming/python-read-characters.html>
                output_fd = os.open(fifo_host_path,
                                    os.O_RDONLY | os.O_NONBLOCK)

            else:
                # On a Mac host we can't because of https://github.com/docker/for-mac/issues/483
                # We need to go over the network instead.

                # Open an IPv4 TCP socket, since we know Docker uses IPv4 only
                listen_sock = socket.socket(socket.AF_INET)
                # Bind it to an OS-selected port on all interfaces, since we can't determine the Docker interface
                # TODO: socket.INADDR_ANY ought to work here but is rejected for being an int.
                listen_sock.bind(('', 0))

                # Start listening
                listen_sock.listen(1)

                # Get the port we got given
                listen_port = listen_sock.getsockname()[1]

                # Generate a random security cookie. Since we can't really stop
                # Internet randos from connecting to our socket, we bail out on
                # any connection that doesn't start with this cookie and a newline.
                security_cookie = str(uuid.uuid4())

                # Redirect the command output to that port using Bash networking
                # Your Docker needs to be 18.03+ to support host.docker.internal
                # Your container needs to have bash with networking support
                parameters = args + [[
                    'bash', '-c',
                    'exec 3<>/dev/tcp/host.docker.internal/{}; cat <(echo {}) - >&3'
                    .format(listen_port, security_cookie)
                ]]

                RealtimeLogger.debug(
                    "Listening on port {} for output from Docker container".
                    format(listen_port))

                # We can't populate the FD until we accept, which we can't do
                # until the Docker comes up and is trying to connect.

            RealtimeLogger.debug("Final Docker command: {}".format(" | ".join(
                " ".join(x) for x in parameters)))

            # Start the container detached so we don't wait on it
            container = apiDockerCall(job,
                                      tool,
                                      parameters,
                                      volumes=volumes,
                                      working_dir=working_dir,
                                      entrypoint=entrypoint,
                                      environment=environment,
                                      detach=True)

            RealtimeLogger.debug("Asked for container {}".format(container.id))

            if not use_fifo:
                # Try and accept a connection from the container.
                # Make sure there's a timeout so we don't accept forever
                listen_sock.settimeout(10)

                for attempt in range(3):

                    connection_sock, remote_address = listen_sock.accept()

                    RealtimeLogger.info(
                        "Got connection from {}".format(remote_address))

                    # Set a 10 second timeout for the cookie
                    connection_sock.settimeout(10)

                    # Check the security cookie
                    received_cookie_and_newline = connection_sock.recv(
                        len(security_cookie) + 1)

                    if received_cookie_and_newline != security_cookie + "\n":
                        # Incorrect security cookie.
                        RealtimeLogger.warning(
                            "Received incorect security cookie message from {}"
                            .format(remote_address))
                        continue
                    else:
                        # This is the container we are looking for
                        # Go into nonblocking mode, which our read code expects
                        connection_sock.setblocking(False)
                        # Set the FD
                        output_fd = connection_sock.fileno()
                        break

                if output_fd is None:
                    # We can't get ahold of the Docker in time
                    raise RuntimeError(
                        "Could not establish network connection for Docker output!"
                    )

            # If the Docker container goes badly enough, it may not even open
            # the other end of the connection. So we can't just wait for it to
            # EOF before checking on the Docker.

            # Now read ought to throw if there is no data. But
            # <https://stackoverflow.com/q/38843278> and some testing suggest
            # that this doesn't happen, and it just looks like EOF. So we will
            # watch out for that.

            try:
                # Prevent leaking FDs

                # If this is set, and there is no data in the pipe, decide that no data is coming
                last_chance = False
                # If this is set, we have seen data in the pipe, so the other
                # end must have opened it and will eventually close it if it
                # doesn't run forever.
                saw_data = False

                while True:
                    # While there still might be data in the pipe

                    # Select on the pipe with a timeout, so we don't spin
                    # constantly waiting for data. output_fd is guaranteed to
                    # be set by this point, or we would have raised above.
                    can_read, can_write, had_error = select.select(
                        [output_fd], [], [output_fd], 10)

                    if len(can_read) > 0 or len(had_error) > 0:
                        # There is data available or something else weird about our FIFO.

                        try:
                            # Do a nonblocking read. Since we checked with select we never should get "" unless there's an EOF.
                            data = os.read(output_fd, 4096)

                            if data == "":
                                # We didn't throw and we got nothing, so it must be EOF.
                                RealtimeLogger.debug("Got EOF")
                                break

                        except OSError as err:
                            if err.errno in [errno.EAGAIN, errno.EWOULDBLOCK]:
                                # There is no data right now
                                data = None
                            else:
                                # Something else has gone wrong
                                raise err

                    else:
                        # There is no data available. Don't even try to read. Treat it as if a read refused to block.
                        data = None

                    if data is not None:
                        # Send our data to the outfile
                        outfile.write(data)
                        saw_data = True
                    elif not saw_data:
                        # We timed out and there has never been any data. Maybe the container has died/never started?

                        if last_chance:
                            # The container has been dead for a while and nothing has arrived yet. Assume no data is coming.
                            RealtimeLogger.warning(
                                "Giving up on output form container {}".format(
                                    container.id))
                            break

                        # Otherwise, check on it
                        container.reload()

                        if container.status not in [
                                'created', 'restarting', 'running', 'removing'
                        ]:
                            # The container has stopped. So what are we doing waiting around for it?

                            # Wait one last time for any lingering data to percolate through the FIFO
                            time.sleep(10)
                            last_chance = True
                            continue

            finally:
                # No matter what happens, close our end of the connection
                os.close(output_fd)

                if not use_fifo:
                    # Also close the listening socket
                    listen_sock.close()

            # Now our data is all sent.
            # Wait on the container and get its return code.
            return_code = container.wait()

            if use_fifo:
                # Clean up the FIFO files
                os.unlink(fifo_host_path)
                os.rmdir(fifo_dir)

        else:
            # No piping needed.

            if len(args) == 1:
                # split off first argument as entrypoint (so we can be oblivious as to whether
                # that happens by default)
                parameters = [] if len(args[0]) == 1 else args[0][1:]
                entrypoint = args[0][0]
            else:
                # can leave as is for piped interface which takes list of args lists
                # and doesn't worry about entrypoints since everything goes through bash -c
                # todo: check we have a bash entrypoint!
                parameters = args

            # Run the container and dump the logs if it fails.
            container = apiDockerCall(job,
                                      tool,
                                      parameters,
                                      volumes=volumes,
                                      working_dir=working_dir,
                                      entrypoint=entrypoint,
                                      environment=environment,
                                      detach=True)

            # Wait on the container and get its return code.
            return_code = container.wait()

        # When we get here, the container has been run, and stdout is either in the file object we sent it to or in the Docker logs.
        # stderr is always in the Docker logs.

        if return_code != 0:
            # What were we doing?
            command = " | ".join(" ".join(x) for x in args)

            # Dump logs
            RealtimeLogger.error(
                "Docker container for command {} failed with code {}".format(
                    command, return_code))
            RealtimeLogger.error("Dumping stderr...")
            for line in container.logs(stderr=True, stdout=False, stream=True):
                # Trim trailing \n
                RealtimeLogger.error(line[:-1])

            if not check_output and outfile is None:
                # Dump stdout as well, since it's not something the caller wanted as data
                RealtimeLogger.error("Dumping stdout...")
                for line in container.logs(stderr=False,
                                           stdout=True,
                                           stream=True):
                    # Trim trailing \n
                    RealtimeLogger.error(line[:-1])

            # Raise an error since the run was not a success
            raise RuntimeError(
                "Docker container for command {} failed with code {}".format(
                    command, return_code))
        elif errfile:
            # user wants stderr even if no crash
            for line in container.logs(stderr=True, stdout=False, stream=True):
                errfile.write(line)

        if check_output:
            # We need to collect the output. We grab it from Docker's handy on-disk buffer.
            # TODO: Bad Things can happen if the container logs too much.
            captured_stdout = container.logs(stderr=False, stdout=True)

        end_time = timeit.default_timer()
        run_time = end_time - start_time
        RealtimeLogger.info("Successfully docker ran {} in {} seconds.".format(
            " | ".join(" ".join(x) for x in args), run_time))

        if outfile:
            outfile.flush()
            os.fsync(outfile.fileno())

        if check_output:
            return captured_stdout
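The FIFO branch is the heart of the output redirection: mount a named pipe into the container, append a dd stage that writes to it, and drain it host-side with select plus nonblocking reads. A compressed sketch of just the drain loop, ignoring the never-opened-writer case the full code handles with its saw_data and last_chance flags:

import errno
import os
import select

def drain_fifo(fifo_path, sink):
    # O_NONBLOCK so open() returns even before any writer shows up
    fd = os.open(fifo_path, os.O_RDONLY | os.O_NONBLOCK)
    try:
        while True:
            can_read, _, had_error = select.select([fd], [], [fd], 10)
            if not can_read and not had_error:
                continue  # timeout; the full code checks container health here
            try:
                data = os.read(fd, 4096)
            except OSError as err:
                if err.errno in (errno.EAGAIN, errno.EWOULDBLOCK):
                    continue  # no data right now
                raise
            if not data:
                break  # EOF: the writer closed its end
            sink.write(data)
    finally:
        os.close(fd)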