Code Example #1
File: iostore.py  Project: sailfish009/molmimic
    def new_function(*args, **kwargs):
        last_error = None
        # Call backoff_times, overriding parameters with values from kwargs
        for delay in backoff_times(retries=kwargs.get("retries", retries),
                                   base_delay=kwargs.get(
                                       "base_delay", base_delay)):
            # Keep looping until it works or our iterator raises a
            # BackoffError
            if delay > 0:
                # We have to wait before trying again
                RealtimeLogger.error("Retry after {} seconds".format(delay))
                time.sleep(delay)
            try:
                return original_function(*args, **kwargs)
            except Exception as e:
                # In Python 3 the name bound by "except ... as" is deleted
                # when the block exits, so stash the error explicitly.
                last_error = e
                # Report the formatted underlying exception with traceback
                RealtimeLogger.error("{} failed due to: {}".format(
                    original_function.__name__,
                    "".join(traceback.format_exception(*sys.exc_info()))))

        # If we get here, the function we're calling never succeeded before
        # we ran out of backoff times. Give an error.
        if last_error is None:
            raise BackoffError("Ran out of retries calling {}".format(
                original_function.__name__))
        else:
            raise last_error
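
The decorator above depends on a backoff_times generator that is not shown. A minimal sketch of what such a generator might look like, assuming exponential backoff with jitter (the parameter names retries and base_delay match the call above; the doubling schedule and the jitter are assumptions). The comments above suggest the real generator may raise BackoffError when exhausted; this sketch simply stops, which the code after the loop also handles:

import random

def backoff_times(retries=3, base_delay=2):
    """Yield the delay (in seconds) to wait before each attempt."""
    # First attempt happens immediately
    yield 0
    delay = base_delay
    for _ in range(retries):
        # Jitter so concurrent workers don't retry in lockstep
        yield delay * (0.5 + random.random())
        delay *= 2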
Code Example #2
File: iostore.py  Project: glennhickey/toil-vg
    def read_input_file(self, input_path, local_path):
        """
        Get input from the filesystem.
        """

        RealtimeLogger.debug("Loading {} from FileIOStore in {} to {}".format(
            input_path, self.path_prefix, local_path))

        if os.path.exists(local_path):
            # Try deleting the existing item if it already exists
            try:
                os.unlink(local_path)
            except OSError:
                # Don't fail here; fail on the assertion below, which
                # will be more informative.
                pass

        # Make sure the path is clear for copying
        assert (not os.path.exists(local_path))

        # Where is the file actually?
        real_path = os.path.abspath(os.path.join(self.path_prefix, input_path))

        if not os.path.exists(real_path):
            RealtimeLogger.error(
                "Can't find {} from FileIOStore in {}!".format(
                    input_path, self.path_prefix))
            raise RuntimeError("File {} missing!".format(real_path))

        # Make a temporary file
        temp_handle, temp_path = tempfile.mkstemp(
            dir=os.path.dirname(local_path))
        os.close(temp_handle)

        # Copy to the temp file
        shutil.copy2(real_path, temp_path)

        # Rename the temp file to the right place, atomically
        RealtimeLogger.info("rename {} -> {}".format(temp_path, local_path))
        os.rename(temp_path, local_path)

        # Look at the file stats
        file_stats = os.stat(real_path)

        if (file_stats.st_uid == os.getuid()
                and file_stats.st_mode & stat.S_IWUSR):
            # We own this file and can write to it. We don't want the user
            # script messing it up through the symlink.

            try:
                # Clear the user write bit, so the user can't accidentally
                # clobber the file in the actual store through the symlink.
                os.chmod(real_path, file_stats.st_mode ^ stat.S_IWUSR)
            except OSError:
                # If something goes wrong here (like us not having permission to
                # change permissions), ignore it.
                pass
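
The temp-file-then-rename dance above is a standard idiom for atomically replacing a file: within one filesystem, os.rename is atomic on POSIX, so readers never observe a half-written file. A minimal standalone sketch of the same idiom (the function name atomic_copy is illustrative):

import os
import shutil
import tempfile

def atomic_copy(src, dst):
    """Copy src to dst so that dst is never seen half-written."""
    # mkstemp in the destination directory keeps the rename on one
    # filesystem, which is what makes it atomic.
    handle, temp_path = tempfile.mkstemp(dir=os.path.dirname(os.path.abspath(dst)))
    os.close(handle)
    try:
        shutil.copy2(src, temp_path)
        os.rename(temp_path, dst)  # atomic on POSIX within one filesystem
    except BaseException:
        # Clean up the temp file if anything went wrong
        if os.path.exists(temp_path):
            os.unlink(temp_path)
        raise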
Code Example #3
    def _runDeferredFunction(self, deferredFunction):
        """
        Run a deferred function (either our own or someone else's).

        Reports an error if it fails.
        """

        try:
            deferredFunction.invoke()
        except Exception as err:
            # Report this in real time, if enabled. Otherwise the only place it ends up is the worker log.
            RealtimeLogger.error("Failed to run deferred function %s: %s", repr(deferredFunction), str(err))
        except BaseException:
            RealtimeLogger.error("Failed to run deferred function %s", repr(deferredFunction))
Code Example #4
    def handle_reject(self, job, err):
        """
        Handle promise rejection.
        """

        Logger.error("Promise Rejected: {}".format(err))
        RealtimeLogger.error("Promise Rejected: {}".format(err))

        self.err = err

        # TODO: implement rejection handling.
        # Check if we have any reject handlers; if so, call them.
        # If not, raise an error that stops the workflow.
        raise err
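
The TODO above spells out the intended behavior in comments. A hedged sketch of what the finished logic might look like, written as a standalone function; the names promise and reject_handlers are illustrative, not from the original:

def handle_reject_sketch(promise, err, reject_handlers):
    """Hypothetical completion of the TODO: notify handlers, else fail."""
    promise.err = err
    if reject_handlers:
        # Let each registered handler react to the failure
        for handler in reject_handlers:
            handler(err)
    else:
        # No handler registered: re-raise so the workflow stops
        raise err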
Code Example #5
    def call_with_docker(self, job, args, work_dir, outfile, errfile,
                         check_output, tool_name):
        """
        
        Thin wrapper for docker_call that will use internal lookup to
        figure out the location of the docker file.  Only exposes docker_call
        parameters used so far.  expect args as list of lists.  if (toplevel)
        list has size > 1, then piping interface used
        
        Does support redirecting output to outfile, unless check_output is
        used, in which case output is captured.
        
        """

        RealtimeLogger.info(
            truncate_msg("Docker Run: {}".format(" | ".join(" ".join(x)
                                                            for x in args))))
        start_time = timeit.default_timer()

        # we use the first argument to look up the tool in the docker map
        # but allow overriding of this with the tool_name parameter
        name = tool_name if tool_name is not None else args[0][0]
        tool = self.docker_tool_map[name]

        # We keep an environment dict
        environment = {}

        # And an entry point override
        entrypoint = None

        # And a volumes dict for mounting
        volumes = {}

        # And a working directory override
        working_dir = None

        # Setting TMPDIR breaks Rscript. TODO: investigate how general this actually is.
        if name != 'Rscript':
            # vg uses TMPDIR for temporary files.
            # This is particularly important for gcsa, which makes massive files;
            # we default to keeping these in our working directory.
            environment['TMPDIR'] = '.'

        if name == 'Rscript':
            # The R dockers by default want to install packages in non-writable directories. Sometimes.
            # Make sure a writable directory which exists is used.
            environment['R_LIBS'] = '/tmp'

        if name == 'vg':
            environment['VG_FULL_TRACEBACK'] = '1'

        # Ugly hack for Platypus, as the default container doesn't have the executable on its PATH
        if tool == 'quay.io/biocontainers/platypus-variant:0.8.1.1--htslib1.7_1' and \
           args[0][0] == 'Platypus.py':
            args[0][0] = '/usr/local/share/platypus-variant-0.8.1.1-1/Platypus.py'

        # Force all dockers to run sort in a consistent way
        environment['LC_ALL'] = 'C'

        # set our working directory map
        if work_dir is not None:
            volumes[os.path.abspath(work_dir)] = {
                'bind': '/data',
                'mode': 'rw'
            }
            working_dir = '/data'

        if outfile is not None:
            # We need to send output to a file object

            assert (not check_output)

            # We can't just redirect stdout of the container from the API, so
            # we do something more complicated.

            # Now we need to populate an FD that spits out the container output.
            output_fd = None

            # We may be able to use a FIFO, or we may need a network connection.
            # FIFO sharing between host and container only works on Linux.
            use_fifo = (platform.system() == 'Linux')

            if use_fifo:
                # On a Linux host we can just use a FIFO from the container to the host

                # Set up a FIFO to receive it
                fifo_dir = tempfile.mkdtemp()
                fifo_host_path = os.path.join(fifo_dir, 'stdout.fifo')
                os.mkfifo(fifo_host_path)

                # Mount the FIFO in the container.
                # The container doesn't actually have to have the mountpoint directory in its filesystem.
                volumes[fifo_dir] = {'bind': '/control', 'mode': 'rw'}

                # Redirect the command output by tacking on another pipeline stage
                parameters = args + [['dd', 'of=/control/stdout.fifo']]

                # Open the FIFO into nonblocking mode. See
                # <https://stackoverflow.com/a/5749687> and
                # <http://shallowsky.com/blog/programming/python-read-characters.html>
                output_fd = os.open(fifo_host_path,
                                    os.O_RDONLY | os.O_NONBLOCK)

            else:
                # On a Mac host we can't because of https://github.com/docker/for-mac/issues/483
                # We need to go over the network instead.

                # Open an IPv4 TCP socket, since we know Docker uses IPv4 only
                listen_sock = socket.socket(socket.AF_INET)
                # Bind it to an OS-selected port on all interfaces, since we can't determine the Docker interface
                # TODO: socket.INADDR_ANY ought to work here but is rejected for being an int.
                listen_sock.bind(('', 0))

                # Start listening
                listen_sock.listen(1)

                # Get the port we got given
                listen_port = listen_sock.getsockname()[1]

                # Generate a random security cookie. Since we can't really stop
                # Internet randos from connecting to our socket, we bail out on
                # any connection that doesn't start with this cookie and a newline.
                security_cookie = str(uuid.uuid4())

                # Redirect the command output to that port using Bash networking
                # Your Docker needs to be 18.03+ to support host.docker.internal
                # Your container needs to have bash with networking support
                parameters = args + [[
                    'bash', '-c',
                    'exec 3<>/dev/tcp/host.docker.internal/{}; cat <(echo {}) - >&3'
                    .format(listen_port, security_cookie)
                ]]

                RealtimeLogger.debug(
                    "Listening on port {} for output from Docker container".
                    format(listen_port))

                # We can't populate the FD until we accept, which we can't do
                # until the Docker comes up and is trying to connect.

            RealtimeLogger.debug("Final Docker command: {}".format(" | ".join(
                " ".join(x) for x in parameters)))

            # Start the container detached so we don't wait on it
            container = apiDockerCall(job,
                                      tool,
                                      parameters,
                                      volumes=volumes,
                                      working_dir=working_dir,
                                      entrypoint=entrypoint,
                                      environment=environment,
                                      detach=True)

            RealtimeLogger.debug("Asked for container {}".format(container.id))

            if not use_fifo:
                # Try and accept a connection from the container.
                # Make sure there's a timeout so we don't accept forever
                listen_sock.settimeout(10)

                for attempt in range(3):

                    connection_sock, remote_address = listen_sock.accept()

                    RealtimeLogger.info(
                        "Got connection from {}".format(remote_address))

                    # Set a 10 second timeout for the cookie
                    connection_sock.settimeout(10)

                    # Check the security cookie. recv() returns bytes, so
                    # encode the expected cookie before comparing.
                    received_cookie_and_newline = connection_sock.recv(
                        len(security_cookie) + 1)

                    if received_cookie_and_newline != (security_cookie + "\n").encode():
                        # Incorrect security cookie.
                        RealtimeLogger.warning(
                            "Received incorrect security cookie message from {}"
                            .format(remote_address))
                        continue
                    else:
                        # This is the container we are looking for.
                        # Go into nonblocking mode, which our read loop expects.
                        connection_sock.setblocking(False)
                        # Set the FD
                        output_fd = connection_sock.fileno()
                        break

                if output_fd is None:
                    # We can't get ahold of the Docker in time
                    raise RuntimeError(
                        "Could not establish network connection for Docker output!"
                    )

            # If the Docker container goes badly enough, it may not even open
            # the other end of the connection. So we can't just wait for it to
            # EOF before checking on the Docker.

            # Now read ought to throw if there is no data. But
            # <https://stackoverflow.com/q/38843278> and some testing suggest
            # that this doesn't happen, and it just looks like EOF. So we will
            # watch out for that.

            try:
                # Prevent leaking FDs

                # If this is set, and there is no data in the pipe, decide that no data is coming
                last_chance = False
                # If this is set, we have seen data in the pipe, so the other
                # end must have opened it and will eventually close it if it
                # doesn't run forever.
                saw_data = False

                while True:
                    # While there still might be data in the pipe

                    # Select on the FD with a timeout, so we don't spin
                    # constantly waiting for data. By this point output_fd is
                    # guaranteed to be set (we raised earlier otherwise).
                    can_read, can_write, had_error = select.select(
                        [output_fd], [], [output_fd], 10)

                    if len(can_read) > 0 or len(had_error) > 0:
                        # There is data available or something else weird about our FIFO.

                        try:
                            # Do a nonblocking read. Since we checked with select,
                            # we should never get b"" unless there's an EOF.
                            data = os.read(output_fd, 4096)

                            if data == b"":
                                # We didn't throw and we got nothing, so it must be EOF.
                                RealtimeLogger.debug("Got EOF")
                                break

                        except OSError as err:
                            if err.errno in [errno.EAGAIN, errno.EWOULDBLOCK]:
                                # There is no data right now
                                data = None
                            else:
                                # Something else has gone wrong
                                raise err

                    else:
                        # There is no data available. Don't even try to read. Treat it as if a read refused to block.
                        data = None

                    if data is not None:
                        # Send our data to the outfile
                        outfile.write(data)
                        saw_data = True
                    elif not saw_data:
                        # We timed out and there has never been any data. Maybe the container has died/never started?

                        if last_chance:
                            # The container has been dead for a while and nothing has arrived yet. Assume no data is coming.
                            RealtimeLogger.warning(
                                "Giving up on output from container {}".format(
                                    container.id))
                            break

                        # Otherwise, check on it
                        container.reload()

                        if container.status not in [
                                'created', 'restarting', 'running', 'removing'
                        ]:
                            # The container has stopped. So what are we doing waiting around for it?

                            # Wait one last time for any lingering data to percolate through the FIFO
                            time.sleep(10)
                            last_chance = True
                            continue

            finally:
                # No matter what happens, close our end of the connection
                os.close(output_fd)

                if not use_fifo:
                    # Also close the listening socket
                    listen_sock.close()

            # Now our data is all sent.
            # Wait on the container and get its return code.
            return_code = container.wait()

            if use_fifo:
                # Clean up the FIFO files
                os.unlink(fifo_host_path)
                os.rmdir(fifo_dir)

        else:
            # No piping needed.

            if len(args) == 1:
                # split off first argument as entrypoint (so we can be oblivious as to whether
                # that happens by default)
                parameters = [] if len(args[0]) == 1 else args[0][1:]
                entrypoint = args[0][0]
            else:
                # can leave as is for piped interface which takes list of args lists
                # and doesn't worry about entrypoints since everything goes through bash -c
                # todo: check we have a bash entrypoint!
                parameters = args

            # Run the container and dump the logs if it fails.
            container = apiDockerCall(job,
                                      tool,
                                      parameters,
                                      volumes=volumes,
                                      working_dir=working_dir,
                                      entrypoint=entrypoint,
                                      environment=environment,
                                      detach=True)

            # Wait on the container and get its return code.
            return_code = container.wait()

        # When we get here, the container has been run, and stdout is either in the file object we sent it to or in the Docker logs.
        # stderr is always in the Docker logs.

        if return_code != 0:
            # What were we doing?
            command = " | ".join(" ".join(x) for x in args)

            # Dump logs
            RealtimeLogger.error(
                "Docker container for command {} failed with code {}".format(
                    command, return_code))
            RealtimeLogger.error("Dumping stderr...")
            for line in container.logs(stderr=True, stdout=False, stream=True):
                # Trim trailing \n
                RealtimeLogger.error(line[:-1])

            if not check_output and outfile is None:
                # Dump stdout as well, since it's not something the caller wanted as data
                RealtimeLogger.error("Dumping stdout...")
                for line in container.logs(stderr=False,
                                           stdout=True,
                                           stream=True):
                    # Trim trailing \n
                    RealtimeLogger.error(line[:-1])

            # Raise an error since the command did not succeed
            raise RuntimeError(
                "Docker container for command {} failed with code {}".format(
                    command, return_code))
        elif errfile:
            # user wants stderr even if no crash
            for line in container.logs(stderr=True, stdout=False, stream=True):
                errfile.write(line)

        if check_output:
            # We need to collect the output. We grab it from Docker's handy on-disk buffer.
            # TODO: Bad Things can happen if the container logs too much.
            captured_stdout = container.logs(stderr=False, stdout=True)

        end_time = timeit.default_timer()
        run_time = end_time - start_time
        RealtimeLogger.info("Successfully docker ran {} in {} seconds.".format(
            " | ".join(" ".join(x) for x in args), run_time))

        if outfile:
            outfile.flush()
            os.fsync(outfile.fileno())

        if check_output is True:
            return captured_stdout
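
Stripped of the Docker machinery, the Linux branch above boils down to: create a FIFO, have the child process write into it, open the host end nonblocking, and drain it with select until EOF. A minimal self-contained sketch of that capture loop, with a plain subprocess standing in for the container (assumes a Linux host; capture_via_fifo is an illustrative name):

import os
import select
import subprocess
import tempfile
import time

def capture_via_fifo(command, outfile):
    """Run a shell command with stdout routed through a FIFO into outfile."""
    fifo_dir = tempfile.mkdtemp()
    fifo_path = os.path.join(fifo_dir, 'stdout.fifo')
    os.mkfifo(fifo_path)
    try:
        # The child writes into the FIFO, like the appended 'dd' stage above
        proc = subprocess.Popen('{} > {}'.format(command, fifo_path), shell=True)
        # Open our end nonblocking so a dead child can't hang us forever
        fd = os.open(fifo_path, os.O_RDONLY | os.O_NONBLOCK)
        try:
            saw_data = False
            while True:
                # Wait up to 10s for data, like the select() in the original
                can_read, _, had_error = select.select([fd], [], [fd], 10)
                if can_read or had_error:
                    data = os.read(fd, 4096)
                    if data:
                        outfile.write(data)
                        saw_data = True
                        continue
                    if saw_data:
                        # Real EOF: the writer attached, wrote, and closed
                        break
                # No data. An empty read before the writer attaches also looks
                # like EOF, so only give up once the child has exited.
                if proc.poll() is not None:
                    break
                time.sleep(0.1)  # don't busy-spin while the writer attaches
        finally:
            os.close(fd)
        proc.wait()
    finally:
        os.unlink(fifo_path)
        os.rmdir(fifo_dir)

For example, with open('out.bin', 'wb') as f: capture_via_fifo('echo hello', f) drains the child's output into out.bin.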
Code Example #6
def run_calleval_plots(job, context, names, eval_results_dict, plot_sets):
    """
    
    Make and output calleval ROC plots.
    
    Takes a "names" list of all conditions. Condition names in the list (or in
    plot_sets) do not include an "-unclipped" tag; both clipped and unclipped
    plots are made if the data is available.
    
    Takes a nested dict by condition name, then clipping status ("clipped",
    "unclipped"), and then variant type ("snp", "non_snp", "weighted").
    Leaf entries are ROC data file IDs (.tsv.gz).
    
    plot_sets is a data structure of collections of conditions to plot against
    each other, as produced by parse_plot_sets.
    
    Returns a list of created plot file IDs.
    
    """
    
    # make some roc plots
    roc_plot_ids = []
    for roc_type in ['snp', 'non_snp', 'weighted']:
        # For each type of ROC to plot
    
        for mode in ['unclipped', 'clipped']:
            # For each clipping mode

            # What kind of ROC plot is it?
            # It should be this unless there is a clipped mode for this ROC and we are the unclipped one.
            roc_kind = roc_type
            if mode == 'unclipped' and any('clipped' in eval_result for eval_result in eval_results_dict.values()):
                # We are the unclipped mode and there will be a clipped mode
                roc_kind += '-unclipped'
            
            # Get all the eval results for this mode, by condition name
            mode_results = {name: eval_result.get(mode) for name, eval_result in eval_results_dict.items()}
            
            if None in mode_results.values():
                # We can't do this mode since it wasn't run
                continue
                
            # Extract out all the stats file IDs for this ROC type, by
            # condition name. This dict is implicitly for the clipped or
            # unclipped mode, depending on which mode we are doing. The
            # condition name keys don't have clipping tags.
            roc_table_ids = {name: result.get(roc_type) for name, result in mode_results.items()}
            
            for subset_number, plot_set in enumerate(plot_sets):
                # For each plot set specifier
                
                # Unpack plot_set
                plot_title, plot_conditions = plot_set
                
                if plot_conditions is None:
                    # Use all the conditions instead
                    plot_conditions = names
                    
                # Add the ROC plot kind (snp, clipped, etc.) to the plot title
                if plot_title is None:
                    plot_title = roc_kind
                else:
                    plot_title = '{} ({})'.format(plot_title, roc_kind)
                
                for name in plot_conditions:
                    if name not in names:
                        # Complain if any of the names in the subset aren't conditions that actually ran
                        message = 'Condition {} not found in list of available conditions {}'.format(name, names)
                        RealtimeLogger.error(message)
                        raise RuntimeError(message)
                    if name not in roc_table_ids:
                        # Complain if any of the names in the subset lacks ROC data
                        message = 'Condition {} has no ROC data; data only available for {}'.format(name, list(roc_table_ids.keys()))
                        RealtimeLogger.error(message)
                        raise RuntimeError(message)

                # Make a list of roc table IDs for the subset, in plot_conditions order
                subset_ids = [roc_table_ids[name] for name in plot_conditions]
                
                # Make the plot
                roc_plot_ids.append(job.addChildJobFn(run_vcfeval_roc_plot, context, subset_ids, names=plot_conditions,
                                                      kind=roc_kind, number=subset_number, title=plot_title).rv())
                                                      
    return roc_plot_ids
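
The nested structures this function consumes are easiest to see with a concrete example. All condition names and file IDs below are made up, purely to show the shapes run_calleval_plots expects:

# Conditions that actually ran
names = ['vg-map', 'bwa']

# condition -> clipping mode -> ROC type -> ROC data file ID
eval_results_dict = {
    'vg-map': {
        'clipped':   {'snp': 'id_1', 'non_snp': 'id_2', 'weighted': 'id_3'},
        'unclipped': {'snp': 'id_4', 'non_snp': 'id_5', 'weighted': 'id_6'},
    },
    'bwa': {
        'clipped':   {'snp': 'id_7', 'non_snp': 'id_8', 'weighted': 'id_9'},
        'unclipped': {'snp': 'id_10', 'non_snp': 'id_11', 'weighted': 'id_12'},
    },
}

# (title, conditions) pairs as produced by parse_plot_sets; a None title
# falls back to the ROC kind, and None conditions means "all of them"
plot_sets = [
    ('graph vs. linear', ['vg-map', 'bwa']),
    (None, None),
]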
Code Example #7
def run_bam_caller(job, context, fasta_file_id, bam_file_id, bam_idx_id,
                   sample_name, chrom, offset, out_name, bam_caller, bam_caller_opts):
    """
    Run freebayes or Platypus to make a VCF.
    """

    assert bam_caller in ['freebayes', 'platypus']
    
    # make a local work directory
    work_dir = job.fileStore.getLocalTempDir()

    # download the input
    fasta_path = os.path.join(work_dir, 'ref.fa')
    bam_path = os.path.join(work_dir, 'alignment.bam')
    bam_idx_path = bam_path + '.bai'
    job.fileStore.readGlobalFile(fasta_file_id, fasta_path)
    job.fileStore.readGlobalFile(bam_file_id, bam_path)
    job.fileStore.readGlobalFile(bam_idx_id, bam_idx_path)

    # output
    vcf_path = os.path.join(work_dir, '{}-raw.vcf'.format(out_name))
    
    if bam_caller == 'freebayes':
        fb_cmd = ['freebayes', '-f', os.path.basename(fasta_path), os.path.basename(bam_path)]
        if bam_caller_opts:
            fb_cmd += bam_caller_opts
        if chrom:
            fb_cmd += ['-r', chrom]
        timer = TimeTracker('freebayes')
        with open(vcf_path, 'wb') as out_vcf:
            context.runner.call(job, fb_cmd, work_dir=work_dir, outfile=out_vcf)
        timer.stop()
            
    elif bam_caller == 'platypus':
        context.runner.call(job, ['samtools', 'faidx', 'ref.fa'], work_dir=work_dir)
        plat_cmd = ['Platypus.py', 'callVariants', '--refFile', os.path.basename(fasta_path),
                    '--bamFiles', os.path.basename(bam_path), '-o', os.path.basename(vcf_path)]
        if bam_caller_opts:
            plat_cmd += bam_caller_opts
        if chrom:
            plat_cmd += ['--regions', chrom]
        timer = TimeTracker('Platypus')
        context.runner.call(job, plat_cmd, work_dir=work_dir)
        timer.stop()
    else:
        assert False

    context.write_intermediate_file(job, vcf_path)

    vcf_fix_path = os.path.join(work_dir, '{}.vcf'.format(out_name))
    
    # apply offset and sample name
    vcf_reader = vcf.Reader(open(vcf_path))
    if sample_name:
        # Freebayes always outputs "unknown" for the sample if the BAM
        # doesn't specify a name, and can't be convinced to do otherwise:
        # https://github.com/ekg/freebayes/issues/471
        # So we hack the VCFReader to think the sample names are what we want them to be
        assert(len(vcf_reader.samples) == 1)
        # TODO: asserting that sample 0 is named unknown can fail. Why?
        RealtimeLogger.info('Correcting Freebayes samples {} to [{}]'.format(vcf_reader.samples, sample_name))
        vcf_reader.samples = [sample_name]
        # Rebuild the secret sample index
        vcf_reader._sample_indexes = {sample_name: 0}
    vcf_writer = vcf.Writer(open(vcf_fix_path, 'w'), vcf_reader)
    
    # We insist that the caller produce at least one record, or we will abort and dump files.
    have_records = False
    for record in vcf_reader:
        have_records = True
        if offset:
            record.POS += int(offset)
        vcf_writer.write_record(record)
    vcf_writer.flush()
    vcf_writer.close()
    
    if not have_records:
        # This is a problem. Stop here for debugging.
        RealtimeLogger.error('{} produced no calls. Dumping files...'.format(bam_caller))
        for dump_path in [fasta_path, bam_path, bam_idx_path]:
            if dump_path and os.path.isfile(dump_path):
                context.write_output_file(job, dump_path)
        raise RuntimeError('{} produced no calls'.format(bam_caller))

    context.runner.call(job, ['bgzip', os.path.basename(vcf_fix_path)], work_dir=work_dir)
    context.runner.call(job, ['tabix', '-p', 'vcf', os.path.basename(vcf_fix_path) + '.gz'], work_dir=work_dir)

    return (context.write_output_file(job, vcf_fix_path + '.gz'),
            context.write_output_file(job, vcf_fix_path + '.gz.tbi'),
            timer)
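
The PyVCF sample-renaming hack above (overwriting samples and the private _sample_indexes dict) also works outside the Toil job. A minimal sketch, assuming a single-sample VCF on disk; the function name, paths, and offset use are illustrative:

import vcf  # PyVCF

def rename_sample_and_shift(in_path, out_path, sample_name, offset=0):
    """Rewrite a single-sample VCF with a new sample name and shifted POS."""
    reader = vcf.Reader(open(in_path))
    assert len(reader.samples) == 1
    # Hack the reader so the Writer template emits the desired sample name
    reader.samples = [sample_name]
    reader._sample_indexes = {sample_name: 0}
    writer = vcf.Writer(open(out_path, 'w'), reader)
    for record in reader:
        record.POS += offset  # e.g. to place a chunk back on the whole chromosome
        writer.write_record(record)
    writer.close()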