Example #1
def parse_samples(path_to_manifest):
    """
    Parses samples, specified either in a manifest or listed with --samples

    :param str path_to_manifest: Path to manifest file
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """
    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            if line.isspace() or line.startswith('#'):
                continue
            sample = line.strip().split('\t')
            if len(sample) != 2:
                raise UserError(
                    'Bad manifest format! Expected 2 tab separated columns, got: {}'
                    .format(sample))

            # If a directory is passed in, use all samples in that directory
            uuid, url = sample
            if urlparse(url).scheme == '':
                url = [
                    'file://' + os.path.join(url, x) for x in os.listdir(url)
                ]
            # If url is a tarball
            elif url.endswith('tar.gz') or url.endswith('tar'):
                require(
                    urlparse(url).scheme in SCHEMES,
                    'URL "{}" not valid. Schemes:{}'.format(url, SCHEMES))
                url = [url]
            # If URL is a fastq or series of fastqs
            elif url.endswith('fastq.gz') or url.endswith(
                    'fastq') or url.endswith('fq.gz') or url.endswith('fq'):
                url = url.split(',')
                for x in url:
                    require(
                        urlparse(x).scheme in SCHEMES,
                        'URL "{}" not valid. Schemes:{}'.format(x, SCHEMES))
            else:
                raise UserError(
                    'URL does not have approved extension: .tar.gz, .tar, .fastq.gz, .fastq, .fq.gz, .fq'
                )

            sample = [uuid, url]
            samples.append(sample)
    return samples
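# A usage sketch for the parse_samples above (not from the original source):
# the manifest is a text file of tab-separated "uuid<TAB>url" lines, where the
# url may be a bare directory path, a tarball URL, or a comma-separated list
# of fastq URLs. The schemes shown (file://, s3://) are assumptions; the
# accepted set is whatever the module-level SCHEMES constant contains.
example_manifest_lines = [
    "# lines that are blank or start with '#' are skipped",
    "sample-001\t/data/sample-001/",  # directory: expanded to file:// URLs of its contents
    "sample-002\ts3://my-bucket/sample-002.tar.gz",  # tarball
    "sample-003\tfile:///data/a_R1.fastq.gz,file:///data/a_R2.fastq.gz",  # fastq list
]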
    def _run_toil_marginPhase(self, identifier, partition_size,
                              partition_margin):
        #prep
        jobStore = os.path.join(self.workdir, 'toil-jobstore')
        work_dir = os.path.join(self.workdir, 'toil-workdir')
        os.mkdir(work_dir)

        # run toil
        toil_command = [
            'toil-marginphase', 'run', '--config',
            self._generate_config(partition_size,
                                  partition_margin), '--manifest',
            self._generate_manifest(identifier), '--workDir', work_dir,
            jobStore
        ]
        log.info('Running %r', toil_command)
        subprocess.check_call(toil_command)

        # validate output
        extract_command = ['tar', 'xvf', "{}.tar.gz".format(identifier)]
        subprocess.check_call(extract_command, cwd=self.toil_outputdir)
        output_vcf_name = MarginPhaseTest.OUT_TOIL_VCF_FORMAT.format(
            identifier)
        full_merged_vcf = os.path.join(self.toil_outputdir, output_vcf_name)
        if not os.path.isfile(full_merged_vcf):
            contents = subprocess.check_output(['ls', '-la'],
                                               cwd=self.toil_outputdir)
            raise UserError(
                "toil output vcf '{}' not found in directory '{}' with contents:\n{}"
                .format(output_vcf_name, self.toil_outputdir, contents))

        # save and return
        self.toil_full_merged_vcf = full_merged_vcf
        return full_merged_vcf
Example #3
def parse_samples(config, path_to_manifest):
    """
    Parses samples, specified either in a manifest or listed with --samples

    :param Namespace config: Argparse Namespace object containing pipeline defaults
        (default_contig, default_reference, default_params)
    :param str path_to_manifest: Path to manifest file
    :return: Samples and their attributes as defined in the manifest
    :rtype: list[list]
    """

    samples = []
    with open(path_to_manifest, 'r') as f:
        for line in f.readlines():
            if line.isspace() or line.startswith('#'):
                continue
            sample = line.strip().split('\t')

            # validate structure
            if len(sample) < 2:
                raise UserError(
                    'Bad manifest format! Required at least 2 tab-separated columns, got: {}'
                    .format(sample))
            if len(sample) > 6:
                raise UserError(
                    'Bad manifest format! Required at most 6 tab-separated columns, got: {}'
                    .format(sample))

            # extract sample parts
            uuid = sample[0]
            url = sample[1]
            contig_name, reference_url, params_url = "", "", ""
            if len(sample) > 2: contig_name = sample[2]
            if len(sample) > 3: reference_url = sample[3]
            if len(sample) > 4: params_url = sample[4]

            # fill defaults
            if len(contig_name) == 0: contig_name = config.default_contig
            if len(reference_url) == 0:
                reference_url = config.default_reference
            if len(params_url) == 0: params_url = config.default_params

            sample = [uuid, url, contig_name, reference_url, params_url]
            samples.append(sample)
    return samples
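# A sketch of the column handling above with a hypothetical two-column
# manifest line: missing contig/reference/params columns fall back to
# config.default_contig, config.default_reference and config.default_params.
line = "sample-002\tfile:///data/sample-002.chr20.bam"
fields = line.strip().split('\t')
uuid, url = fields[0], fields[1]
contig_name = fields[2] if len(fields) > 2 else ""
# an empty contig_name is then replaced by config.default_contig, and so on
assert (uuid, url, contig_name) == ("sample-002",
                                    "file:///data/sample-002.chr20.bam", "")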
    def _get_mount_path(self):
        """
        Returns the path of the mount point of the current container. If this method is invoked
        outside of a Docker container a NotInsideContainerError is raised. Likewise if the docker
        daemon is unreachable from inside the container a UserError is raised. This method is
        idempotent.
        """
        if self._mount_path is None:
            name = current_docker_container_id()
            if dockerd_is_reachable():
                # Get name of mounted volume
                blob = json.loads(
                    subprocess.check_output(['docker', 'inspect', name]))
                mounts = blob[0]['Mounts']
                # Ensure docker.sock is mounted correctly
                sock_mnt = [
                    x['Source'] == x['Destination'] for x in mounts
                    if 'docker.sock' in x['Source']
                ]
                require(
                    len(sock_mnt) == 1,
                    'Missing socket mount. Requires the following: '
                    'docker run -v /var/run/docker.sock:/var/run/docker.sock')
                # Ensure formatting of command for 2 mount points
                if len(mounts) == 2:
                    require(
                        all(x['Source'] == x['Destination'] for x in mounts),
                        'Docker Src/Dst mount points, invoked with the -v argument, '
                        'must be the same if only using one mount point aside from the docker '
                        'socket.')
                    work_mount = [
                        x['Source'] for x in mounts
                        if 'docker.sock' not in x['Source']
                    ]
                else:
                    # Ensure only one mirror mount exists aside from docker.sock
                    mirror_mounts = [
                        x['Source'] for x in mounts
                        if x['Source'] == x['Destination']
                    ]
                    work_mount = [
                        x for x in mirror_mounts if 'docker.sock' not in x
                    ]
                    require(
                        len(work_mount) == 1,
                        'Wrong number of mirror mounts provided, see '
                        'documentation.')
                self._mount_path = work_mount[0]
                log.info('The work mount is: %s', self._mount_path)
            else:
                raise UserError(
                    'Docker daemon is not reachable, ensure Docker is being run with: '
                    '"-v /var/run/docker.sock:/var/run/docker.sock" as an argument.'
                )
        return self._mount_path
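# Self-contained sketch (hypothetical paths) of the Mounts data that
# _get_mount_path reads from docker inspect: it expects the docker.sock mount
# plus a work mount, and with more than two mounts the work mount is the one
# mirror mount (Source == Destination) that is not docker.sock.
mounts = [
    {'Source': '/var/run/docker.sock', 'Destination': '/var/run/docker.sock'},
    {'Source': '/home/user/outputs', 'Destination': '/home/user/outputs'},
    {'Source': '/home/user/refs', 'Destination': '/references'},
]
mirror_mounts = [m['Source'] for m in mounts if m['Source'] == m['Destination']]
work_mount = [m for m in mirror_mounts if 'docker.sock' not in m]
assert work_mount == ['/home/user/outputs']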
def require_docker_file_output(job,
                               config,
                               work_dir,
                               output_filenames,
                               function_id,
                               log_filename=None,
                               max_directory_contents=None,
                               max_log_lines=None):
    missing_filenames = list(
        filter(lambda x: not os.path.exists(os.path.join(work_dir, x)),
               output_filenames))
    if len(missing_filenames) > 0:
        # document missing
        log(job, "Missing files after docker call: ", config.uuid, function_id)
        for missing in missing_filenames:
            log(job, "\t{}".format(missing), config.uuid, function_id)

        # document contents
        directory_contents = os.listdir(work_dir)
        total_directory_contents = len(directory_contents)
        if (max_directory_contents is not None
                and total_directory_contents > max_directory_contents):
            directory_contents = directory_contents[0:max_directory_contents]
            directory_contents.append(
                "[{} items total]".format(total_directory_contents))
        log(job, "Current files in work_dir: {}".format(work_dir), config.uuid,
            function_id)
        for missing in directory_contents:
            log(job, "\t{}".format(missing), config.uuid, function_id)

        # document log
        if log_filename is not None:
            log_location = os.path.join(work_dir, log_filename)
            if os.path.isfile(log_location):
                log(job, "Log file contents: {}".format(log_filename),
                    config.uuid, function_id)
                log_lines = 0
                with open(log_location) as log_stream:
                    for ll in log_stream:
                        if max_log_lines is None or log_lines < max_log_lines:
                            log(job, "\t{}".format(ll.rstrip()), config.uuid,
                                function_id)
                        log_lines += 1
                if max_log_lines is not None and log_lines > max_log_lines:
                    log(job, "\t[{} lines total]".format(log_lines),
                        config.uuid, function_id)
            else:
                log(job, "Log file {} was not found".format(log_filename),
                    config.uuid, function_id)

        # die
        raise UserError("Missing files after running {} on {}: {}".format(
            function_id, config.uuid, missing_filenames))
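# Worked sketch of the directory-listing truncation above (hypothetical
# filenames): only the first max_directory_contents entries are logged, plus a
# note carrying the true total.
directory_contents = ['a.vcf', 'b.sam', 'c.log', 'd.tmp']
max_directory_contents = 2
total = len(directory_contents)
if max_directory_contents is not None and total > max_directory_contents:
    directory_contents = directory_contents[0:max_directory_contents]
    directory_contents.append("[{} items total]".format(total))
assert directory_contents == ['a.vcf', 'b.sam', '[4 items total]']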
    def _run_docker_marginPhase(self, identifier):
        # prep
        shutil.copy(
            os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_REF_FA),
            os.path.join(self.exec_outputdir, MarginPhaseTest.IN_REF_FA))
        shutil.copy(
            os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_REF_VCF),
            os.path.join(self.exec_outputdir, MarginPhaseTest.IN_REF_VCF))
        shutil.copy(os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_BAM),
                    os.path.join(self.exec_outputdir, MarginPhaseTest.IN_BAM))
        shutil.copy(
            os.path.join(TOIL_TEST_DIR, MarginPhaseTest.IN_PARAMS),
            os.path.join(self.exec_outputdir, MarginPhaseTest.IN_PARAMS))

        # run docker
        docker_command = [
            'docker',
            'run',
            '--rm',
            '-v',
            "{}:/data".format(self.exec_outputdir),  # '-it',
            MarginPhaseTest.DOCKER_MARGIN_PHASE,
            "/data/{}".format(MarginPhaseTest.IN_BAM),
            "/data/{}".format(MarginPhaseTest.IN_REF_FA),
            "-p",
            "/data/{}".format(MarginPhaseTest.IN_PARAMS),
            "-o",
            "/data/{}".format(identifier),
            # "-r", "/data/{}".format(MarginPhaseTest.IN_REF_VCF),
            "-a",
            "info",
            "-v",
            "0"
        ]
        log.info('Running %r', docker_command)
        subprocess.check_call(docker_command)

        # validate output
        output_vcf_name = MarginPhaseTest.OUT_EXEC_VCF_FORMAT.format(
            identifier)
        output_vcf = os.path.join(self.exec_outputdir, output_vcf_name)
        if not os.path.isfile(output_vcf):
            contents = subprocess.check_output(['ls', '-la'],
                                               cwd=self.exec_outputdir)
            raise UserError(
                "exec output vcf '{}' not found in directory '{}' with contents:\n{}"
                .format(output_vcf_name, self.exec_outputdir, contents))

        # save and return
        self.exec_output_vcf = output_vcf
        return output_vcf
Example #7
def main():
    """
    Computational Genomics Lab, Genomics Institute, UC Santa Cruz
    MarginPhase pipeline

    =======================================
    Dependencies
    Curl:       apt-get install curl
    Docker:     wget -qO- https://get.docker.com/ | sh
    Toil:       pip install toil
    Boto:       pip install boto (OPTIONAL)
    """

    parser = argparse.ArgumentParser(
        description=main.__doc__,
        formatter_class=argparse.RawTextHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')

    # Generate subparsers
    subparsers.add_parser(
        'generate-config',
        help='Generates an editable config in the current working directory.')
    subparsers.add_parser(
        'generate-manifest',
        help='Generates an editable manifest in the current working directory.'
    )
    subparsers.add_parser(
        'generate',
        help='Generates a config and manifest in the current working directory.'
    )

    # Run subparser
    parser_run = subparsers.add_parser('run',
                                       help='Runs the MarginPhase pipeline')
    group = parser_run.add_mutually_exclusive_group()
    parser_run.add_argument(
        '--config',
        default=DEFAULT_CONFIG_NAME,
        type=str,
        help=
        'Path to the (filled in) config file, generated with "generate-config". '
        '\nDefault value: "%(default)s"')
    group.add_argument(
        '--manifest',
        default=DEFAULT_MANIFEST_NAME,
        type=str,
        help=
        'Path to the (filled in) manifest file, generated with "generate-manifest". '
        '\nDefault value: "%(default)s"')

    # If no arguments provided, print full help menu
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # Add Toil options
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()

    # Parse subparsers related to generation of config and manifest
    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_CONFIG_NAME), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, DEFAULT_MANIFEST_NAME),
                      generate_manifest)

    # Pipeline execution
    elif args.command == 'run':
        # sanity check
        require(
            os.path.exists(args.config), '{} not found. Please run '
            '"toil-marginphase generate-config"'.format(args.config))
        require(
            os.path.exists(args.manifest),
            '{} not found and no samples provided. Please '
            'run "toil-marginphase generate-manifest"'.format(args.manifest))

        # Parse config
        parsed_config = {
            x.replace('-', '_'): y
            for x, y in yaml.load(open(args.config).read()).iteritems()
        }
        config = argparse.Namespace(**parsed_config)
        config.maxCores = int(args.maxCores) if args.maxCores else sys.maxsize
        config.defaultCores = int(min(MP_CPU, config.maxCores))
        config.maxDisk = int(args.maxDisk) if args.maxDisk else sys.maxint
        config.maxMemory = sys.maxint
        # fix parsing of GB to int
        if args.maxMemory:
            args.maxMemory = args.maxMemory.upper()
            if args.maxMemory.endswith('B'):
                args.maxMemory = args.maxMemory.rstrip('B')
            # actual parsing
            if args.maxMemory.endswith('G'):
                config.maxMemory = int(
                    args.maxMemory.rstrip('G')) * 1024 * 1024 * 1024
            elif args.maxMemory.endswith('M'):
                config.maxMemory = int(
                    args.maxMemory.rstrip('M')) * 1024 * 1024
            elif args.maxMemory.endswith('K'):
                config.maxMemory = int(args.maxMemory.rstrip('K')) * 1024
            else:
                config.maxMemory = int(args.maxMemory)

        # Config sanity checks
        require(config.output_dir, 'No output location specified')
        if urlparse(config.output_dir).scheme != "s3":
            config.output_dir = config.output_dir.replace("file://", "", 1)
            mkdir_p(config.output_dir)
        if not config.output_dir.endswith('/'):
            config.output_dir += '/'
        require(config.partition_size,
                "Configuration parameter partition-size is required")
        require(config.partition_margin,
                "Configuration parameter partition-margin is required")

        if 'save_intermediate_files' not in config or not config.save_intermediate_files:
            config.intermediate_file_location = None
        elif urlparse(config.output_dir).scheme == "s3":
            raise UserError(
                "Config parameter 'save_intermediate_files' cannot be used with s3 output directory"
            )
        else:
            intermediate_location = os.path.join(
                config.output_dir, "intermediate",
                datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))
            mkdir_p(intermediate_location)
            config.intermediate_file_location = intermediate_location
        if "margin_phase_image" not in config or len(
                config.margin_phase_image) == 0:
            config.margin_phase_image = DOCKER_MARGIN_PHASE_IMG_DEFAULT
        if "margin_phase_tag" not in config or len(
                config.margin_phase_tag) == 0:
            config.margin_phase_tag = DOCKER_MARGIN_PHASE_TAG_DEFAULT
        if "cpecan_image" not in config or len(config.cpecan_image) == 0:
            config.cpecan_image = DOCKER_CPECAN_IMG_DEFAULT
        if "cpecan_tag" not in config or len(config.cpecan_tag) == 0:
            config.cpecan_tag = DOCKER_CPECAN_TAG_DEFAULT
        if "unittest" not in config:
            config.unittest = False
        if "minimal_output" not in config:
            config.minimal_output = False
        if "minimal_cpecan_output" not in config:
            config.minimal_cpecan_output = False
        if "cpecan_probabilities" not in config:
            config.cpecan_probabilities = False

        # get samples
        samples = parse_samples(config, args.manifest)

        # Program checks
        for program in ['docker']:
            require(
                next(which(program), None),
                '{} must be installed on every node.'.format(program))

        # Start the workflow
        Job.Runner.startToil(
            Job.wrapJobFn(map_job, prepare_input, samples, config), args)
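The maxMemory handling in main() normalizes a human-readable size into bytes before it reaches the pipeline config. A minimal worked sketch of that conversion (the "2G" value is hypothetical):

value = '2G'.upper()
if value.endswith('B'):
    value = value.rstrip('B')
# '2G' -> 2 GiB in bytes
assert int(value.rstrip('G')) * 1024 * 1024 * 1024 == 2147483648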
Example #8
def merge_chunks(job, config, chunk_infos):
    # prep
    start = time.time()
    uuid = config.uuid
    work_dir = job.fileStore.getLocalTempDir()
    log(job, "{}".format(datetime.datetime.now()), uuid, 'merge_chunks')
    log(job, "Merging {} chunks".format(len(chunk_infos)), uuid,
        'merge_chunks')
    if config.minimal_output:
        log(
            job,
            "Minimal output is configured, will only save full chromosome vcf and merged BAMs",
            uuid, 'merge_chunks')

    # work directory for tar management
    # output files
    merged_chunks_directory = os.path.join(work_dir, ID_MERGED)
    os.mkdir(merged_chunks_directory)
    full_merged_vcf_file = os.path.join(merged_chunks_directory,
                                        "{}.merged.vcf".format(config.uuid))
    full_merged_sam_file = os.path.join(merged_chunks_directory,
                                        "{}.merged.sam".format(config.uuid))

    # sort by chunk index and validate
    chunk_infos.sort(key=(lambda x: x[CI_CHUNK_INDEX]))
    idx = 0
    missing_indices = []
    for ci in chunk_infos:
        while ci[CI_CHUNK_INDEX] > idx:
            missing_indices.append(idx)
            idx += 1
        idx += 1
    if len(missing_indices) > 0:
        log(
            job, "Found {} missing indices: {}".format(len(missing_indices),
                                                       missing_indices), uuid,
            'merge_chunks')

    # prep for iteration
    merge_decisions = dict()
    prev_chunk_workdir = ""
    prev_chunk_sam_file = None
    prev_chunk_vcf_file = None
    prev_chunk = {CI_CHUNK_INDEX: "start"}
    prev_written_reads = set()
    prev_vcf_split_pos = None
    prev_vcf_phase_action = None

    # iterate over all chunks
    for chunk in chunk_infos:

        # get current chunk info/files
        chunk_idx = chunk[CI_CHUNK_INDEX]
        chunk_boundary = chunk[CI_CHUNK_BOUNDARY_START]
        merging_step_identifier = "{}:{}-{}".format(config.uuid,
                                                    prev_chunk[CI_CHUNK_INDEX],
                                                    chunk[CI_CHUNK_INDEX])
        curr_chunk_workdir = os.path.join(work_dir, "tmp-{}".format(chunk_idx))
        curr_chunk_sam_file, curr_chunk_vcf_file = merge_chunks__extract_chunk_tarball(
            job, config, curr_chunk_workdir, chunk)
        log(
            job, "merging {} and {} across boundary {}".format(
                prev_chunk[CI_CHUNK_INDEX], chunk_idx, chunk_boundary), uuid,
            'merge_chunks')

        # error out if missing files
        if curr_chunk_sam_file is None or curr_chunk_vcf_file is None:
            error = "{}: Missing expected output file, sam:{}, vcf:{}, chunk_info:{}".format(
                chunk_idx, curr_chunk_sam_file, curr_chunk_vcf_file, chunk)
            log(job, error, uuid, 'merge_chunks')
            job.fileStore.logToMaster(error)
            if CONTINUE_AFTER_FAILURE:
                # prev chunk info is maintained, and will be written during next chunk
                continue
            raise UserError("{}:{}".format(uuid, error))

        # skip writing the first chunk
        if prev_chunk_sam_file is None:
            curr_written_reads = set()
            curr_vcf_split_pos = 0
            curr_vcf_phase_action = dict()

        # write the rest of the chunks
        else:
            # get chunk splitting
            prev_reads, curr_reads, curr_vcf_split_pos, curr_vcf_phase_action, decision_summary =\
                merge_chunks__determine_chunk_splitting(job, merging_step_identifier, prev_chunk_sam_file,
                                                        curr_chunk_sam_file, chunk_boundary)
            merge_decisions[decision_summary] =\
                merge_decisions[decision_summary] + 1 if decision_summary in merge_decisions else 1

            # write sam
            curr_written_reads = merge_chunks__append_sam_reads(
                job, merging_step_identifier, prev_chunk_sam_file,
                full_merged_sam_file, prev_reads, prev_written_reads)
            if len(curr_reads) > 0:
                curr_written_right_reads = merge_chunks__append_sam_reads(
                    job, merging_step_identifier, curr_chunk_sam_file,
                    full_merged_sam_file, curr_reads, curr_written_reads)
                curr_written_reads = curr_written_reads.union(
                    curr_written_right_reads)

            # write vcf
            merge_chunks__append_vcf_calls(
                job,
                merging_step_identifier,
                prev_chunk_vcf_file,
                full_merged_vcf_file,
                prev_vcf_split_pos,
                curr_vcf_split_pos,
                prev_vcf_phase_action,
                mp_identifier=prev_chunk[CI_CHUNK_INDEX])

        # cleanup
        if os.path.isdir(prev_chunk_workdir):
            shutil.rmtree(prev_chunk_workdir)

        # iterate
        prev_chunk = chunk
        prev_chunk_workdir = curr_chunk_workdir
        prev_chunk_sam_file = curr_chunk_sam_file
        prev_chunk_vcf_file = curr_chunk_vcf_file
        prev_written_reads = curr_written_reads
        prev_vcf_split_pos = curr_vcf_split_pos
        prev_vcf_phase_action = curr_vcf_phase_action

    # write the final reads and calls
    merging_step_identifier = "{}:{}-{}".format(config.uuid,
                                                prev_chunk[CI_CHUNK_INDEX],
                                                "end")
    merge_chunks__append_sam_reads(job, merging_step_identifier,
                                   prev_chunk_sam_file, full_merged_sam_file,
                                   {None: None}, prev_written_reads)
    merge_chunks__append_vcf_calls(job,
                                   merging_step_identifier,
                                   prev_chunk_vcf_file,
                                   full_merged_vcf_file,
                                   prev_vcf_split_pos,
                                   sys.maxint,
                                   prev_vcf_phase_action,
                                   mp_identifier=prev_chunk[CI_CHUNK_INDEX])

    # loggit
    log(job, "Finished merge with following matches:", uuid, 'merge_chunks')
    job.fileStore.logToMaster("{}:merge_chunks: ".format(config.uuid))
    for decision, count in merge_decisions.items():
        log(job, "\t\t{}: \t{}".format(decision, count), uuid, 'merge_chunks')

    # tarball the output and save
    log(job, "Output files for merge:".format(), uuid, 'merge_chunks')
    output_file_locations = glob.glob(
        os.path.join(merged_chunks_directory, "*.*"))
    output_file_locations.sort()
    tmp = output_file_locations
    output_file_locations = list()
    for f in tmp:
        if os.path.isdir(f):
            log(job, "\t\t{} (skipped, directory)".format(os.path.basename(f)),
                uuid, 'merge_chunks')
        else:
            log(job, "\t\t{}".format(os.path.basename(f)), uuid,
                'merge_chunks')
            output_file_locations.append(f)
    tarball_name = "{}.merged.tar.gz".format(config.uuid)
    tarball_files(tar_name=tarball_name,
                  file_paths=output_file_locations,
                  output_dir=work_dir)
    output_file_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, tarball_name))
    # we need to return the input list of chunk infos for consolidation
    chunk_infos.append({
        CI_UUID: config.uuid,
        CI_OUTPUT_FILE_ID: output_file_id,
        CI_CHUNK_INDEX: ID_MERGED
    })

    log_generic_job_debug(job, config.uuid, "merge_chunks", work_dir=work_dir)
    log_time(job, "merge_chunks", start, config.uuid)
    return chunk_infos
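merge_chunks expects the chunk tarballs to arrive with contiguous CI_CHUNK_INDEX values; the scan near the top of the function records any gaps before merging. A self-contained sketch of that scan with hypothetical indices:

chunk_indices = [0, 1, 3]  # index 2 is missing
idx, missing_indices = 0, []
for ci in chunk_indices:
    while ci > idx:
        missing_indices.append(idx)
        idx += 1
    idx += 1
assert missing_indices == [2]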
Example #9
def prepare_input(job, sample, config, enqueue_consolidation=True):

    # job prep
    config = argparse.Namespace(**vars(config))
    uuid, url, contig_name, reference_url, params_url = sample
    config.uuid = uuid
    config.contig_name = contig_name
    config.reference_url = reference_url
    config.params_url = params_url
    if config.intermediate_file_location is not None:
        config.intermediate_file_location = os.path.join(
            config.intermediate_file_location, uuid)
        mkdir_p(config.intermediate_file_location)
    work_dir = job.fileStore.getLocalTempDir()
    start = time.time()
    log(job, "{}".format(datetime.datetime.now()), config.uuid, 'START')
    log(
        job,
        "Preparing input with URL:{}, contig:{}, reference_url:{}, params_url:{}"
        .format(url, contig_name, reference_url,
                params_url), uuid, 'prepare_input')

    # todo global resource estimation
    config.maxCores = min(config.maxCores, multiprocessing.cpu_count())
    config.defaultCores = min(MP_CPU, config.maxCores)
    config.maxMemory = min(config.maxMemory, int(physicalMemory() * .95))
    #config.disk

    # download references - TOIL_JOBSTORE_PROTOCOL queries are so this function can be imported

    #ref fasta
    if reference_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        ref_genome_fileid = reference_url.replace(TOIL_JOBSTORE_PROTOCOL, '',
                                                  1)
        ref_genome_filename = "{}.reference.{}.fa".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            ref_genome_fileid, os.path.join(work_dir, ref_genome_filename))
    else:
        download_url(reference_url, work_dir=work_dir)
        ref_genome_filename = os.path.basename(reference_url)
        ref_genome_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, ref_genome_filename))
    ref_genome_size = os.stat(os.path.join(work_dir,
                                           ref_genome_filename)).st_size
    config.reference_genome_fileid = ref_genome_fileid

    #params
    if params_url.startswith(TOIL_JOBSTORE_PROTOCOL):
        params_fileid = params_url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1)
    else:
        download_url(params_url, work_dir=work_dir)
        params_filename = os.path.basename(params_url)
        params_fileid = job.fileStore.writeGlobalFile(
            os.path.join(work_dir, params_filename))
    config.params_fileid = params_fileid

    # download bam
    if url.startswith(TOIL_JOBSTORE_PROTOCOL):
        bam_filename = "{}.input.{}.bam".format(uuid, contig_name)
        job.fileStore.readGlobalFile(
            url.replace(TOIL_JOBSTORE_PROTOCOL, '', 1),
            os.path.join(work_dir, bam_filename))
    else:
        download_url(url, work_dir=work_dir)
        bam_filename = os.path.basename(url)
    data_bam_location = os.path.join("/data", bam_filename)
    workdir_bam_location = os.path.join(work_dir, bam_filename)

    # index the bam
    _index_bam(job, config, work_dir, bam_filename)

    # sanity check
    workdir_bai_location = os.path.join(work_dir, bam_filename + ".bai")
    if not os.path.isfile(workdir_bai_location):
        raise UserError("BAM index file not created for {}: {}".format(
            bam_filename, workdir_bai_location))

    # get start and end location
    start_idx = sys.maxint
    end_idx = 0
    with closing(
            pysam.AlignmentFile(
                workdir_bam_location,
                'rb' if bam_filename.endswith("bam") else 'r')) as aln:
        for read in aln.fetch():
            align_start = read.reference_start
            align_end = read.reference_end
            start_idx = min([start_idx, align_start])
            end_idx = max([end_idx, align_end])
    log(job, "start_pos:{}, end_pos:{}".format(config.uuid, start_idx,
                                               end_idx), uuid, 'prepare_input')

    # get reads from positions
    chunk_infos = list()
    idx = start_idx
    while idx < end_idx:
        ci = {CI_UUID: uuid}
        ci[CI_CHUNK_BOUNDARY_START] = idx
        chunk_start = idx - config.partition_margin
        ci[CI_CHUNK_START] = chunk_start
        idx += config.partition_size
        ci[CI_CHUNK_BOUNDARY_END] = idx
        chunk_end = idx + config.partition_margin
        ci[CI_CHUNK_END] = chunk_end
        chunk_infos.append(ci)

    # enqueue jobs
    log(job, "Enqueueing {} jobs".format(len(chunk_infos)), uuid,
        'prepare_input')
    idx = 0
    enqueued_jobs = 0
    returned_tarballs = list()
    for ci in chunk_infos:
        #prep
        ci[CI_CHUNK_INDEX] = idx
        chunk_start = ci[CI_CHUNK_START]
        chunk_end = ci[CI_CHUNK_END]
        chunk_position_description = "{}:{}-{}".format(config.contig_name,
                                                       chunk_start, chunk_end)
        bam_split_command = [
            "view", "-b", data_bam_location, chunk_position_description
        ]
        chunk_name = "{}.{}.bam".format(config.uuid, idx)

        #write chunk
        chunk_location = os.path.join(work_dir, chunk_name)
        with open(chunk_location, 'w') as out:
            docker_call(job,
                        config,
                        work_dir,
                        bam_split_command,
                        DOCKER_SAMTOOLS_IMG,
                        DOCKER_SAMTOOLS_TAG,
                        outfile=out)

        #document read count
        chunk_size = os.stat(chunk_location).st_size
        ci[CI_CHUNK_SIZE] = chunk_size
        ci[CI_REF_FA_SIZE] = ref_genome_size
        read_count = prepare_input__get_bam_read_count(job, work_dir,
                                                       chunk_name)
        ci[CI_READ_COUNT] = read_count
        log(
            job,
            "chunk from {} for idx {} is {}b ({}mb) and has {} reads".format(
                chunk_position_description, idx, chunk_size,
                int(chunk_size / 1024 / 1024),
                read_count), uuid, 'prepare_input')
        if config.intermediate_file_location is not None:
            copy_files(file_paths=[chunk_location],
                       output_dir=config.intermediate_file_location)

        # enqueue marginPhase job
        if read_count > 0:
            chunk_fileid = job.fileStore.writeGlobalFile(chunk_location)
            mp_cores = config.defaultCores
            mp_mem = int(
                min(
                    int(chunk_size * MP_MEM_BAM_FACTOR +
                        ref_genome_size * MP_MEM_REF_FACTOR),
                    config.maxMemory))
            mp_disk = int(
                min(
                    int(chunk_size * MP_DSK_BAM_FACTOR +
                        ref_genome_size * MP_DSK_REF_FACTOR +
                        (0 if config.cpecan_probabilities else
                         MP_DSK_CPECAN_FACTOR) * chunk_size), config.maxDisk))
            log(
                job,
                "requesting {} cores, {}b ({}mb) disk, {}b ({}gb) mem".format(
                    mp_cores, mp_disk, int(mp_disk / 1024 / 1024), mp_mem,
                    int(mp_mem / 1024 / 1024 / 1024)),
                "{}.{}".format(uuid, idx), 'prepare_input')
            mp_mem = str(int(mp_mem / 1024)) + "K"
            mp_disk = str(int(mp_disk / 1024)) + "K"
            margin_phase_job = job.addChildJobFn(run_margin_phase,
                                                 config,
                                                 chunk_fileid,
                                                 ci,
                                                 memory=mp_mem,
                                                 cores=mp_cores,
                                                 disk=mp_disk)
            returned_tarballs.append(margin_phase_job.rv())
            enqueued_jobs += 1
        idx += 1

    log(job, "Enqueued {} jobs".format(enqueued_jobs), uuid, 'prepare_input')

    # enqueue merging and consolidation job
    merge_job = job.addFollowOnJobFn(merge_chunks, config, returned_tarballs)
    final_return_value = merge_job.rv()
    if enqueue_consolidation:
        consolidation_job = merge_job.addFollowOnJobFn(consolidate_output,
                                                       config, merge_job.rv())
        final_return_value = consolidation_job.rv()

    # log
    log_generic_job_debug(job, config.uuid, 'prepare_input', work_dir=work_dir)
    log_time(job, "prepare_input", start, config.uuid)

    # return appropriate output
    return final_return_value
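prepare_input partitions the BAM into overlapping chunks: each chunk spans partition_size bases plus partition_margin bases past each boundary, so reads straddling a boundary land in both neighbouring chunks. A worked sketch with hypothetical values:

start_idx, end_idx = 0, 2500000
partition_size, partition_margin = 1000000, 5000
chunks = []
idx = start_idx
while idx < end_idx:
    boundary_start = idx
    chunk_start = idx - partition_margin
    idx += partition_size
    boundary_end = idx
    chunk_end = idx + partition_margin
    chunks.append((chunk_start, boundary_start, boundary_end, chunk_end))
assert len(chunks) == 3
assert chunks[1] == (995000, 1000000, 2000000, 2005000)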
Example #10
def run_margin_phase(job, config, chunk_file_id, chunk_info):
    # prep
    start = time.time()
    work_dir = job.fileStore.getLocalTempDir()
    chunk_idx = chunk_info[CI_CHUNK_INDEX]
    chunk_identifier = "{}.{}".format(config.uuid, chunk_idx)
    chunk_name = "{}.in.bam".format(chunk_identifier)
    chunk_location = os.path.join(work_dir, chunk_name)
    log(job, str(datetime.datetime.now()), chunk_identifier,
        'run_margin_phase')

    # download bam chunk
    job.fileStore.readGlobalFile(chunk_file_id, chunk_location)
    if not os.path.isfile(chunk_location):
        raise UserError("Failed to download chunk {} from {}".format(
            chunk_name, chunk_file_id))

    # download references
    #ref genome
    genome_reference_name = "reference.fa"
    genome_reference_location = os.path.join(work_dir, genome_reference_name)
    job.fileStore.readGlobalFile(config.reference_genome_fileid,
                                 genome_reference_location)
    if not os.path.isfile(genome_reference_location):
        raise UserError(
            "Failed to download genome reference {} from {}".format(
                os.path.basename(config.reference_url),
                config.reference_genome_fileid))
    # params
    params_name = "params.json"
    params_location = os.path.join(work_dir, params_name)
    job.fileStore.readGlobalFile(config.params_fileid, params_location)
    if not os.path.isfile(params_location):
        raise UserError("Failed to download params {} from {}".format(
            os.path.basename(config.params), config.params_fileid))

    # do we want to run cPecan?
    cpecan_prob_location = None
    if config.cpecan_probabilities:
        cpecan_prob_location = run_margin_phase__run_cpecan_alignment(
            job, config, chunk_identifier, work_dir, chunk_name,
            genome_reference_name)

    # run marginPhase
    params = [
        os.path.join("/data", chunk_name),
        os.path.join("/data", genome_reference_name),
        os.path.join("/data", params_name), "-o",
        os.path.join("/data", "{}.out".format(chunk_identifier)), '--tag',
        "{},{}-{}".format(chunk_idx, chunk_info[CI_CHUNK_BOUNDARY_START],
                          chunk_info[CI_CHUNK_BOUNDARY_END])
    ]
    if cpecan_prob_location is not None:
        params.extend([
            '--singleNuclProbDir',
            os.path.join("/data", cpecan_prob_location)
        ])
    docker_call(job, config, work_dir, params, config.margin_phase_image,
                config.margin_phase_tag)
    log_debug_from_docker(job, os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG),
                          chunk_identifier, 'margin_phase',
                          [chunk_location, genome_reference_location])
    log_location = os.path.join(work_dir,
                                "marginPhase.{}.log".format(chunk_identifier))
    os.rename(os.path.join(work_dir, DOCKER_MARGIN_PHASE_LOG), log_location)

    # document output
    log(job, "Output files after marginPhase:", chunk_identifier,
        'run_margin_phase')
    output_file_locations = glob.glob(
        os.path.join(work_dir, "{}*".format(chunk_identifier)))
    output_file_locations.append(log_location)
    found_vcf, found_sam = False, False
    for f in output_file_locations:
        log(job, "\t\t{}".format(os.path.basename(f)), chunk_identifier,
            'run_margin_phase')
        if f.endswith(VCF_SUFFIX): found_vcf = True
        if f.endswith(SAM_UNIFIED_SUFFIX): found_sam = True
    if cpecan_prob_location is not None:
        cpecan_tarball = glob.glob(
            os.path.join(work_dir, cpecan_prob_location, "*.tar.gz"))
        if len(cpecan_tarball) == 0:
            # todo why has tarball_files failed in this location?
            log(job, "Found no cpecan output tarball! Trying alt location.",
                chunk_identifier, 'run_margin_phase')
            cpecan_tarball = glob.glob(os.path.join(work_dir, "*.tar.gz"))

        if len(cpecan_tarball) == 0:
            log(job, "Found no cpecan output tarball!", chunk_identifier,
                'run_margin_phase')
        elif len(cpecan_tarball) > 1:
            log(
                job, "Found {} cpecan output tarballs: {}".format(
                    len(cpecan_tarball), cpecan_tarball), chunk_identifier,
                'run_margin_phase')
        else:
            log(job,
                "Saving cpecan output tarball: {}".format(cpecan_tarball[0]),
                chunk_identifier, 'run_margin_phase')
            output_file_locations.append(cpecan_tarball[0])

    # tarball the output and save
    tarball_name = "{}.tar.gz".format(chunk_identifier)
    tarball_files(tar_name=tarball_name,
                  file_paths=output_file_locations,
                  output_dir=work_dir)

    # validate output, retry if not
    if not (found_sam and found_vcf):
        if "retry_attempts" not in config:
            config.retry_attempts = 1
        else:
            config.retry_attempts += 1
            if config.retry_attempts > MAX_RETRIES:
                log(job, "", chunk_identifier, 'run_margin_phase')
                error = "Failed to generate appropriate output files {} times".format(
                    MAX_RETRIES)
                log(job, error, chunk_identifier, 'run_margin_phase')
                # this enables us to "recover" in the face of failure during a run
                if CONTINUE_AFTER_FAILURE:
                    output_file_id = job.fileStore.writeGlobalFile(
                        os.path.join(work_dir, tarball_name))
                    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id
                    return chunk_info
                raise UserError("{}:{}".format(chunk_identifier, error))

        log(
            job, "Missing output files. Attempting retry {}".format(
                config.retry_attempts), chunk_identifier, 'run_margin_phase')
        log(job, "Failed job log file:", chunk_identifier, 'run_margin_phase')
        log(job, "", chunk_identifier, 'run_margin_phase')
        with open(log_location, 'r') as log_file:
            for line in log_file:
                log(job, "\t\t{}".format(line.rstrip()), chunk_identifier,
                    'run_margin_phase')

        # new job
        retry_job = job.addChildJobFn(
            run_margin_phase,
            config,
            chunk_file_id,
            chunk_info,
            memory=str(int(config.maxMemory / 1024)) + "K",
            cores=job.cores,
            disk=job.disk)
        # save failed output
        if config.intermediate_file_location is not None:
            tarball_fail_name = "{}.FAILURE.{}.tar.gz".format(
                chunk_identifier, config.retry_attempts)
            os.rename(os.path.join(work_dir, tarball_name),
                      os.path.join(work_dir, tarball_fail_name))
            copy_files(file_paths=[os.path.join(work_dir, tarball_fail_name)],
                       output_dir=config.intermediate_file_location)

        log_generic_job_debug(job,
                              config.uuid,
                              'run_margin_phase',
                              work_dir=work_dir)
        return retry_job.rv()

    # if successful, save output
    if config.intermediate_file_location is not None:
        copy_files(file_paths=[os.path.join(work_dir, tarball_name)],
                   output_dir=config.intermediate_file_location)
    output_file_id = job.fileStore.writeGlobalFile(
        os.path.join(work_dir, tarball_name))
    chunk_info[CI_OUTPUT_FILE_ID] = output_file_id

    # log
    log_generic_job_debug(job,
                          config.uuid,
                          'run_margin_phase',
                          work_dir=work_dir)
    log_time(job, "run_margin_phase", start, chunk_identifier)
    return chunk_info
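run_margin_phase tracks retries on the shared config: the first failure initializes retry_attempts, later failures increment it, and once it exceeds MAX_RETRIES the chunk either fails hard or, when CONTINUE_AFTER_FAILURE is set, returns whatever output exists. A minimal sketch of that counter (the MAX_RETRIES value here is hypothetical):

MAX_RETRIES = 2  # hypothetical; the real constant is defined in the module
retry_attempts = None
gave_up = False
for attempt in range(4):  # simulate repeated failures
    if retry_attempts is None:
        retry_attempts = 1
    else:
        retry_attempts += 1
        if retry_attempts > MAX_RETRIES:
            gave_up = True
            break
assert retry_attempts == 3 and gave_up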
Example #11
def process_sample(job, config, input_tar=None, fastq_ids=None):
    """
    Converts sample.tar(.gz) into a fastq pair (or single fastq if single-ended.)

    :param JobFunctionWrappingJob job: passed automatically by Toil
    :param Namespace config: Argparse Namespace object containing argument inputs
    :param FileID input_tar: fileStoreID of the tarball (if applicable)
    :param list[FileID] fastq_ids: FileStoreIDs of fastq files
    :return: FileStoreID from Cutadapt or from fastqs directly if pipeline was run without Cutadapt option
    :rtype: tuple(FileID, FileID)
    """
    job.fileStore.logToMaster('Processing sample: {}'.format(config.uuid))
    work_dir = job.fileStore.getLocalTempDir()
    processed_r1, processed_r2 = None, None
    # I/O
    if input_tar:
        job.fileStore.readGlobalFile(input_tar,
                                     os.path.join(work_dir, 'sample.tar'))
        tar_path = os.path.join(work_dir, 'sample.tar')
        # Untar File and concat
        subprocess.check_call(['tar', '-xvf', tar_path, '-C', work_dir],
                              stderr=PIPE,
                              stdout=PIPE)
        job.fileStore.deleteGlobalFile(input_tar)
    else:
        ext = '.fq.gz' if config.gz else '.fq'
        for i, fastq_id in enumerate(fastq_ids):
            if i % 2 == 0:
                job.fileStore.readGlobalFile(
                    fastq_id,
                    os.path.join(work_dir, 'Fastq_{}_R1{}'.format(i, ext)))
            else:
                job.fileStore.readGlobalFile(
                    fastq_id,
                    os.path.join(work_dir, 'Fastq_{}_R2{}'.format(i, ext)))
    fastqs = []
    for root, subdir, files in os.walk(work_dir):
        fastqs.extend([os.path.join(root, x) for x in files])
    if config.paired:
        r1, r2 = [], []
        # Pattern convention: Look for "R1" / "R2" in the filename, or "_1" / "_2" before the extension
        pattern = re.compile(r'(?:^|[._-])(R[12]|[12]\.f)')
        for fastq in sorted(fastqs):
            match = pattern.search(os.path.basename(fastq))
            if not match:
                raise UserError(
                    'FASTQ file name fails to meet required convention for paired reads '
                    '(see documentation). ' + fastq)
            elif '1' in match.group():
                r1.append(fastq)
            elif '2' in match.group():
                r2.append(fastq)
            else:
                assert False, match.group()
        require(
            len(r1) == len(r2),
            'Check fastq names, uneven number of pairs found.\nr1: {}\nr2: {}'.
            format(r1, r2))
        # Concatenate fastqs
        command = 'zcat' if r1[0].endswith('.gz') and r2[0].endswith(
            '.gz') else 'cat'

        # If sample is already a single R1 / R2 fastq
        if command == 'cat' and len(fastqs) == 2:
            processed_r1 = fastqs[0]
            processed_r2 = fastqs[1]
        else:
            with open(os.path.join(work_dir, 'R1.fastq'), 'w') as f1:
                p1 = subprocess.Popen([command] + r1, stdout=f1)
            with open(os.path.join(work_dir, 'R2.fastq'), 'w') as f2:
                p2 = subprocess.Popen([command] + r2, stdout=f2)
            p1.wait()
            p2.wait()
            processed_r1 = job.fileStore.writeGlobalFile(
                os.path.join(work_dir, 'R1.fastq'))
            processed_r2 = job.fileStore.writeGlobalFile(
                os.path.join(work_dir, 'R2.fastq'))
        disk = 2 * (processed_r1.size + processed_r2.size)
    else:
        command = 'zcat' if fastqs[0].endswith('.gz') else 'cat'
        if command == 'cat' and len(fastqs) == 1:
            processed_r1 = fastqs[0]
        else:
            with open(os.path.join(work_dir, 'R1.fastq'), 'w') as f:
                subprocess.check_call([command] + fastqs, stdout=f)
            processed_r1 = job.fileStore.writeGlobalFile(
                os.path.join(work_dir, 'R1.fastq'))
        disk = 2 * processed_r1.size
    # Start cutadapt step
    if config.cutadapt:
        return job.addChildJobFn(run_cutadapt,
                                 processed_r1,
                                 processed_r2,
                                 config.fwd_3pr_adapter,
                                 config.rev_3pr_adapter,
                                 disk=disk).rv()
    else:
        return processed_r1, processed_r2
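# Self-contained sketch of the paired-FASTQ naming convention that
# process_sample enforces (file names are hypothetical): "R1"/"R2" in the
# name, or "_1"/"_2" right before the .f* extension.
import re
pattern = re.compile(r'(?:^|[._-])(R[12]|[12]\.f)')
assert pattern.search('sample_R1.fastq.gz').group(1) == 'R1'
assert pattern.search('sample_2.fq').group(1) == '2.f'
assert pattern.search('sample.fastq') is None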
    def run(cls, name, desc):
        """
        Prepares and runs the pipeline. Note this method must be invoked both from inside a
        Docker container and while the docker daemon is reachable.

        :param str name: The name of the command to start the workflow.
        :param str desc: The description of the workflow.
        """
        wrapper = cls(name, desc)
        mount_path = wrapper._get_mount_path()
        # prepare parser
        arg_parser = wrapper._create_argument_parser()
        wrapper._extend_argument_parser(arg_parser)
        # prepare config file
        empty_config = wrapper.__get_empty_config()
        config_yaml = ruamel.yaml.load(empty_config)
        wrapper.__populate_parser_from_config(arg_parser, config_yaml)
        args = arg_parser.parse_args()
        for k, v in vars(args).items():
            k = k.replace('_', '-')
            if k in config_yaml:
                config_yaml[k] = v
        config_path = wrapper._get_config_path()
        with open(config_path, 'w') as writable:
            ruamel.yaml.dump(config_yaml, stream=writable)
        # prepare workdir
        workdir_path = os.path.join(mount_path, 'Toil-' + wrapper._name)
        if os.path.exists(workdir_path):
            if args.restart:
                log.info('Reusing temporary directory: %s', workdir_path)
            else:
                raise UserError(
                    'Temporary directory {} already exists. Run with --restart '
                    'option or remove directory.'.format(workdir_path))
        else:
            os.makedirs(workdir_path)
            log.info('Temporary directory created: %s', workdir_path)

        command = wrapper._create_pipeline_command(args, workdir_path,
                                                   config_path)
        wrapper._extend_pipeline_command(command, args)
        # run command
        try:
            subprocess.check_call(command)
        except subprocess.CalledProcessError as e:
            print(e, file=sys.stderr)
        finally:
            stat = os.stat(mount_path)
            log.info(
                'Pipeline terminated, changing ownership of output files in %s from root to '
                'uid %s and gid %s.', mount_path, stat.st_uid, stat.st_gid)
            chown_command = [
                'chown', '-R',
                '%s:%s' % (stat.st_uid, stat.st_gid), mount_path
            ]
            subprocess.check_call(chown_command)
            if args.no_clean:
                log.info(
                    'Flag "--no-clean" was used, therefore %s was not deleted.',
                    workdir_path)
            else:
                log.info('Cleaning up temporary directory: %s', workdir_path)
                shutil.rmtree(workdir_path)
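# Sketch of the CLI-to-config key mapping performed in run() above: argparse
# attribute names use underscores while the generated YAML config uses dashes,
# so each parsed value overwrites the matching dashed key (keys shown here are
# hypothetical).
config_yaml = {'partition-size': None, 'output-dir': None}
args_vars = {'partition_size': 100000, 'output_dir': 'file:///tmp/out',
             'restart': False}
for k, v in args_vars.items():
    k = k.replace('_', '-')
    if k in config_yaml:
        config_yaml[k] = v
assert config_yaml == {'partition-size': 100000,
                       'output-dir': 'file:///tmp/out'}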
    def _compare_output(self, work_dir, identifier, docker_vcf_name,
                        toil_vcf_name):
        # prep - get required files
        shutil.copy(
            os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_REF_VCF),
            os.path.join(work_dir, MarginPhaseTest.IN_REF_VCF))
        shutil.copy(os.path.join(MARGIN_PHASE_TEST, MarginPhaseTest.IN_REF_FA),
                    os.path.join(work_dir, MarginPhaseTest.IN_REF_FA))
        reference_sdf_name = "SDF"

        #bgzip
        vcf_bgzip_command = [
            'docker', 'run', '--rm', '-v', "{}:/data".format(work_dir),
            MarginPhaseTest.DOCKER_RTG_TOOLS, "bgzip",
            "/data/{}".format(docker_vcf_name),
            "/data/{}".format(toil_vcf_name),
            "/data/{}".format(MarginPhaseTest.IN_REF_VCF)
        ]
        log.info('Running %r', vcf_bgzip_command)
        subprocess.check_call(vcf_bgzip_command)

        #index
        vcf_index_command = [
            'docker', 'run', '--rm', '-v', "{}:/data".format(work_dir),
            MarginPhaseTest.DOCKER_RTG_TOOLS, "index",
            "/data/{}.gz".format(docker_vcf_name),
            "/data/{}.gz".format(toil_vcf_name),
            "/data/{}.gz".format(MarginPhaseTest.IN_REF_VCF)
        ]
        log.info('Running %r', vcf_index_command)
        subprocess.check_call(vcf_index_command)

        #sdf
        ref_sdf_command = [
            'docker', 'run', '--rm', '-v', "{}:/data".format(work_dir),
            MarginPhaseTest.DOCKER_RTG_TOOLS, "format", "-o",
            "/data/{}".format(reference_sdf_name),
            "/data/{}".format(MarginPhaseTest.IN_REF_FA)
        ]
        log.info('Running %r', ref_sdf_command)
        subprocess.check_call(ref_sdf_command)

        # vcf eval prep
        toil_to_docker_eval_identifier = "{}-vcfeval_t2d".format(identifier)
        toil_to_ref_eval_identifier = "{}-vcfeval_t2r".format(identifier)
        docker_to_ref_eval_identifier = "{}-vcfeval_d2r".format(identifier)
        vcf_eval_base = [
            'docker', 'run', '--rm', '-v', "{}:/data".format(work_dir),
            MarginPhaseTest.DOCKER_RTG_TOOLS, "vcfeval", "-t",
            os.path.join("/data", reference_sdf_name)
        ]

        # EVAL: toil to docker
        vcf_eval_command = list(vcf_eval_base)
        vcf_eval_command.extend([
            "-o",
            os.path.join("/data" if MarginPhaseTest.DEBUG else "/tmp",
                         toil_to_docker_eval_identifier), "-b",
            "/data/{}.gz".format(docker_vcf_name), "-c",
            "/data/{}.gz".format(toil_vcf_name)
        ])
        log.info('Running %r', vcf_eval_command)
        t2d_vcf_eval_output = subprocess.check_output(vcf_eval_command)
        if MarginPhaseTest.DEBUG:
            shutil.copytree(
                os.path.join(work_dir, toil_to_docker_eval_identifier),
                os.path.join(TOIL_TEST_STORAGE_DIR,
                             toil_to_docker_eval_identifier))

        # EVAL: toil to reference
        vcf_eval_command = list(vcf_eval_base)
        vcf_eval_command.extend([
            "-o",
            os.path.join("/data" if MarginPhaseTest.DEBUG else "/tmp",
                         toil_to_ref_eval_identifier), "-b",
            "/data/{}.gz".format(MarginPhaseTest.IN_REF_VCF), "-c",
            "/data/{}.gz".format(toil_vcf_name)
        ])
        log.info('Running %r', vcf_eval_command)
        t2r_vcf_eval_output = subprocess.check_output(vcf_eval_command)
        if MarginPhaseTest.DEBUG:
            shutil.copytree(
                os.path.join(work_dir, toil_to_ref_eval_identifier),
                os.path.join(TOIL_TEST_STORAGE_DIR,
                             toil_to_ref_eval_identifier))

        # EVAL: docker to reference
        vcf_eval_command = list(vcf_eval_base)
        vcf_eval_command.extend([
            "-o",
            os.path.join("/data" if MarginPhaseTest.DEBUG else "/tmp",
                         docker_to_ref_eval_identifier), "-b",
            "/data/{}.gz".format(MarginPhaseTest.IN_REF_VCF), "-c",
            "/data/{}.gz".format(docker_vcf_name)
        ])
        log.info('Running %r', vcf_eval_command)
        d2r_vcf_eval_output = subprocess.check_output(vcf_eval_command)
        if MarginPhaseTest.DEBUG:
            shutil.copytree(
                os.path.join(work_dir, docker_to_ref_eval_identifier),
                os.path.join(TOIL_TEST_STORAGE_DIR,
                             docker_to_ref_eval_identifier))

        # now we analyze docker and toil as compared to the reference
        t2r_vcf_eval = t2r_vcf_eval_output.split("\n")
        d2r_vcf_eval = d2r_vcf_eval_output.split("\n")
        if len(t2r_vcf_eval) < 3 or len(d2r_vcf_eval) < 3:
            raise UserError(
                "Incorrect format for vcf eval output: len {}/{} (expected at least 3)".
                format(len(t2r_vcf_eval), len(d2r_vcf_eval)))
        header = t2r_vcf_eval[0].split()
        precision_idx = None
        sensitivity_idx = None
        idx = 0
        while idx < len(header):
            if header[idx] == "Precision":
                precision_idx = idx
            if header[idx] == "Sensitivity":
                sensitivity_idx = idx
            idx += 1
        t2r_precision = float(t2r_vcf_eval[2].split()[precision_idx])
        t2r_sensitivity = float(t2r_vcf_eval[2].split()[sensitivity_idx])
        d2r_precision = float(d2r_vcf_eval[2].split()[precision_idx])
        d2r_sensitivity = float(d2r_vcf_eval[2].split()[sensitivity_idx])

        precision_diff = abs(t2r_precision - d2r_precision)
        sensitivity_diff = abs(t2r_sensitivity - d2r_sensitivity)
        if precision_diff > MarginPhaseTest.ACCEPTABLE_PRECISION_DIFFERENCE \
                or sensitivity_diff > MarginPhaseTest.ACCEPTABLE_SENSITIVITY_DIFFERENCE:
            self.fail((
                "Toil and Docker marginPhase runs have unacceptable difference when compared to the reference:\n"
                "\tPRECISION  \tToil:%5f\tDocker:%5f\tDiff:%5f\n"
                "\tSENSITIVITY\tToil:%5f\tDocker:%5f\tDiff:%5f") %
                      (t2r_precision, d2r_precision, precision_diff,
                       t2r_sensitivity, d2r_sensitivity, sensitivity_diff))

        return "\nTOIL to DOCKER:\n{}\nTOIL to REFERENCE:\n{}\nDOCKER to REFERENCE:\n{}".format(
            t2d_vcf_eval_output, t2r_vcf_eval_output, d2r_vcf_eval_output)
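_compare_output locates the Precision and Sensitivity columns by name in the vcfeval summary rather than by fixed position. A self-contained sketch of that lookup against a hypothetical summary table (the real rtg vcfeval output may carry additional columns, which is why the header row is scanned):

vcf_eval_output = (
    "Threshold  True-pos  False-pos  False-neg  Precision  Sensitivity  F-measure\n"
    "-----------------------------------------------------------------------------\n"
    "     None      9500         50        500     0.9948       0.9500     0.9719\n"
)
lines = vcf_eval_output.split("\n")
header = lines[0].split()
precision_idx = header.index("Precision")
sensitivity_idx = header.index("Sensitivity")
row = lines[2].split()
assert (float(row[precision_idx]), float(row[sensitivity_idx])) == (0.9948, 0.95)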