예제 #1
0
def compress(samples, run_parallel):
    """Perform compression of output files for long term storage.

    Each item of `samples` is indexed with ``[0]`` to obtain the sample
    ``data`` object. Samples whose archive setting (``dd.get_archive``)
    contains ``cram`` or ``cram-lossless`` are dispatched to the
    ``archive_to_cram`` parallel task; all others are passed through.

    Args:
        samples: iterable of sequences whose first element is the sample data.
        run_parallel: callable invoked as ``run_parallel(task_name, items)``.

    Returns:
        The untouched samples (each wrapped as ``[data]``) followed by
        whatever ``run_parallel`` returned for the samples sent to CRAM.
    """
    cram_tags = ("cram", "cram-lossless")
    to_cram = []
    finished = []
    for sample in samples:
        data = sample[0]
        # Look up the archive setting once per sample instead of once per tag.
        archive = dd.get_archive(data)
        if any(tag in archive for tag in cram_tags):
            to_cram.append([data])
        else:
            finished.append([data])
    crammed = run_parallel("archive_to_cram", to_cram)
    return finished + crammed
예제 #2
0
def compress(samples, run_parallel):
    """Perform compression of output files for long term storage.

    Partitions the samples by their archive setting: anything configured
    with ``cram`` or ``cram-lossless`` goes to the ``archive_to_cram``
    parallel task, everything else is returned unchanged.

    Args:
        samples: iterable of sequences; element ``[0]`` of each is the
            sample data handed to ``dd.get_archive``.
        run_parallel: callable taking ``(task_name, items)``.

    Returns:
        Pass-through samples (as ``[data]`` lists) concatenated with the
        result of the ``archive_to_cram`` task.
    """
    to_cram, finished = [], []
    # Iterate lazily; no need to materialize an intermediate list of data items.
    for data in (x[0] for x in samples):
        # Single lookup of the archive configuration, reused for both checks.
        archive = dd.get_archive(data)
        wants_cram = "cram" in archive or "cram-lossless" in archive
        bucket = to_cram if wants_cram else finished
        bucket.append([data])
    crammed = run_parallel("archive_to_cram", to_cram)
    return finished + crammed
예제 #3
0
def compress(in_bam, data):
    """Compress a BAM file to CRAM, providing indexed CRAM file.

    Runs `scramble` from Staden io_lib:

    https://github.com/jkbonfield/io_lib

    If `cram` is in the sample's archive setting, first tries a lossy
    conversion that adds the `-B` (quality score binning) and `-n` (read
    name removal) options. If that is not requested, or the lossy run
    fails, falls back to a plain conversion without those options
    (`cram-lossless`).

    The output is written to ``<work_dir>/archive/<bam basename>.cram`` and
    is only generated when that file does not already exist. The CRAM file
    is indexed on every call.

    Args:
        in_bam: path of the input BAM file.
        data: sample data object; ``data["config"]`` is used to locate programs.

    Returns:
        Path to the CRAM file.

    Raises:
        subprocess.CalledProcessError: if the final (lossless) conversion
            fails -- assuming ``do.run`` raises it, as the lossy branch expects.
    """
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "archive"))
    out_file = os.path.join(out_dir, "%s.cram" % os.path.splitext(os.path.basename(in_bam))[0])
    cores = dd.get_num_cores(data)
    ref_file = dd.get_ref_file(data)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            compress_type = dd.get_archive(data)
            scramble = config_utils.get_program("scramble", data["config"])
            # Keep options and positional operands separate so each attempt
            # builds its own command; options always precede the operands.
            base_opts = [scramble, "-I", "bam", "-O", "cram", "-9", "-X", "archive",
                         "-r", ref_file, "-V", "3.0", "-t", cores]
            io_args = [in_bam, tx_out_file]
            compressed = False
            if "cram" in compress_type:
                try:
                    lossy_cmd = base_opts + ["-B", "-n"] + io_args
                    do.run(lossy_cmd, "Compress BAM to CRAM: quality score binning")
                    compressed = True
                # Best-effort: a failed lossy run falls through to lossless below
                except subprocess.CalledProcessError:
                    pass
            if not compressed:
                # Built fresh from base_opts so the lossy flags never leak
                # into the lossless fallback.
                do.run(base_opts + io_args, "Compress BAM to CRAM: lossless")
    index(out_file, data["config"])
    return out_file
예제 #4
0
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["archive"] = any([dd.get_archive(d) for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
예제 #5
0
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False or
                                           not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["archive"] = any([dd.get_archive(d) for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints
예제 #6
0
def compress(in_bam, data):
    """Convert a BAM file to an indexed CRAM file next to the input.

    When `cram` is in the sample's archive setting and the bamUtil `bam`
    program is available, the BAM is first piped through `bam squeeze`
    with quality score binning:

    http://genome.sph.umich.edu/wiki/BamUtil:_squeeze

    If that is not requested, not available, or the squeeze pipeline fails,
    the BAM is converted directly with `samtools view -C`
    (`cram-lossless`). Both paths pass `-x BD -x BI` to samtools.

    The CRAM file is written beside the input (same path with a `.cram`
    extension), is only generated when missing, and is indexed on every call.
    Returns the CRAM path.
    """
    bam_base = os.path.splitext(in_bam)[0]
    out_file = "%s.cram" % bam_base
    cores = dd.get_num_cores(data)
    ref_file = dd.get_ref_file(data)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            archive_opts = dd.get_archive(data)
            samtools = config_utils.get_program("samtools", data["config"])
            try:
                bam_cmd = config_utils.get_program("bam", data["config"])
            except config_utils.CmdNotFound:
                bam_cmd = None
            # Values substituted into the command templates below.
            fields = {"samtools": samtools, "ref_file": ref_file, "cores": cores,
                      "tx_out_file": tx_out_file, "in_bam": in_bam,
                      "bam_cmd": bam_cmd}
            to_cram = "{samtools} view -T {ref_file} -@ {cores} -C -x BD -x BI -o {tx_out_file}"
            squeezed = False
            if "cram" in archive_opts and bam_cmd:
                squeeze_tmpl = ("{bam_cmd} squeeze --in {in_bam} --out -.ubam --keepDups "
                                "--binQualS=2,10,20,25,30,35,70 --binMid | " + to_cram)
                try:
                    do.run(squeeze_tmpl.format(**fields),
                           "Compress BAM to CRAM: quality score binning")
                # Retry failures avoiding using bam squeeze which can cause issues
                except subprocess.CalledProcessError:
                    pass
                else:
                    squeezed = True
            if not squeezed:
                lossless_tmpl = to_cram + " {in_bam}"
                do.run(lossless_tmpl.format(**fields),
                       "Compress BAM to CRAM: lossless")
    index(out_file, data["config"])
    return out_file
예제 #7
0
def compress(in_bam, data):
    """Convert a BAM file to an indexed CRAM file in the work `archive` directory.

    With `cram` in the sample's archive setting and the bamUtil `bam`
    program available, the reads go through `bam squeeze` with quality
    score binning before conversion:

    http://genome.sph.umich.edu/wiki/BamUtil:_squeeze

    Otherwise, or when the squeeze pipeline fails, `samtools view -C`
    converts the BAM directly (`cram-lossless`). Both paths pass
    `-x BD -x BI` to samtools.

    Output goes to ``<work_dir>/archive/<bam basename>.cram``; conversion is
    skipped when that file exists, and the file is indexed on every call.
    Returns the CRAM path.
    """
    archive_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "archive"))
    sample_base = os.path.splitext(os.path.basename(in_bam))[0]
    out_file = os.path.join(archive_dir, "%s.cram" % sample_base)
    cores = dd.get_num_cores(data)
    ref_file = dd.get_ref_file(data)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            compress_type = dd.get_archive(data)
            samtools = config_utils.get_program("samtools", data["config"])
            try:
                bam_cmd = config_utils.get_program("bam", data["config"])
            except config_utils.CmdNotFound:
                bam_cmd = None

            def _run(template, message):
                # Fill the command template with this call's values and execute it.
                do.run(template.format(samtools=samtools, ref_file=ref_file,
                                       cores=cores, tx_out_file=tx_out_file,
                                       in_bam=in_bam, bam_cmd=bam_cmd),
                       message)

            to_cram = ("{samtools} view -T {ref_file} -@ {cores} "
                       "-C -x BD -x BI -o {tx_out_file}")
            needs_lossless = True
            if "cram" in compress_type and bam_cmd:
                try:
                    _run("{bam_cmd} squeeze --in {in_bam} --out -.ubam --keepDups "
                         "--binQualS=2,10,20,25,30,35,70 --binMid | " + to_cram,
                         "Compress BAM to CRAM: quality score binning")
                    needs_lossless = False
                # Retry failures avoiding using bam squeeze which can cause issues
                except subprocess.CalledProcessError:
                    pass
            if needs_lossless:
                _run(to_cram + " {in_bam}", "Compress BAM to CRAM: lossless")
    index(out_file, data["config"])
    return out_file