Esempio n. 1
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    Runs novoalign on the input fastq file(s) and streams its SAM output
    through ``samtools view`` into ``samtools sort``. Nothing is run when the
    output file already exists.

    Args:
        fastq_file: First (or only) fastq file, passed to novoalign via ``-f``.
        pair_file: Optional second fastq file; any falsy value is replaced by
            an empty string so only one file is passed.
        ref_file: Reference passed to novoalign via ``-d``.
        names: Mapping providing ``"lane"`` (names the output file) and
            ``"sample"`` (used in the log message); also given to
            ``get_rg_info``.
        align_dir: Directory in which the output BAM is placed.
        config: Configuration mapping with an ``"algorithm"`` section.

    Returns:
        Path ``<align_dir>/<lane>-sort.bam``.

    Raises:
        subprocess.CalledProcessError: If any stage of the pipeline exits
            with a non-zero status.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    # A key that is present but empty may come back as None; treat it the
    # same as a missing key instead of failing on ``None.lower()``.
    qual_format = (config["algorithm"].get("quality_format") or "").lower()
    qual_flag = "ILMFQ" if qual_format == "illumina" else "STDFQ"
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        # The temporary directory is still created around the run as before;
        # the command itself does not reference it.
        with utils.curdir_tmpdir():
            with file_transaction(out_file) as tx_out_file:
                # The sort step is given an output prefix (no extension).
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -F {qual_flag} -c {num_cores} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                # Name every substitution explicitly rather than relying on
                # whatever happens to be in locals().
                cmd = cmd.format(novoalign=novoalign, rg_info=rg_info,
                                 ref_file=ref_file, fastq_file=fastq_file,
                                 pair_file=pair_file, qual_flag=qual_flag,
                                 num_cores=num_cores, samtools=samtools,
                                 max_mem=max_mem, tx_out_prefix=tx_out_prefix)
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                # A plain shell pipeline reports only the exit status of its
                # last stage, so a failing novoalign could go unnoticed and
                # leave a truncated BAM. Run under bash with pipefail so a
                # failure in any stage raises CalledProcessError.
                subprocess.check_call(["bash", "-c", "set -o pipefail; " + cmd])
    return out_file
Esempio n. 2
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes to avoid writing intermediate files between steps:
      - novosort of the input BAM, run with ``-n`` and ``--compression 0``
        (NOTE(review): ``-n`` is presumably a read-name sort so that pairs
        reach the aligner together -- confirm against the novosort docs)
      - alignment with novoalign, reading from stdin with ``-F BAMPE``
      - conversion to BAM with samtools
      - final sorting with novosort, writing the output file

    Nothing is run when the output file already exists.

    Args:
        in_bam: Input BAM file to realign.
        ref_file: Reference passed to novoalign via ``-d``.
        names: Mapping providing ``"lane"`` (names the output file) and
            ``"sample"`` (used in the log message); also given to
            ``get_rg_info``.
        align_dir: Directory for the output BAM; also the base directory for
            the temporary work directory.
        config: Configuration mapping with an ``"algorithm"`` section.

    Returns:
        Path ``<align_dir>/<lane>-sort.bam``.

    Raises:
        subprocess.CalledProcessError: If any stage of the pipeline exits
            with a non-zero status.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = get_rg_info(names)
                cmd = ("{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                       " -n -t {work_dir} {in_bam} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       "  -F BAMPE -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                       "  -o {tx_out_file} /dev/stdin")
                # Name every substitution explicitly rather than relying on
                # whatever happens to be in locals().
                cmd = cmd.format(novosort=novosort, num_cores=num_cores,
                                 max_mem=max_mem, work_dir=work_dir,
                                 in_bam=in_bam, novoalign=novoalign,
                                 rg_info=rg_info, ref_file=ref_file,
                                 extra_novo_args=extra_novo_args,
                                 samtools=samtools, tx_out_file=tx_out_file)
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                # A plain shell pipeline reports only the exit status of its
                # last stage, so a failure in the first sort, the aligner or
                # the SAM->BAM conversion could go unnoticed. Run under bash
                # with pipefail so a failure in any stage raises.
                subprocess.check_call(["bash", "-c", "set -o pipefail; " + cmd])
    return out_file
Esempio n. 3
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    Runs novoalign on the input fastq file(s), adding any extra arguments
    produced by ``_novoalign_args_from_config``, and streams its SAM output
    through ``samtools view`` into ``samtools sort``. Nothing is run when the
    output file already exists.

    Args:
        fastq_file: First (or only) fastq file, passed to novoalign via ``-f``.
        pair_file: Optional second fastq file; any falsy value is replaced by
            an empty string so only one file is passed.
        ref_file: Reference passed to novoalign via ``-d``.
        names: Mapping providing ``"lane"`` (names the output file) and
            ``"sample"`` (used in the log message); also given to
            ``get_rg_info``.
        align_dir: Directory in which the output BAM is placed.
        config: Configuration mapping with an ``"algorithm"`` section.

    Returns:
        Path ``<align_dir>/<lane>-sort.bam``.

    Raises:
        subprocess.CalledProcessError: If any stage of the pipeline exits
            with a non-zero status.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        # The temporary directory is still created around the run as before;
        # the command itself does not reference it.
        with utils.curdir_tmpdir():
            with file_transaction(out_file) as tx_out_file:
                # The sort step is given an output prefix (no extension).
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = (
                    "{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                    "  -c {num_cores} {extra_novo_args} "
                    "| {samtools} view -b -S -u - "
                    "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}"
                )
                # Explicit substitutions instead of an implicit locals() dump.
                cmd = cmd.format(
                    novoalign=novoalign,
                    rg_info=rg_info,
                    ref_file=ref_file,
                    fastq_file=fastq_file,
                    pair_file=pair_file,
                    num_cores=num_cores,
                    extra_novo_args=extra_novo_args,
                    samtools=samtools,
                    max_mem=max_mem,
                    tx_out_prefix=tx_out_prefix,
                )
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                # Without pipefail only the exit status of the final
                # ``samtools sort`` is checked, so an aligner failure could
                # go unnoticed and leave a truncated BAM. Run under bash with
                # pipefail so a failure in any stage raises.
                subprocess.check_call(["bash", "-c", "set -o pipefail; " + cmd])
    return out_file
Esempio n. 4
0
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes to avoid writing intermediate files between steps:
      - novosort of the input BAM, run with ``-n`` and ``--compression 0``
        (NOTE(review): ``-n`` is presumably a read-name sort so that pairs
        reach the aligner together -- confirm against the novosort docs)
      - alignment with novoalign, reading from stdin with ``-F BAMPE``
      - conversion to BAM with samtools
      - final sorting with novosort, writing the output file

    Nothing is run when the output file already exists.

    Args:
        in_bam: Input BAM file to realign.
        ref_file: Reference passed to novoalign via ``-d``.
        names: Mapping providing ``"lane"`` (names the output file) and
            ``"sample"`` (used in the log message); also given to
            ``get_rg_info``.
        align_dir: Directory for the output BAM; also the base directory for
            the temporary work directory.
        config: Configuration mapping with an ``"algorithm"`` section.

    Returns:
        Path ``<align_dir>/<lane>-sort.bam``.

    Raises:
        subprocess.CalledProcessError: If any stage of the pipeline exits
            with a non-zero status.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = get_rg_info(names)
                cmd = (
                    "{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                    " -n -t {work_dir} {in_bam} "
                    "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                    "  -F BAMPE -c {num_cores} {extra_novo_args} "
                    "| {samtools} view -b -S -u - "
                    "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                    "  -o {tx_out_file} /dev/stdin")
                # Explicit substitutions instead of an implicit locals() dump.
                cmd = cmd.format(
                    novosort=novosort,
                    num_cores=num_cores,
                    max_mem=max_mem,
                    work_dir=work_dir,
                    in_bam=in_bam,
                    novoalign=novoalign,
                    rg_info=rg_info,
                    ref_file=ref_file,
                    extra_novo_args=extra_novo_args,
                    samtools=samtools,
                    tx_out_file=tx_out_file,
                )
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                # Without pipefail only the exit status of the final novosort
                # is checked, so a failure in an earlier stage could go
                # unnoticed. Run under bash with pipefail so a failure in any
                # stage raises.
                subprocess.check_call(["bash", "-c", "set -o pipefail; " + cmd])
    return out_file
Esempio n. 5
0
def bcbio_variation_comparison(config_file, base_dir, data):
    """Run the bcbio.variation ``variant-compare`` command on a configuration file.

    Args:
        config_file: Comparison configuration passed as the final argument.
        base_dir: Directory under which a ``tmp`` directory is created and
            handed to the JVM as ``java.io.tmpdir``.
        data: Mapping whose ``"config"`` entry is used to locate the
            bcbio.variation jar and its resource settings.

    Raises:
        subprocess.CalledProcessError: If the java process exits non-zero.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    jar_dir = config_utils.get_program("bcbio_variation", data["config"], "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", jar_dir)
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    # Fall back to a modest default heap when no JVM options are configured.
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    tmpdir_flag = "-Djava.io.tmpdir=%s" % tmp_dir
    jar_args = ["-jar", bv_jar, "variant-compare", config_file]
    cmd = ["java"] + jvm_opts + [tmpdir_flag] + jar_args
    log_cmd("Comparing variant calls using bcbio.variation", data, " ".join(cmd))
    subprocess.check_call(cmd)
Esempio n. 6
0
def bcbio_variation_comparison(config_file, base_dir, data):
    """Compare variant calls with the bcbio.variation toolkit.

    Builds a ``java ... -jar <bcbio.variation jar> variant-compare
    <config_file>`` command, logs it and runs it.

    Args:
        config_file: Comparison configuration passed as the final argument.
        base_dir: Directory under which a ``tmp`` directory is created and
            handed to the JVM as ``java.io.tmpdir``.
        data: Mapping whose ``"config"`` entry is used to locate the
            bcbio.variation jar and its resource settings.

    Raises:
        subprocess.CalledProcessError: If the java process exits non-zero.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    install_dir = config_utils.get_program("bcbio_variation", data["config"],
                                           "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", install_dir)
    resources = config_utils.get_resources("bcbio_variation", data["config"])
    # Default JVM memory settings apply only when none are configured.
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    tool_args = ["-jar", bv_jar, "variant-compare", config_file]
    cmd = ["java"] + jvm_opts + java_args + tool_args
    cmd_text = " ".join(cmd)
    log_cmd("Comparing variant calls using bcbio.variation", data, cmd_text)
    subprocess.check_call(cmd)
Esempio n. 7
0
def prep_gemini_db(fnames, call_id, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Multiple input files are first merged with
    ``genotype.combine_variant_files``; a single input is loaded directly.
    Nothing is run when the database file already exists.

    Args:
        fnames: List of input VCF files.
        call_id: Sequence of strings; joined with ``-`` to name the database.
        data: Mapping providing ``["dirs"]["work"]``, ``"config"``,
            ``"sam_ref"`` and ``["info"]["provenance"]``.

    Returns:
        ``[[call_id, gemini_db]]`` where ``gemini_db`` is the database path.
    """
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    result = [[call_id, gemini_db]]
    if utils.file_exists(gemini_db):
        return result

    if len(fnames) > 1:
        # Merged VCF sits next to the database, sharing its base name.
        merged_vcf = "%s.vcf" % os.path.splitext(gemini_db)[0]
        gemini_vcf = genotype.combine_variant_files(fnames, merged_vcf, data["sam_ref"],
                                                    data["config"])
    else:
        gemini_vcf = fnames[0]
    with file_transaction(gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}".format(
            gemini=gemini, gemini_vcf=gemini_vcf, num_cores=num_cores,
            tx_gemini_db=tx_gemini_db)
        log_cmd("Create gemini database for %s" % str(call_id), data["info"]["provenance"], cmd)
        subprocess.check_call(cmd, shell=True)
    return result
Esempio n. 8
0
def prep_gemini_db(fnames, call_id, data):
    """Build a gemini database from snpEff-prepared VCF inputs.

    When more than one VCF is supplied they are combined into a single file
    with ``genotype.combine_variant_files`` before loading. The load step is
    skipped when the database file already exists.

    Args:
        fnames: List of input VCF files.
        call_id: Sequence of strings; joined with ``-`` to name the database.
        data: Mapping providing ``["dirs"]["work"]``, ``"config"``,
            ``"sam_ref"`` and ``["info"]["provenance"]``.

    Returns:
        ``[[call_id, gemini_db]]`` where ``gemini_db`` is the database path.
    """
    gemini_dir = os.path.join(data["dirs"]["work"], "gemini")
    out_dir = utils.safe_makedir(gemini_dir)
    db_name = "-".join(call_id) + ".db"
    gemini_db = os.path.join(out_dir, db_name)
    if utils.file_exists(gemini_db):
        return [[call_id, gemini_db]]

    if len(fnames) > 1:
        # Combined VCF takes the database path with its extension swapped.
        combined_target = "%s.vcf" % os.path.splitext(gemini_db)[0]
        gemini_vcf = genotype.combine_variant_files(
            fnames, combined_target, data["sam_ref"], data["config"])
    else:
        gemini_vcf = fnames[0]

    with file_transaction(gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        template = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
        cmd = template.format(
            gemini=gemini,
            gemini_vcf=gemini_vcf,
            num_cores=num_cores,
            tx_gemini_db=tx_gemini_db,
        )
        log_cmd("Create gemini database for %s" % str(call_id),
                data["info"]["provenance"], cmd)
        subprocess.check_call(cmd, shell=True)
    return [[call_id, gemini_db]]