def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    Builds a single shell pipeline -- novoalign, ``samtools view -b -S -u`` and
    ``samtools sort`` -- and runs it inside a file transaction so the final
    BAM only appears once the pipeline has finished.

    Args:
        fastq_file: first (or only) fastq file, passed to ``novoalign -f``.
        pair_file: second fastq file for paired input; any falsy value is
            replaced by an empty string so nothing extra is passed.
        ref_file: reference passed to ``novoalign -d``.
        names: mapping providing ``"lane"`` (used for the output file name)
            and ``"sample"`` (used in the log message); it is also handed to
            ``get_rg_info``.
        align_dir: directory in which the sorted BAM is written.
        config: configuration mapping; ``config["algorithm"]`` supplies
            ``num_cores`` and ``quality_format``.

    Returns:
        Path ``<align_dir>/<lane>-sort.bam``. The pipeline only runs when
        that file does not already exist.

    Raises:
        subprocess.CalledProcessError: if any stage of the pipeline exits
            with a non-zero status.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    # `or ""` keeps an explicit `quality_format: None` from crashing on .lower().
    qual_format = (config["algorithm"].get("quality_format", "") or "").lower()
    qual_flag = "ILMFQ" if qual_format == "illumina" else "STDFQ"
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        # The temporary directory is not referenced by the command; the
        # context manager is kept only to preserve its existing side effects.
        with utils.curdir_tmpdir():
            with file_transaction(out_file) as tx_out_file:
                # samtools sort is given an output prefix, not a file name.
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -F {qual_flag} -c {num_cores} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(novoalign=novoalign, rg_info=rg_info,
                                 ref_file=ref_file, fastq_file=fastq_file,
                                 pair_file=pair_file, qual_flag=qual_flag,
                                 num_cores=num_cores, samtools=samtools,
                                 max_mem=max_mem, tx_out_prefix=tx_out_prefix)
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                # A plain `shell=True` call only reports the exit status of
                # the last pipeline stage, so a failing novoalign could leave
                # a truncated BAM that the transaction would still finalise.
                # Running under bash with pipefail makes any failing stage
                # raise CalledProcessError.
                subprocess.check_call(["bash", "-o", "pipefail", "-c", cmd])
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, sorting input/output with novosort.

    Uses unix pipes to avoid IO writing between steps:
      - novosort of the input BAM, uncompressed, with the ``-n`` flag
        (NOTE(review): presumably name sorting so read pairs are adjacent
        for ``-F BAMPE`` input -- confirm against the novosort documentation)
      - alignment with novoalign reading from ``/dev/stdin``
      - conversion to BAM with samtools
      - sorting of the aligned output with novosort

    Args:
        in_bam: input BAM file to realign.
        ref_file: reference passed to ``novoalign -d``.
        names: mapping providing ``"lane"`` (used for the output file name)
            and ``"sample"`` (used in the log message); it is also handed to
            ``get_rg_info``.
        align_dir: directory for the output BAM and the temporary work
            directory.
        config: configuration mapping; ``config["algorithm"]`` supplies
            ``num_cores``.

    Returns:
        Path ``<align_dir>/<lane>-sort.bam``. The pipeline only runs when
        that file does not already exist.

    Raises:
        subprocess.CalledProcessError: if any stage of the pipeline exits
            with a non-zero status.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = get_rg_info(names)
                cmd = ("{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                       " -n -t {work_dir} {in_bam} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                       " -o {tx_out_file} /dev/stdin")
                cmd = cmd.format(novosort=novosort, num_cores=num_cores,
                                 max_mem=max_mem, work_dir=work_dir,
                                 in_bam=in_bam, novoalign=novoalign,
                                 rg_info=rg_info, ref_file=ref_file,
                                 extra_novo_args=extra_novo_args,
                                 samtools=samtools, tx_out_file=tx_out_file)
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                # A plain `shell=True` call only reports the exit status of
                # the final novosort, so a failure in an earlier stage could
                # leave a truncated BAM that the transaction would still
                # finalise. Running under bash with pipefail makes any
                # failing stage raise CalledProcessError.
                subprocess.check_call(["bash", "-o", "pipefail", "-c", cmd])
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    Builds a single shell pipeline -- novoalign, ``samtools view -b -S -u`` and
    ``samtools sort`` -- and runs it inside a file transaction so the final
    BAM only appears once the pipeline has finished.

    Args:
        fastq_file: first (or only) fastq file, passed to ``novoalign -f``.
        pair_file: second fastq file for paired input; any falsy value is
            replaced by an empty string so nothing extra is passed.
        ref_file: reference passed to ``novoalign -d``.
        names: mapping providing ``"lane"`` (used for the output file name)
            and ``"sample"`` (used in the log message); it is also handed to
            ``get_rg_info``.
        align_dir: directory in which the sorted BAM is written.
        config: configuration mapping; ``config["algorithm"]`` supplies
            ``num_cores`` and the whole mapping is passed to
            ``_novoalign_args_from_config`` for extra novoalign arguments.

    Returns:
        Path ``<align_dir>/<lane>-sort.bam``. The pipeline only runs when
        that file does not already exist.

    Raises:
        subprocess.CalledProcessError: if any stage of the pipeline exits
            with a non-zero status.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        # The temporary directory is not referenced by the command; the
        # context manager is kept only to preserve its existing side effects.
        with utils.curdir_tmpdir():
            with file_transaction(out_file) as tx_out_file:
                # samtools sort is given an output prefix, not a file name.
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(novoalign=novoalign, rg_info=rg_info,
                                 ref_file=ref_file, fastq_file=fastq_file,
                                 pair_file=pair_file, num_cores=num_cores,
                                 extra_novo_args=extra_novo_args,
                                 samtools=samtools, max_mem=max_mem,
                                 tx_out_prefix=tx_out_prefix)
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                # A plain `shell=True` call only reports the exit status of
                # the last pipeline stage, so a failing novoalign could leave
                # a truncated BAM that the transaction would still finalise.
                # Running under bash with pipefail makes any failing stage
                # raise CalledProcessError.
                subprocess.check_call(["bash", "-o", "pipefail", "-c", cmd])
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, sorting input/output with novosort.

    Uses unix pipes to avoid IO writing between steps:
      - novosort of the input BAM, uncompressed, with the ``-n`` flag
        (NOTE(review): presumably name sorting so read pairs are adjacent
        for ``-F BAMPE`` input -- confirm against the novosort documentation)
      - alignment with novoalign reading from ``/dev/stdin``
      - conversion to BAM with samtools
      - sorting of the aligned output with novosort

    Args:
        in_bam: input BAM file to realign.
        ref_file: reference passed to ``novoalign -d``.
        names: mapping providing ``"lane"`` (used for the output file name)
            and ``"sample"`` (used in the log message); it is also handed to
            ``get_rg_info``.
        align_dir: directory for the output BAM and the temporary work
            directory.
        config: configuration mapping; ``config["algorithm"]`` supplies
            ``num_cores``.

    Returns:
        Path ``<align_dir>/<lane>-sort.bam``. The pipeline only runs when
        that file does not already exist.

    Raises:
        subprocess.CalledProcessError: if any stage of the pipeline exits
            with a non-zero status.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))

    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = get_rg_info(names)
                cmd = ("{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                       " -n -t {work_dir} {in_bam} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                       " -o {tx_out_file} /dev/stdin")
                cmd = cmd.format(novosort=novosort, num_cores=num_cores,
                                 max_mem=max_mem, work_dir=work_dir,
                                 in_bam=in_bam, novoalign=novoalign,
                                 rg_info=rg_info, ref_file=ref_file,
                                 extra_novo_args=extra_novo_args,
                                 samtools=samtools, tx_out_file=tx_out_file)
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                # A plain `shell=True` call only reports the exit status of
                # the final novosort, so a failure in an earlier stage could
                # leave a truncated BAM that the transaction would still
                # finalise. Running under bash with pipefail makes any
                # failing stage raise CalledProcessError.
                subprocess.check_call(["bash", "-o", "pipefail", "-c", cmd])
    return out_file
def bcbio_variation_comparison(config_file, base_dir, data):
    """Run a bcbio.variation ``variant-compare`` job for a configuration file.

    Creates ``<base_dir>/tmp`` and uses it as the JVM temporary directory,
    locates the bcbio.variation jar through the sample configuration, logs
    the command line and runs it. JVM options come from the
    ``bcbio_variation`` resources, defaulting to ``-Xms750m -Xmx2g``.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    config = data["config"]
    jar_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", jar_dir)
    heap_opts = config_utils.get_resources("bcbio_variation", config).get(
        "jvm_opts", ["-Xms750m", "-Xmx2g"])
    # Assemble: java <jvm options> <tmpdir property> -jar <jar> <subcommand> <config>
    java_cmd = ["java"] + heap_opts
    java_cmd = java_cmd + ["-Djava.io.tmpdir=%s" % tmp_dir,
                           "-jar", bv_jar, "variant-compare", config_file]
    log_cmd("Comparing variant calls using bcbio.variation", data,
            " ".join(java_cmd))
    subprocess.check_call(java_cmd)
def bcbio_variation_comparison(config_file, base_dir, data):
    """Run a bcbio.variation ``variant-compare`` job for a configuration file.

    Creates ``<base_dir>/tmp`` and uses it as the JVM temporary directory,
    locates the bcbio.variation jar through the sample configuration, logs
    the command line and runs it. JVM options come from the
    ``bcbio_variation`` resources, defaulting to ``-Xms750m -Xmx2g``.
    """
    tmp_dir = utils.safe_makedir(os.path.join(base_dir, "tmp"))
    config = data["config"]
    jar_dir = config_utils.get_program("bcbio_variation", config, "dir")
    bv_jar = config_utils.get_jar("bcbio.variation", jar_dir)
    heap_opts = config_utils.get_resources("bcbio_variation", config).get(
        "jvm_opts", ["-Xms750m", "-Xmx2g"])
    # Assemble: java <jvm options> <tmpdir property> -jar <jar> <subcommand> <config>
    java_cmd = ["java"] + heap_opts
    java_cmd = java_cmd + ["-Djava.io.tmpdir=%s" % tmp_dir,
                           "-jar", bv_jar, "variant-compare", config_file]
    log_cmd("Comparing variant calls using bcbio.variation", data,
            " ".join(java_cmd))
    subprocess.check_call(java_cmd)
def prep_gemini_db(fnames, call_id, data):
    """Build a gemini database from snpEff-prepared VCF input(s).

    The database is written to ``<work dir>/gemini/<call_id joined by '-'>.db``.
    When more than one VCF is supplied they are first combined into a single
    VCF next to the database; a single VCF is loaded directly. Nothing is run
    if the database already exists.

    Returns a one-element list holding ``[call_id, gemini_db]``.
    """
    gemini_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(gemini_dir, "-".join(call_id) + ".db")
    if utils.file_exists(gemini_db):
        return [[call_id, gemini_db]]

    if len(fnames) > 1:
        combined_vcf = "%s.vcf" % os.path.splitext(gemini_db)[0]
        input_vcf = genotype.combine_variant_files(
            fnames, combined_vcf, data["sam_ref"], data["config"])
    else:
        input_vcf = fnames[0]

    with file_transaction(gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini_exe = config_utils.get_program("gemini", config)
        cores = config["algorithm"].get("num_cores", 1)
        load_cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}".format(
            gemini=gemini_exe, gemini_vcf=input_vcf, num_cores=cores,
            tx_gemini_db=tx_gemini_db)
        log_cmd("Create gemini database for %s" % str(call_id),
                data["info"]["provenance"], load_cmd)
        subprocess.check_call(load_cmd, shell=True)
    return [[call_id, gemini_db]]
def prep_gemini_db(fnames, call_id, data):
    """Build a gemini database from snpEff-prepared VCF input(s).

    The database is written to ``<work dir>/gemini/<call_id joined by '-'>.db``.
    When more than one VCF is supplied they are first combined into a single
    VCF next to the database; a single VCF is loaded directly. Nothing is run
    if the database already exists.

    Returns a one-element list holding ``[call_id, gemini_db]``.
    """
    gemini_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(gemini_dir, "-".join(call_id) + ".db")
    if utils.file_exists(gemini_db):
        return [[call_id, gemini_db]]

    if len(fnames) > 1:
        combined_vcf = "%s.vcf" % os.path.splitext(gemini_db)[0]
        input_vcf = genotype.combine_variant_files(
            fnames, combined_vcf, data["sam_ref"], data["config"])
    else:
        input_vcf = fnames[0]

    with file_transaction(gemini_db) as tx_gemini_db:
        config = data["config"]
        gemini_exe = config_utils.get_program("gemini", config)
        cores = config["algorithm"].get("num_cores", 1)
        load_cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}".format(
            gemini=gemini_exe, gemini_vcf=input_vcf, num_cores=cores,
            tx_gemini_db=tx_gemini_db)
        log_cmd("Create gemini database for %s" % str(call_id),
                data["info"]["provenance"], load_cmd)
        subprocess.check_call(load_cmd, shell=True)
    return [[call_id, gemini_db]]