Example #1
0
def _get_star_dirnames(align_dir, data, names):
    ALIGNED_OUT_FILE = "Aligned.out.sam"
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + ALIGNED_OUT_FILE
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    return StarOutDirs(out_dir, out_file, out_prefix, final_out)
Example #2
0
def _get_star_dirnames(align_dir, data, names):
    ALIGNED_OUT_FILE = "Aligned.out.sam"
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + ALIGNED_OUT_FILE
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    return StarOutDirs(out_dir, out_file, out_prefix, final_out)
Example #3
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
        return data

    bismark = config_utils.get_program("bismark", config)
    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = dd.get_num_cores(data)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) / (1024.0 * 1024.0)
    instances = calculate_bismark_instances(max_cores, max_mem * max_cores)
    # override instances if specified in the config
    if resources and resources.get("bismark_threads"):
        instances = resources.get("bismark_threads")
        logger.info(f"Using {instances} bismark instances - overriden by resources")
    bowtie_threads = 1
    if resources and resources.get("bowtie_threads"):
        bowtie_threads = resources.get("bowtie_threads")
    logger.info(f"Using {bowtie_threads} bowtie threads per bismark instance")
    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --parallel {instances} -p {bowtie_threads} -o {tx_out_dir} --unmapped {ref_file} {fastq_file} "
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    # don't process bam in the bismark pipeline!
    utils.symlink_plus(raw_bam[0], final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    return data
Example #4
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = True if data["analysis"].lower().startswith("smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                "--chimScoreSeparation 5 "
                "--chimOutType WithinSAM ")
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "

    if not srna:
        cmd += " --quantMode TranscriptomeSAM "

    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data
Example #5
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith(
        "wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "bismark index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = resources.get("cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G"))
    n = min(max(int(max_cores / 5), 1),
            max(int(max_mem / config_utils.convert_to_bytes("12G")), 1))

    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}"
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file,
                                                                    ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    process_bam = _process_bam(raw_bam[0], fastq_files, sample,
                               dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data
Example #6
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s" %
           " ".join(ALIGN_TAGS))
    cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"),
                               False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_rsem(data) and not is_transcriptome_broken():
        cmd += " --quantMode TranscriptomeSAM "

    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data
Example #7
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error("STAR index not found. We don't provide the STAR indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data

    star_path = config_utils.get_program("STAR", config)
    fastq = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = config["algorithm"].get("num_cores", 1)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax 10 "
           "--outStd SAM "
           "--outSAMunmapped Within --outSAMattributes %s" % " ".join(ALIGN_TAGS))
    cmd = cmd + " --readFilesCommand zcat " if is_gzipped(fastq_file) else cmd
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"), False)
    if fusion_mode:
        cmd += " --chimSegmentMin 15 --chimJunctionOverhangMin 15"
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded":
        cmd += " --outSAMstrandField intronMotif "

    if dd.get_rsem(data) and not is_transcriptome_broken():
        cmd += " --quantMode TranscriptomeSAM "

    with tx_tmpdir(data) as tmp_dir:
        sam_to_bam = bam.sam_to_bam_stream_cmd(config)
        sort = bam.sort_cmd(config, tmp_dir)
        cmd += "| {sam_to_bam} | {sort} -o {tx_final_out} "
        run_message = "Running STAR aligner on %s and %s" % (fastq_file, ref_file)
        with file_transaction(data, final_out) as tx_final_out:
            do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data
Example #8
0
 def __init__(self, data):
     self._db_location = self._get_ericscript_db(data)
     self._sample_name = dd.get_lane(data)
     self._work_dir = dd.get_work_dir(data)
     self._env = None
     self._output_dir = None
     self._sample_out_dir = None
Example #9
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparible alignment"
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    ref_file = dd.get_sam_ref(data)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        return data

    bsmap = config_utils.get_program("bsmap", config)
    fastq_files = " -a %s" % fastq_file
    num_cores = dd.get_num_cores(data)
    num_cores = "-p %d" % num_cores
    safe_makedir(align_dir)
    cmd = "{bsmap} {num_cores} -w 100 -v 0.07 -m 10 -x 300 -o {tx_out_bam} -d {ref_file} {fastq_files}"
    if pair_file:
        fastq_files = "-a %s -b %s" % (fastq_file, pair_file)
    if not final_out:
        with file_transaction(final_out) as tx_out_bam:
            run_message = "Running BSMAP aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
    data = dd.set_work_bam(data, final_out)
    return data
Example #10
0
 def __init__(self, data):
     self._db_location = self._get_ericscript_db(data)
     self._sample_name = dd.get_lane(data)
     self._work_dir = dd.get_work_dir(data)
     self._env = None
     self._output_dir = None
     self._sample_out_dir = None
Example #11
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = (
        "{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
        "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks/stringtie can use
    if dd.get_transcript_assembler(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data) == "rna-seq":
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." %(fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
Example #13
0
def test_get_star_dirnames(data, names):
    align_dir = "/path/to/align/dir"
    lane = dd.get_lane(data)
    result = _get_star_dirnames(align_dir, data, names)
    assert result.out_dir == "/path/to/align/dir/%s_star" % lane
    assert result.out_prefix == "/path/to/align/dir/%s" % lane
    assert result.out_file == "/path/to/align/dir/%sAligned.out.sam" % lane
    assert result.final_out == "/path/to/align/dir/%s_star/%s.bam" % (lane, names["sample"])
Example #14
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith(
        "wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "bismark index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data

    bismark = config_utils.get_program("bismark", config)
    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    n = 1 if num_cores < 5 else 2
    safe_makedir(align_dir)
    cmd = "{bismark} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}"
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file,
                                                                    ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    process_bam = _process_bam(raw_bam[0], fastq_files, sample,
                               dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data
Example #15
0
def test_get_star_dirnames(data, names):
    align_dir = '/path/to/align/dir'
    lane = dd.get_lane(data)
    result = _get_star_dirnames(align_dir, data, names)
    assert result.out_dir == '/path/to/align/dir/%s_star' % lane
    assert result.out_prefix == '/path/to/align/dir/%s' % lane
    assert result.out_file == '/path/to/align/dir/%sAligned.out.sam' % lane
    assert result.final_out == '/path/to/align/dir/%s_star/%s.bam' % (
        lane, names['sample'])
Example #16
0
def test_get_star_dirnames(data, names):
    from bcbio.ngsalign.star import _get_star_dirnames
    align_dir = '/path/to/align/dir'
    lane = dd.get_lane(data)
    result = star._get_star_dirnames(align_dir, data, names)
    assert result.out_dir == '/path/to/align/dir/%s_star' % lane
    assert result.out_prefix == '/path/to/align/dir/%s' % lane
    assert result.out_file == '/path/to/align/dir/%sAligned.out.sam' % lane
    assert result.final_out == '/path/to/align/dir/%s_star/%s.bam' % (
        lane, names['sample'])
Example #17
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names,
                            data)
        out_log_file = os.path.join(align_dir,
                                    dd.get_lane(data) + "Log.final.out")
        data = dd.update_summary_qc(data, "star", base=out_log_file)
        return data

    star_path = config_utils.get_program("STAR", config)

    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f

    fastq_files = (" ".join([
        _unpack_fastq(fastq_file),
        _unpack_fastq(pair_file)
    ]) if pair_file else _unpack_fastq(fastq_file))
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_transcriptome_gtf(data)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)

    if index_has_alts(ref_file):
        logger.error(
            "STAR is being run on an index with ALTs which STAR is not "
            "designed for. Please remake your STAR index or use an ALT-aware "
            "aligner like hisat2")
        sys.exit(1)
    with file_transaction(data, align_dir) as tx_align_dir:
        tx_1pass_dir = tx_align_dir + "1pass"
        tx_star_dirnames = _get_star_dirnames(tx_1pass_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_1pass_dir)
        safe_makedir(tx_out_dir)
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "--outSAMtype BAM Unsorted "
            "--outSAMmapqUnique 60 "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            if "arriba" in dd.get_fusion_caller(data):
                cmd += (
                    "--chimSegmentMin 10 --chimOutType WithinBAM "
                    "--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
                    "--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
                    "--alignSJstitchMismatchNmax 5 -1 5 5 "
                    "--chimSegmentReadGapMax 3 "
                    "--peOverlapNbasesMin 10 "
                    "--alignSplicedMateMapLminOverLmate 0.5 ")
            else:
                cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                        "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                        "--chimScoreSeparation 5 ")
                if "oncofuse" in dd.get_fusion_caller(data):
                    cmd += "--chimOutType Junctions "
                else:
                    cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "

        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join(
                [str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running 1st pass of STAR aligner on %s and %s" % (
            fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

        sjfile = get_splicejunction_file(tx_out_dir, data)
        sjflag = f"--sjdbFileChrStartEnd {sjfile}" if sjfile else ""
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "{sjflag} "
            "--outSAMtype BAM Unsorted "
            "--outSAMmapqUnique 60 "
            "--outSAMunmapped Within --outSAMattributes %s " %
            " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file,
                                      gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            if "arriba" in dd.get_fusion_caller(data):
                cmd += (
                    "--chimSegmentMin 10 --chimOutType WithinBAM SoftClip Junctions "
                    "--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
                    "--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
                    "--alignSJstitchMismatchNmax 5 -1 5 5 "
                    "--chimSegmentReadGapMax 3 ")
            else:
                cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                        "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                        "--chimScoreSeparation 5 ")
                if "oncofuse" in dd.get_fusion_caller(data):
                    cmd += "--chimOutType Junctions "
                else:
                    cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data,
                                    ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "

        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join(
                [str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running 2nd pass of STAR aligner on %s and %s" % (
            fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    out_log_file = os.path.join(align_dir, dd.get_lane(data) + "Log.final.out")
    data = dd.update_summary_qc(data, "star", base=out_log_file)
    return data
Example #18
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_file = out_prefix + "Aligned.out.sam"
    out_dir = os.path.join(align_dir, "%s_star" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        data = _update_data(final_out, out_dir, names, data)
        return data
    star_path = config_utils.get_program("STAR", config)
    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    gtf_file = dd.get_gtf_file(data)

    safe_makedir(align_dir)
    cmd = ("{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
           "--runThreadN {num_cores} --outFileNamePrefix {out_prefix} "
           "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
           "--outStd SAM {srna_opts} "
           "--outSAMunmapped Within --outSAMattributes %s " %
           " ".join(ALIGN_TAGS))
    cmd += _add_sj_index_commands(fastq_file, ref_file,
                                  gtf_file) if not srna else ""
    cmd += " --readFilesCommand zcat " if is_gzipped(fastq_file) else ""
    cmd += _read_group_option(names)
    fusion_mode = utils.get_in(data, ("config", "algorithm", "fusion_mode"),
                               False)
    if fusion_mode:
        cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                "--chimScoreSeparation 5 "
                "--chimOutType WithinSAM ")
    strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                "unstranded").lower()
    if strandedness == "unstranded" and not srna:
        cmd += " --outSAMstrandField intronMotif "

    if not srna:
        cmd += " --quantMode TranscriptomeSAM "

    with file_transaction(data, final_out) as tx_final_out:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        run_message = "Running STAR aligner on %s and %s" % (fastq_file,
                                                             ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    data = _update_data(final_out, out_dir, names, data)
    return data