Esempio n. 1
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data,
          extra_args=None):
    """Align single or paired fastq input with bowtie2, piping output to BAM.

    The BAM is named ``<sample name>-sort.bam`` inside ``align_dir``. When
    ``data["align_split"]`` is set, the output path and the fastq inputs are
    replaced by the values returned from ``alignprep.setup_combine`` and
    ``alignprep.split_namedpipe_cls``. The aligner is skipped when the output
    (or the combined final file) already exists.

    Returns the path of the output BAM file.
    """
    config = data["config"]
    analysis_config = ANALYSIS.get(data["analysis"].lower())
    assert analysis_config, "Analysis %s is not supported by bowtie2" % (data["analysis"])
    bam_name = "{0}-sort.bam".format(dd.get_sample_name(data))
    out_file = os.path.join(align_dir, bam_name)
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    # Nothing to do if this piece, or the combined result, is already on disk.
    if utils.file_exists(out_file) or (final_file is not None and utils.file_exists(final_file)):
        return out_file
    with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
        args = [config_utils.get_program("bowtie2", config)]
        if extra_args is not None:
            args.extend(extra_args)
        args.extend(["-q", "-x", ref_file])
        args.extend(analysis_config.get("params", []))
        if pair_file:
            args.extend(["-1", fastq_file, "-2", pair_file])
        else:
            args.extend(["-U", fastq_file])
        if names and "rg" in names:
            args.extend(["--rg-id", names["rg"]])
            # Optional read group fields, added only when a value is present.
            for field, tag in (("sample", "SM"), ("pl", "PL"), ("pu", "PU"), ("lb", "LB")):
                value = names.get(field)
                if value:
                    args.extend(["--rg", "%s:%s" % (tag, value)])
        # Configuration derived arguments go last and may inspect what is already set.
        args.extend(_bowtie2_args_from_config(config, args))
        bowtie2_cmd = " ".join(str(arg) for arg in args)
        cmd = "unset JAVA_HOME && " + bowtie2_cmd + " | " + tobam_cl
        do.run(cmd, "Aligning %s and %s with Bowtie2." % (fastq_file, pair_file))
    return out_file
Esempio n. 2
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align fastq input with minimap2 (``sr`` preset), piping the output to BAM.

    The BAM is named ``<sample name>-sort.bam`` in ``align_dir``, with a
    ``-cumi`` suffix before the extension when ``"umi_bam"`` is present in
    ``data``. Split inputs (``data["align_split"]``) are handled through
    ``alignprep``. The aligner is skipped when the output already exists.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    bam_name = "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext)
    out_file = os.path.join(align_dir, bam_name)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)

    pair_file = pair_file or ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # Pre-built minimap2 indices are deliberately not used: they give
            # only slight speed-ups and inconsistent BAM headers, so the
            # reference file is always passed as the index.
            reference = dd.get_ref_file(data)
            template = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                        "{fastq_file} {pair_file} | ")
            minimap_cmd = template.format(preset="sr", rg_info=rg_info, num_cores=num_cores,
                                          index_file=reference, fastq_file=fastq_file,
                                          pair_file=pair_file)
            do.run(minimap_cmd + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Esempio n. 3
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq (or ``.sdf``) input with bwa and record the sorted BAM.

    Uses ``_align_mem`` unless bwa-mem is not forced on via the ``tools_on``
    setting and is either switched off via ``tools_off`` or rejected by
    ``_can_use_mem``; in that case ``_align_backtrack`` is used instead.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    pair_file = pair_file or ""
    # Backwards compatibility: older outputs were named after the lane. Keep
    # using such a file if it exists, otherwise name the output by sample.
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        sample_bam = "{0}-sort.bam".format(dd.get_sample_name(data))
        out_file = os.path.join(align_dir, sample_bam)
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    final_file = None
    from_sdf = fastq_file.endswith(".sdf")
    if data.get("align_split") or from_sdf:
        if from_sdf:
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    elif qual_format == "illumina":
        # Unsplit input in illumina quality format goes through a conversion pipe.
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    needs_alignment = not utils.file_exists(out_file) and (
        final_file is None or not utils.file_exists(final_file))
    if needs_alignment:
        # If we cannot do piping, fall back to the older bwa aln approach.
        use_backtrack = ("bwa-mem" not in dd.get_tools_on(data) and
                         ("bwa-mem" in dd.get_tools_off(data) or
                          not _can_use_mem(fastq_file, data, min_size)))
        run_aligner = _align_backtrack if use_backtrack else _align_mem
        out_file = run_aligner(fastq_file, pair_file, ref_file, out_file,
                               names, rg_info, data)
    data["work_bam"] = out_file
    return data
Esempio n. 4
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    Builds a novoalign command line, pipes it into the command supplied by
    ``postalign.tobam_cl`` and runs it through ``do.run``. The alignment is
    skipped when the output BAM (or, for split inputs, the combined final
    file) already exists.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    # Normalize a missing second read file to an empty string so it vanishes
    # from the formatted command line.
    pair_file = pair_file if pair_file else ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    # Split or .sdf inputs: alignprep picks the per-piece output file and
    # rewrites the inputs; final_file remembers the combined target.
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    # NOTE: the command below is filled in with .format(**locals()), and the
    # formatted string includes tobam_cl. Locals that look unused here
    # (samtools, max_mem, tx_out_prefix, work_dir) may therefore be referenced
    # by placeholders inside tobam_cl -- do not rename or remove them without
    # checking postalign.tobam_cl.
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("unset JAVA_HOME && "
                       "{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} | ")
                cmd = (cmd + tobam_cl).format(**locals())
                # The trailing list holds output checks on the transactional
                # file (presumably run by do.run after the command -- confirm
                # against do.run's signature).
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Esempio n. 5
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data,
          extra_args=None):
    """Run a bowtie2 alignment and pipe the result into a BAM file.

    The output is ``<sample name>-sort.bam`` within ``align_dir``. When
    ``data["align_split"]`` is set, ``alignprep`` provides the per-split
    output and inputs. An existing output (or combined final file) means the
    aligner is not run again.

    Returns the path of the output BAM file.
    """
    config = data["config"]
    analysis_config = ANALYSIS.get(data["analysis"].lower())
    assert analysis_config, "Analysis %s is not supported by bowtie2" % (data["analysis"])
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    have_output = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if have_output:
        return out_file
    with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
        parts = [config_utils.get_program("bowtie2", config)]
        # Configuration derived options first, then caller supplied extras.
        parts.extend(_bowtie2_args_from_config(config))
        if extra_args is not None:
            parts.extend(extra_args)
        parts.extend(["-q", "-x", ref_file])
        parts.extend(analysis_config.get("params", []))
        if pair_file:
            input_args = ["-1", fastq_file, "-2", pair_file]
        else:
            input_args = ["-U", fastq_file]
        parts.extend(input_args)
        if names and "rg" in names:
            parts.extend(["--rg-id", names["rg"]])
            # Only read group fields that carry a value are passed along.
            rg_fields = [("sample", "SM"), ("pl", "PL"), ("pu", "PU"), ("lb", "LB")]
            for key, tag in rg_fields:
                if names.get(key):
                    parts.extend(["--rg", "%s:%s" % (tag, names[key])])
        cmd = " ".join(map(str, parts)) + " | " + tobam_cl
        do.run(cmd, "Aligning %s and %s with Bowtie2." % (fastq_file, pair_file))
    return out_file
Esempio n. 6
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Align fastq input with minimap2 (``sr`` preset), piping the output to BAM.

    When ``index_dir`` points at an existing file, a sibling
    ``<genome build>-sr.mmi`` index is used if it exists; otherwise the
    reference file from ``dd.get_ref_file`` is passed to minimap2.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    umi_ext = "-cumi" if "umi_bam" in data else ""
    bam_name = "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext)
    out_file = os.path.join(align_dir, bam_name)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)
    preset = "sr"

    pair_file = pair_file or ""
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)

    already_done = utils.file_exists(out_file) or (
        final_file is not None and utils.file_exists(final_file))
    if not already_done:
        with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
            # If a single index is present, index_dir points to that file;
            # look for the preset-specific .mmi next to it.
            index_file = None
            if index_dir and os.path.isfile(index_dir):
                mmi_name = "%s-%s.mmi" % (dd.get_genome_build(data), preset)
                candidate = os.path.join(os.path.dirname(index_dir), mmi_name)
                if os.path.exists(candidate):
                    index_file = candidate
            if index_file is None:
                index_file = dd.get_ref_file(data)
            template = ("minimap2 -a -x {preset} -R '{rg_info}' -t {num_cores} {index_file} "
                        "{fastq_file} {pair_file} | ")
            minimap_cmd = template.format(preset=preset, rg_info=rg_info, num_cores=num_cores,
                                          index_file=index_file, fastq_file=fastq_file,
                                          pair_file=pair_file)
            do.run(minimap_cmd + tobam_cl, "minimap2 alignment: %s" % dd.get_sample_name(data))
    data["work_bam"] = out_file
    return data
Esempio n. 7
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    Builds a novoalign command line, pipes it into the command supplied by
    ``postalign.tobam_cl`` and runs it through ``do.run``. The output BAM is
    named after ``names["lane"]``. The alignment is skipped when the output
    (or, for split inputs, the combined final file) already exists.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM path.
    """
    # Normalize a missing second read file to an empty string so it vanishes
    # from the formatted command line.
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    # Split or .sdf inputs: alignprep picks the per-piece output file and
    # rewrites the inputs; final_file remembers the combined target.
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    # NOTE: the command below is filled in with .format(**locals()), and the
    # formatted string includes tobam_cl. Locals that look unused here
    # (samtools, max_mem, tx_out_prefix, work_dir) may therefore be referenced
    # by placeholders inside tobam_cl -- do not rename or remove them without
    # checking postalign.tobam_cl.
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       "  -c {num_cores} {extra_novo_args} | ")
                cmd = (cmd + tobam_cl).format(**locals())
                # The trailing list holds output checks on the transactional
                # file (presumably run by do.run after the command -- confirm
                # against do.run's signature).
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
Esempio n. 8
0
def align(fastq_file,
          pair_file,
          ref_file,
          names,
          align_dir,
          data,
          extra_args=None):
    """Run a single or paired end bowtie alignment, piping the result to BAM.

    Up to 1000 hits per read are reported for analyses whose name starts
    with ``smallrna-seq``, and a single hit otherwise. Read group information
    is attached with ``samtools addreplacerg`` before the stream reaches the
    BAM conversion command from ``postalign.tobam_cl``.

    Returns the path of the output BAM file.
    """
    is_smallrna = data["analysis"].lower().startswith("smallrna-seq")
    num_hits = 1000 if is_smallrna else 1
    config = data['config']
    bam_name = "{0}-sort.bam".format(dd.get_sample_name(data))
    out_file = os.path.join(align_dir, bam_name)
    final_file = None
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    elif fastq_file.endswith(".gz"):
        # Unsplit gzipped input is decompressed on the fly via process substitution.
        fastq_file = "<(gunzip -c %s)" % fastq_file
        if pair_file:
            pair_file = "<(gunzip -c %s)" % pair_file

    if utils.file_exists(out_file) or (final_file is not None
                                       and utils.file_exists(final_file)):
        return out_file

    with postalign.tobam_cl(data, out_file,
                            pair_file is not None) as (tobam_cl, tx_out_file):
        bowtie_args = [config_utils.get_program("bowtie", config)]
        bowtie_args.extend(_bowtie_args_from_config(data))
        if extra_args is not None:
            bowtie_args.extend(extra_args)
        bowtie_args.extend(["-q", "-v", 2, "-k", num_hits])
        # default -X is too selective for most data
        bowtie_args.extend(["-X", 2000])
        bowtie_args.extend(["--best", "--strata", "--sam", ref_file])
        if pair_file:
            bowtie_args.extend(["-1", fastq_file, "-2", pair_file])
        else:
            bowtie_args.append(fastq_file)
        bowtie_cmd = " ".join(str(arg) for arg in bowtie_args)
        rg_info = novoalign.get_rg_info(names)
        fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % rg_info
        cmd = bowtie_cmd + " | " + fix_rg_cmd + " | " + tobam_cl
        message = "Running Bowtie on %s and %s." % (fastq_file, pair_file)
        do.run(cmd, message, data)
    return out_file
Esempio n. 9
0
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align fastq (or ``.sdf``) input with bwa and record the sorted BAM.

    Picks between ``_align_backtrack``, ``_align_mem`` and ``_align_mem_hla``
    based on the tool switches in ``data`` and the HLA helper predicates.
    When ``needs_separate_hla(data)`` holds, an additional ``HLA-`` prefixed
    BAM is produced next to the main output.

    Returns ``data`` with ``data["work_bam"]`` (and possibly
    ``data["hla_bam"]``) set.
    """
    pair_file = pair_file or ""
    # Backwards compatibility: older outputs were named with lane information.
    # Reuse such a file only if the lane differs from the sample name and the
    # file exists; otherwise name the output by sample.
    out_file = None
    if names["lane"] != dd.get_sample_name(data):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not out_file or not utils.file_exists(out_file):
        umi_ext = "-cumi" if "umi_bam" in data else ""
        sample_bam = "{0}-sort{1}.bam".format(dd.get_sample_name(data), umi_ext)
        out_file = os.path.join(align_dir, sample_bam)
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    final_file = None
    from_sdf = fastq_file.endswith(".sdf")
    if data.get("align_split") or from_sdf:
        if from_sdf:
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    elif qual_format == "illumina":
        # Unsplit input in illumina quality format goes through a conversion pipe.
        fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    needs_alignment = not utils.file_exists(out_file) and (
        final_file is None or not utils.file_exists(final_file))
    if needs_alignment:
        # If we cannot do piping, use the older bwa aln approach.
        use_backtrack = ("bwa-mem" not in dd.get_tools_on(data)
                         and ("bwa-mem" in dd.get_tools_off(data)
                              or not _can_use_mem(fastq_file, data, min_size)))
        if use_backtrack:
            run_aligner = _align_backtrack
        elif is_precollapsed_bam(data) or not hla_on(data) or needs_separate_hla(data):
            run_aligner = _align_mem
        else:
            run_aligner = _align_mem_hla
        out_file = run_aligner(fastq_file, pair_file, ref_file,
                               out_file, names, rg_info, data)
    data["work_bam"] = out_file

    # bwakit will corrupt the non-HLA alignments in a UMI collapsed BAM file
    # (see https://github.com/bcbio/bcbio-nextgen/issues/3069), so HLA reads
    # are aligned into their own file in that case.
    if needs_separate_hla(data):
        hla_name = "HLA-" + os.path.basename(out_file)
        hla_target = os.path.join(os.path.dirname(out_file), hla_name)
        data["hla_bam"] = _align_mem_hla(fastq_file, pair_file, ref_file, hla_target,
                                         names, rg_info, data)
    return data
Esempio n. 10
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align single or paired fastq input with hisat2, piping the output to BAM.

    Assembles a hisat2 command template, appends the BAM conversion command
    from ``postalign.tobam_cl`` and runs it through ``do.run``. The alignment
    is skipped when the output BAM (or, for split inputs, the combined final
    file) already exists.

    Returns ``data`` updated through ``dd.set_work_bam`` and
    ``dd.set_junction_bed``.
    """
    # NOTE: the command template is filled in with cmd.format(**locals()) after
    # tobam_cl and the configured extra options have been appended, so the
    # local variable names below are part of the template contract.
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir,
                            "{0}-sort.bam".format(dd.get_sample_name(data)))
    # Split inputs: alignprep picks the per-piece output file and rewrites the
    # inputs; final_file remembers the combined target.
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None
                                      or not file_exists(final_file)):
        cmd = (
            "{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
            "{rg_flags} ")
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} "
        else:
            cmd += "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        # Known splice sites are only passed for rna-seq, and only when the
        # file exists; {splicesites} is defined solely on this path.
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
        novel_splicesite_file = os.path.join(
            align_dir,
            "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd += "--novel-splicesite-outfile {novel_splicesite_file} "
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))

        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        # NOTE(review): the paired flag given to tobam_cl is "pair_file is not
        # None", while `paired` above uses truthiness -- an empty-string
        # pair_file would be treated differently by the two; confirm callers
        # never pass "".
        with postalign.tobam_cl(data, out_file, pair_file
                                is not None) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    data = dd.set_junction_bed(data, junctionbed)
    return data
Esempio n. 11
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data,
          extra_args=None):
    """Do standard or paired end alignment with bowtie.

    Up to 1000 hits per read are reported for analyses whose name starts
    with ``smallrna-seq``, and a single hit otherwise. The bowtie SAM stream
    is piped through ``samtools addreplacerg`` to attach the read group built
    from ``data["rgnames"]``, then into the BAM conversion command supplied
    by ``postalign.tobam_cl``. The alignment is skipped when the output BAM
    (or, for split inputs, the combined final file) already exists.

    Returns the path of the output BAM file.
    """
    num_hits = 1
    if data["analysis"].lower().startswith("smallrna-seq"):
        num_hits = 1000
    config = data['config']
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        # Unsplit gzipped input is decompressed on the fly via process substitution.
        if fastq_file.endswith(".gz"):
            fastq_file = "<(gunzip -c %s)" % fastq_file
            if pair_file:
                pair_file = "<(gunzip -c %s)" % pair_file

    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cl = [config_utils.get_program("bowtie", config)]
            cl += _bowtie_args_from_config(data)
            cl += extra_args if extra_args is not None else []
            cl += ["-q",
                   "-v", 2,
                   "-k", num_hits,
                   "-X", 2000, # default is too selective for most data
                   "--best",
                   "--strata",
                   "--sam",
                   ref_file]
            if pair_file:
                cl += ["-1", fastq_file, "-2", pair_file]
            else:
                cl += [fastq_file]
            cl = [str(i) for i in cl]
            # The read group fix-up command is a non-empty literal, so it is
            # always part of the pipeline; the former "if fix_rg_cmd" test
            # could never take its else branch and has been dropped.
            fix_rg_cmd = r"samtools addreplacerg -r '%s' -" % novoalign.get_rg_info(data["rgnames"])
            cmd = " ".join(cl) + " | " + fix_rg_cmd + " | " + tobam_cl
            do.run(cmd, "Running Bowtie on %s and %s." % (fastq_file, pair_file), data)
    return out_file
Esempio n. 12
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    The input stream is fed to ``snap-aligner`` on stdin and its SAM output is
    piped into the command supplied by ``postalign.tobam_cl``. The alignment
    is skipped when the output BAM (or, for split inputs, the combined final
    file) already exists.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM path.

    Raises AssertionError when unsplit input does not end in ``.gz``.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters (presumably
        # the "<(" ... ")" process substitution -- confirm against
        # alignprep.split_namedpipe_cls) so the bare command can be piped.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            # fastq_file is already unwrapped above; slicing it a second time
            # would truncate the command itself.
            stream_input = "{fastq_file}"
    else:
        # Without this, the existence check below raised UnboundLocalError
        # for unsplit input.
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | tr '\t' '\n'")
        else:
            stream_input = "zcat {fastq_file}"

    # Decide pairing by truthiness: pair_file is normalized to "" just below,
    # so an "is not None" test would flag every run as paired.
    is_paired = bool(pair_file)
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file, pair_file=pair_file)
            cmd = ("{stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(stream_input=stream_input, sub_cmd=sub_cmd, index_dir=index_dir,
                             input_cmd=input_cmd, rg_info=rg_info, num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Esempio n. 13
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align single or paired fastq input with hisat2, piping the output to BAM.

    Assembles a hisat2 command template, appends the BAM conversion command
    from ``postalign.tobam_cl`` and runs it through ``do.run``. The alignment
    is skipped when the output BAM (or, for split inputs, the combined final
    file) already exists.

    Returns ``data`` updated through ``dd.set_work_bam`` and
    ``dd.set_junction_bed``.
    """
    # NOTE: the command template is filled in with cmd.format(**locals()) after
    # tobam_cl and the configured extra options have been appended, so the
    # local variable names below are part of the template contract.
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    # Split inputs: alignprep picks the per-piece output file and rewrites the
    # inputs; final_file remembers the combined target.
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None or not file_exists(final_file)):
        cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
               "{rg_flags} ")
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} "
        else:
            cmd += "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        # Known splice sites are only passed for rna-seq, and only when the
        # file exists; {splicesites} is defined solely on this path.
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
        novel_splicesite_file = os.path.join(align_dir, "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd += "--novel-splicesite-outfile {novel_splicesite_file} "
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))

        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        # NOTE(review): the paired flag given to tobam_cl is "pair_file is not
        # None", while `paired` above uses truthiness -- an empty-string
        # pair_file would be treated differently by the two; confirm callers
        # never pass "".
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    data = dd.set_junction_bed(data, junctionbed)
    return data
Esempio n. 14
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names; the awk step appends
    them to the first whitespace-delimited token of the name when they are
    missing.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM path.

    Raises AssertionError when unsplit input does not end in ``.gz``.
    """
    out_file = os.path.join(align_dir,
                            "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(
            fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters (presumably
        # the "<(" ... ")" process substitution -- confirm against
        # alignprep.split_namedpipe_cls) so the bare command can be piped.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (
                r"paste <({fastq_file} | paste - - - -) "
                r"<({pair_file} | paste - - - -) | "
                r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                r"""{{ """
                r"""split($1, P1, " "); split($5, P5, " "); """
                r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            # fastq_file is already unwrapped above; slicing it a second time
            # would truncate the command itself.
            stream_input = "{fastq_file}"
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (
                r"paste <(zcat {fastq_file} | paste - - - -) "
                r"<(zcat {pair_file} | paste - - - -) | "
                r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                r"""{{ """
                r"""split($1, P1, " "); split($5, P5, " "); """
                r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' """)
        else:
            stream_input = "zcat {fastq_file}"

    # Decide pairing by truthiness: pair_file is normalized to "" just below,
    # so an "is not None" test would flag every run as paired.
    is_paired = bool(pair_file)
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or
                                            not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file,
                                is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file,
                                               pair_file=pair_file)
            # Point temporary files at the transactional output directory.
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = (
                "export TMPDIR={tmp_dir} && unset JAVA_HOME && {stream_input} | "
                "snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(tmp_dir=tmp_dir, stream_input=stream_input,
                             sub_cmd=sub_cmd, index_dir=index_dir,
                             input_cmd=input_cmd, rg_info=rg_info,
                             num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
Esempio n. 15
0
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    Pipes in input, handling paired and split inputs, using interleaving magic
    from: https://biowize.wordpress.com/2015/03/26/the-fastest-darn-fastq-decoupling-procedure-i-ever-done-seen/

    Then converts a tab delimited set of outputs into interleaved fastq.

    awk changes spaces to underscores since SNAP only takes the initial name.
    SNAP requires /1 and /2 at the end of read names; the awk step appends
    them to the first whitespace-delimited token of the name when they are
    missing. Paired streams are passed through ``sponge`` before reaching
    the aligner.

    Returns ``data`` with ``data["work_bam"]`` set to the output BAM path.

    Raises AssertionError when unsplit input does not end in ``.gz``.
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    rg_info = novoalign.get_rg_info(names)

    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
        # Drop the two leading and one trailing wrapper characters (presumably
        # the "<(" ... ")" process substitution -- confirm against
        # alignprep.split_namedpipe_cls) so the bare command can be piped.
        fastq_file = fastq_file[2:-1]
        if pair_file:
            pair_file = pair_file[2:-1]
            stream_input = (r"paste <({fastq_file} | paste - - - -) "
                            r"<({pair_file} | paste - - - -) | "
                            r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                            r"""{{ """
                            r"""split($1, P1, " "); split($5, P5, " "); """
                            r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                            r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                            r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' | sponge """)
        else:
            # fastq_file is already unwrapped above; slicing it a second time
            # would truncate the command itself.
            stream_input = "{fastq_file}"
    else:
        final_file = None
        assert fastq_file.endswith(".gz")
        if pair_file:
            stream_input = (r"paste <(zcat {fastq_file} | paste - - - -) "
                            r"<(zcat {pair_file} | paste - - - -) | "
                            r"""awk 'BEGIN {{FS="\t"; OFS="\n"}} """
                            r"""{{ """
                            r"""split($1, P1, " "); split($5, P5, " "); """
                            r"""if ($1 !~ /\/1$/) $1 = P1[1]"/1"; if ($5 !~ /\/2$/) $5 = P5[1]"/2"; """
                            r"""gsub(" ", "_", $1); gsub(" ", "_", $5); """
                            r"""print $1, $2, "+", $4, $5, $6, "+", $8}}' | sponge """)
        else:
            stream_input = "zcat {fastq_file}"

    # Decide pairing by truthiness: pair_file is normalized to "" just below,
    # so an "is not None" test would flag every run as paired.
    is_paired = bool(pair_file)
    pair_file = pair_file if pair_file else ""
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            if is_paired:
                sub_cmd = "paired"
                input_cmd = "-pairedInterleavedFastq -"
            else:
                sub_cmd = "single"
                input_cmd = "-fastq -"
            stream_input = stream_input.format(fastq_file=fastq_file, pair_file=pair_file)
            # Point temporary files at the transactional output directory.
            tmp_dir = os.path.dirname(tx_out_file)
            cmd = ("export TMPDIR={tmp_dir} && {stream_input} | snap-aligner {sub_cmd} {index_dir} {input_cmd} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            cmd = cmd.format(tmp_dir=tmp_dir, stream_input=stream_input, sub_cmd=sub_cmd,
                             index_dir=index_dir, input_cmd=input_cmd, rg_info=rg_info,
                             num_cores=num_cores)
            do.run(cmd + tobam_cl, "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data