Example #1
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    report_file = os.path.join(out_dir, "qualimapReport.html")
    raw_file = os.path.join(out_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file, single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
        do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average insert size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
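
The helper _rnaseq_qualimap_cmd is referenced but not shown in this listing. A rough sketch of the kind of command it assembles (the flags shown are documented "qualimap rnaseq" options, but the real bcbio helper may differ, so treat this as an assumption):

def rnaseq_qualimap_cmd(bam_file, out_dir, gtf_file, single_end, library,
                        qualimap="qualimap", java_mem="4G"):
    # Hypothetical stand-in for bcbio's _rnaseq_qualimap_cmd.
    # -p takes the protocol strings built from the strandedness mapping
    # above; -pe counts fragments instead of reads for paired-end data.
    paired_flag = "" if single_end else "-pe"
    return ("unset DISPLAY && {qualimap} rnaseq -outdir {out_dir} "
            "-a proportional -bam {bam_file} -gtf {gtf_file} "
            "-p {library} {paired_flag} "
            "--java-mem-size={java_mem}").format(**locals())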
Example #2
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
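
sailfish._libtype_string is also not shown here. A minimal sketch of a Salmon library-type builder over the same inputs (an assumption, not necessarily bcbio's actual helper): Salmon's codes combine I (inward-facing pairs) with U, SR, or SF for unstranded, stranded-reverse (firststrand), or stranded-forward (secondstrand) libraries.

def libtype_string(fq1, fq2, strandedness):
    # Paired-end input gets the inward-orientation prefix "I".
    prefix = "I" if fq2 else ""
    strand = {"unstranded": "U",
              "firststrand": "SR",
              "secondstrand": "SF"}[strandedness]
    return "-l " + prefix + strand

# e.g. libtype_string("r1.fq", "r2.fq", "firststrand") -> "-l ISR"
#      libtype_string("r1.fq", None, "unstranded")     -> "-l U"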
Example #3
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(
            fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #4
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data, os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning("kallisto was run on single-end data and we set the "
          "estimated fragment length to 200 and the standard "
          "deviation to 25, if these don't reflect your data then "
          "the results may be inaccurate. Use with caution. See "
          "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
          "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
Example #5
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    sentinel_file = os.path.join(quant_dir, "abundance.h5")
    if os.path.exists(sentinel_file):
        return quant_dir
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data,
                           os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(
        data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning(
            "kallisto was run on single-end data and we set the "
            "estimated fragment length to 200 and the standard "
            "deviation to 25, if these don't reflect your data then "
            "the results may be inaccurate. Use with caution. See "
            "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
            "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
Example #6
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {
        "firststrand": "strand-specific-forward",
        "secondstrand": "strand-specific-reverse",
        "unstranded": "non-strand-specific",
        "auto": "non-strand-specific"
    }

    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (which parses the additional data) picks up the sample name from the directory, as in:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    library = strandedness[dd.get_strandedness(data)]

    # don't run qualimap on the full bam by default
    if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data,
                                    []):
        logger.info(f"Full qualimap analysis for {bam_file} may be slow.")
        ds_bam = bam_file
    else:
        logger.info(f"Downsampling {bam_file} for Qualimap run.")
        ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
        bam_file = ds_bam if ds_bam else bam_file

    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir,
                                       gtf_file, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            tx_results_file = os.path.join(tx_results_dir,
                                           "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd,
                   "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics.update(
        {"Average_insert_size": salmon.estimate_fragment_size(data)})
    metrics = _parse_metrics(metrics)
    # The Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, to
    # keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {
        "base": base_results_file,
        "secondary": _find_qualimap_secondary_files(results_dir,
                                                    base_results_file),
        "metrics": metrics
    }
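
The qualimap_full check above uses toolz's get_in, which walks a sequence of nested keys and returns a default when any key is missing; a stand-alone illustration with a stub data dict:

import toolz as tz

data = {"config": {"algorithm": {"tools_on": ["qualimap_full"]}}}
assert "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, [])
# Missing keys fall back to the default instead of raising:
assert tz.get_in(("config", "algorithm", "missing"), data, []) == []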
Example #8
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    safe_makedir(salmon_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish._create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--numBootstraps 30 --useVBOpt "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon." %
                   (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #9
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}

    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (which parses the additional data) picks up the sample name from the directory, as in:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        with file_transaction(data, results_dir) as tx_out_dir:
            utils.safe_makedir(tx_out_dir)
            raw_file = os.path.join(tx_out_dir, "rnaseq_qc_results.txt")
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_out_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
Example #10
def bcbio_run(data):
    out_dir = os.path.join(dd.get_work_dir(data), "dexseq")
    safe_makedir(out_dir)
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(out_dir, sample_name + ".dexseq")
    bam_file = dd.get_work_bam(data)
    dexseq_gff = dd.get_dexseq_gff(data)
    stranded = dd.get_strandedness(data)
    return run_count(bam_file, dexseq_gff, stranded, out_file, data)
Example #11
def _get_stranded_flag(data):
    strand_flag = {"unstranded": "no",
                   "firststrand": "reverse",
                   "secondstrand": "yes"}
    stranded = dd.get_strandedness(data, "unstranded").lower()
    assert stranded in strand_flag, ("%s is not a valid strandedness value. "
                                     "Valid values are 'firststrand', 'secondstrand', "
                                     "and 'unstranded'." % stranded)
    return strand_flag[stranded]
Example #12
def bcbio_run(data):
    out_dir = os.path.join(dd.get_work_dir(data), "dexseq")
    safe_makedir(out_dir)
    sample_name = dd.get_sample_name(data)
    out_file = os.path.join(out_dir, sample_name + ".dexseq")
    bam_file = dd.get_work_bam(data)
    dexseq_gff = dd.get_dexseq_gff(data)
    stranded = dd.get_strandedness(data)
    return run_count(bam_file, dexseq_gff, stranded, out_file, data)
Example #13
def _strand_flag(data):
    """
    0: unstranded 1: stranded 2: reverse stranded
    """
    strand_flag = {"unstranded": "0", "firststrand": "2", "secondstrand": "1"}
    stranded = dd.get_strandedness(data)

    assert stranded in strand_flag, (
        "%s is not a valid strandedness value. "
        "Valid values are 'firststrand', 'secondstrand', "
        "and 'unstranded'." % stranded)
    return strand_flag[stranded]
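
The same strandedness value is re-encoded differently for each downstream tool in these examples. Collected side by side for reference (values copied from the functions in this listing; the qualimap column follows the firststrand -> strand-specific-reverse mapping used by most run_rnaseq variants here):

# strandedness: (htseq-count --stranded, numeric -s style flag, qualimap protocol)
ENCODINGS = {
    "unstranded":   ("no",      "0", "non-strand-specific"),
    "firststrand":  ("reverse", "2", "strand-specific-reverse"),
    "secondstrand": ("yes",     "1", "strand-specific-forward"),
}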
Example #16
def _set_stranded_flag(bam_file, data):
    strand_flag = {"unstranded": "",
                   "firststrand": "--rf-stranded",
                   "secondstrand": "--fr-stranded",
                   "firststrand-s": "--r-stranded",
                   "secondstrand-s": "--f-stranded"}
    stranded = dd.get_strandedness(data)
    assert stranded in strand_flag, ("%s is not a valid strandedness value. "
            "Valid values are 'firststrand', "
            "'secondstrand' and 'unstranded" % (stranded))
    if stranded != "unstranded" and not is_paired(bam_file):
        stranded += "-s"
    flag = strand_flag[stranded]
    return flag
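
A self-contained check of the single-end suffix logic in _set_stranded_flag, with pairing passed in directly instead of probed from the BAM file (a sketch for illustration, not the bcbio function itself):

def set_stranded_flag(stranded, paired):
    strand_flag = {"unstranded": "",
                   "firststrand": "--rf-stranded",
                   "secondstrand": "--fr-stranded",
                   "firststrand-s": "--r-stranded",
                   "secondstrand-s": "--f-stranded"}
    if stranded != "unstranded" and not paired:
        stranded += "-s"  # single-end libraries use the one-mate variants
    return strand_flag[stranded]

assert set_stranded_flag("firststrand", paired=True) == "--rf-stranded"
assert set_stranded_flag("firststrand", paired=False) == "--r-stranded"
assert set_stranded_flag("unstranded", paired=False) == ""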
Example #17
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {
        "firststrand": "strand-specific-reverse",
        "secondstrand": "strand-specific-forward",
        "unstranded": "non-strand-specific"
    }

    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (which parses the additional data) picks up the sample name from the directory, as in:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir,
                                       gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            tx_results_file = os.path.join(tx_results_dir,
                                           "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd,
                   "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update(
        {"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    # The Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, to
    # keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {
        "base": base_results_file,
        "secondary": _find_qualimap_secondary_files(results_dir,
                                                    base_results_file),
        "metrics": metrics
    }
Example #18
def _get_stranded_flag(data, paired):
    strandedness = dd.get_strandedness(data)
    base = "--rna-strandness "
    if paired:
        if strandedness == "firststrand":
            return base + "RF"
        elif strandedness == "secondstrand":
            return base + "FR"
        else:
            return ""
    else:
        if strandedness == "firstrand":
            return base + "R"
        elif strandedness == "secondstrand":
            return base + "F"
        else:
            return ""
Example #21
def run_sailfish(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    sailfish_dir = os.path.join(work_dir, "sailfish", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    stranded = dd.get_strandedness(data).lower()
    out_file = sailfish(fq1, fq2, sailfish_dir, gtf_file, fasta_file, stranded, data)
    data = dd.set_sailfish(data, out_file)
    data = dd.set_sailfish_dir(data, sailfish_dir)
    return [[data]]
Example #23
def kallisto_singlecell(fq1, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    # unsure how to estimate from single end data, so go with a reasonable default
    frag_length = 250
    batch_file = umi.convert_to_kallisto(data)
    index = kallisto_index(gtf_file, fasta_file, data, kallisto_dir)
    cmd = ("{kallisto} pseudo --umi "
           "-t {num_cores} -o {tx_out_dir} -b {batch_file} -i {index}")
    with chdir(os.path.dirname(batch_file)):
        with file_transaction(data, quant_dir) as tx_out_dir:
            message = ("Quantifying transcripts with Kallisto.")
            do.run(cmd.format(**locals()), message, None)
    kallisto_table(kallisto_dir, index)
    return quant_dir
Example #25
def salmon_quant_bam(bam_file, salmon_dir, gtf_file, ref_file, data):
    safe_makedir(salmon_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(salmon_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    gtf_fa = sailfish._create_combined_fasta(data, salmon_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = _libtype_string(bam_file, strandedness)
    cmd = ("{salmon} quant {libtype} -p {num_cores} -t {gtf_fa} "
           "-o {tx_out_dir} -a {bam_file} ")
    cmd += "--numBootstraps 30 "
    with file_transaction(data, salmon_dir) as tx_out_dir:
        message = "Quantifying transcripts in %s with Salmon." % bam_file
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #27
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}

    # Qualimap results should be saved to a directory named after the sample.
    # MultiQC (which parses the additional data) picks up the sample name from the directory, as in:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            tx_results_file = os.path.join(tx_results_dir, "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data, results_dir))
    metrics.update({"Average_insert_size": salmon.estimate_fragment_size(data)})
    metrics = _parse_metrics(metrics)
    # The Qualimap output folder (results_dir) needs to be named after the sample (see comments above). However, to
    # keep its name after upload, we need to put the base QC file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file),
            "metrics": metrics}
Example #28
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {
        "firststrand": "strand-specific-reverse",
        "secondstrand": "strand-specific-forward",
        "unstranded": "non-strand-specific"
    }

    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    raw_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(results_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, results_dir, gtf_file,
                                   single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
            dd.get_sample_name(data), raw_file)
        do.run(cmd,
               "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update(
        {"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
Example #29
def htseq_count(data):
    """ adapted from Simon Anders htseq-count.py script
    http://www-huber.embl.de/users/anders/HTSeq/doc/count.html
    """

    sam_filename, gff_filename, out_file, stats_file = _get_files(data)
    stranded = _get_stranded_flag(data["config"])
    overlap_mode = "union"
    feature_type = "exon"
    id_attribute = "gene_id"
    minaqual = 0

    if file_exists(out_file):
        return out_file

    logger.info("Counting reads mapping to exons in %s using %s as the "
                "annotation and strandedness as %s." %
                (os.path.basename(sam_filename),
                 os.path.basename(gff_filename), dd.get_strandedness(data)))

    features = HTSeq.GenomicArrayOfSets("auto", stranded != "no")
    counts = {}

    # Try to open samfile to fail early in case it is not there
    open(sam_filename).close()

    gff = HTSeq.GFF_Reader(gff_filename)
    i = 0
    try:
        for f in gff:
            if f.type == feature_type:
                try:
                    feature_id = f.attr[id_attribute]
                except KeyError:
                    sys.exit("Feature %s does not contain a '%s' attribute" %
                             (f.name, id_attribute))
                if stranded != "no" and f.iv.strand == ".":
                    sys.exit("Feature %s at %s does not have strand "
                             "information but you are running htseq-count "
                             "in stranded mode. Use '--stranded=no'." %
                             (f.name, f.iv))
                features[f.iv] += feature_id
                counts[f.attr[id_attribute]] = 0
            i += 1
            if i % 100000 == 0:
                sys.stderr.write("%d GFF lines processed.\n" % i)
    except:
        sys.stderr.write("Error occured in %s.\n" %
                         gff.get_line_number_string())
        raise

    sys.stderr.write("%d GFF lines processed.\n" % i)

    if len(counts) == 0:
        sys.stderr.write("Warning: No features of type '%s' found.\n" %
                         feature_type)

    try:
        align_reader = htseq_reader(sam_filename)
        first_read = next(iter(align_reader))
        pe_mode = first_read.paired_end
    except:
        sys.stderr.write("Error occured when reading first line of sam "
                         "file.\n")
        raise

    try:
        if pe_mode:
            read_seq_pe_file = align_reader
            read_seq = HTSeq.pair_SAM_alignments(align_reader)
        else:
            # without this branch, read_seq is undefined for single-end input
            read_seq = align_reader
        empty = 0
        ambiguous = 0
        notaligned = 0
        lowqual = 0
        nonunique = 0
        i = 0
        for r in read_seq:
            i += 1
            if not pe_mode:
                if not r.aligned:
                    notaligned += 1
                    continue
                try:
                    if r.optional_field("NH") > 1:
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if r.aQual < minaqual:
                    lowqual += 1
                    continue
                if stranded != "reverse":
                    iv_seq = (co.ref_iv for co in r.cigar
                              if co.type == "M" and co.size > 0)
                else:
                    iv_seq = (invert_strand(co.ref_iv) for co in r.cigar
                              if co.type == "M" and co.size > 0)
            else:
                if r[0] is not None and r[0].aligned:
                    if stranded != "reverse":
                        iv_seq = (co.ref_iv for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                    else:
                        iv_seq = (invert_strand(co.ref_iv) for co in r[0].cigar
                                  if co.type == "M" and co.size > 0)
                else:
                    iv_seq = tuple()
                if r[1] is not None and r[1].aligned:
                    if stranded != "reverse":
                        iv_seq = itertools.chain(
                            iv_seq, (invert_strand(co.ref_iv)
                                     for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                    else:
                        iv_seq = itertools.chain(
                            iv_seq, (co.ref_iv for co in r[1].cigar
                                     if co.type == "M" and co.size > 0))
                else:
                    if (r[0] is None) or not (r[0].aligned):
                        notaligned += 1
                        continue
                try:
                    if (r[0] is not None and r[0].optional_field("NH") > 1) or \
                       (r[1] is not None and r[1].optional_field("NH") > 1):
                        nonunique += 1
                        continue
                except KeyError:
                    pass
                if (r[0] and r[0].aQual < minaqual) or (r[1] and
                                                        r[1].aQual < minaqual):
                    lowqual += 1
                    continue

            try:
                if overlap_mode == "union":
                    fs = set()
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            fs = fs.union(fs2)
                elif (overlap_mode == "intersection-strict"
                      or overlap_mode == "intersection-nonempty"):
                    fs = None
                    for iv in iv_seq:
                        if iv.chrom not in features.chrom_vectors:
                            raise UnknownChrom
                        for iv2, fs2 in features[iv].steps():
                            if (len(fs2) > 0
                                    or overlap_mode == "intersection-strict"):
                                if fs is None:
                                    fs = fs2.copy()
                                else:
                                    fs = fs.intersection(fs2)
                else:
                    sys.exit("Illegal overlap mode.")
                if fs is None or len(fs) == 0:
                    empty += 1
                elif len(fs) > 1:
                    ambiguous += 1
                else:
                    counts[list(fs)[0]] += 1
            except UnknownChrom:
                if not pe_mode:
                    rr = r
                else:
                    rr = r[0] if r[0] is not None else r[1]
                empty += 1

            if i % 100000 == 0:
                sys.stderr.write(
                    "%d sam %s processed.\n" %
                    (i, "lines " if not pe_mode else "line pairs"))

    except:
        if not pe_mode:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq.get_line_number_string())
        else:
            sys.stderr.write("Error occured in %s.\n" %
                             read_seq_pe_file.get_line_number_string())
        raise

    sys.stderr.write("%d sam %s processed.\n" %
                     (i, "lines " if not pe_mode else "line pairs"))

    with file_transaction(data, out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            on_feature = 0
            for fn in sorted(counts.keys()):
                on_feature += counts[fn]
                out_handle.write("%s\t%d\n" % (fn, counts[fn]))

    with file_transaction(data, stats_file) as tmp_stats_file:
        with open(tmp_stats_file, "w") as out_handle:
            out_handle.write("on_feature\t%d\n" % on_feature)
            out_handle.write("no_feature\t%d\n" % empty)
            out_handle.write("ambiguous\t%d\n" % ambiguous)
            out_handle.write("too_low_aQual\t%d\n" % lowqual)
            out_handle.write("not_aligned\t%d\n" % notaligned)
            out_handle.write("alignment_not_unique\t%d\n" % nonunique)

    return out_file
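
The counting loop above implements htseq-count's "union" overlap mode: the feature sets overlapping each aligned interval of a read (or read pair) are unioned, and the read is counted for a gene only when exactly one gene id remains. A toy, HTSeq-free illustration of that rule:

def union_assign(feature_sets):
    fs = set()
    for s in feature_sets:
        fs |= s
    if not fs:
        return "no_feature"  # tallied as `empty` in the stats above
    if len(fs) > 1:
        return "ambiguous"
    return next(iter(fs))

assert union_assign([{"geneA"}, {"geneA"}]) == "geneA"
assert union_assign([{"geneA"}, {"geneA", "geneB"}]) == "ambiguous"
assert union_assign([set()]) == "no_feature"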