Exemple #1
0
def genebody_coverage2(in_file, config, out_prefix=None):
    """
    used to check the 5'/3' bias across transcripts, takes a bam file,
    converts it to bigwig and then uses that

    in_file: BAM file; it is converted with ``bam2bigwig`` before the run
    config: configuration handed to ``bam2bigwig`` and ``_get_gtf``
    out_prefix: currently ignored; the output prefix is always
        ``../coverage/wiggle`` relative to the directory of the bigwig file

    returns the path of the ``.geneBodyCoverage.pdf`` plot
    raises SystemExit(1) if geneBody_coverage2.py cannot be found
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    in_bigwig = bam2bigwig(in_file, config)
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir, "coverage")
    safe_makedir(out_dir)
    out_prefix = os.path.join(out_dir, "wiggle")
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    # reuse the plot from an earlier run
    if file_exists(coverage_plot_file):
        return coverage_plot_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    cmd = str(coverage_run.bake(i=in_bigwig, r=bed, o=out_prefix, t="pdf"))
    do.run(cmd, "Calculating coverage of %s." % (in_bigwig), None)
    return coverage_plot_file
Exemple #2
0
def _build_command(input_file, ref, novoalign_config):
    """
    assemble the novoalign command line as a flat list of strings

    input_file: value given after ``-f``
    ref: value given after ``-d``
    novoalign_config: options expanded by ``flatten_options``

    returns a list of ``str`` starting with the result of
    ``which("novoalign")``
    """
    executable = which("novoalign")
    options = flatten_options(novoalign_config)
    pieces = [executable, options, "-o", "SAM", "-d", ref, "-f", input_file]
    # nested option lists are flattened before everything is stringified
    return [str(piece) for piece in flatten(pieces)]
Exemple #3
0
def wig2bigwig(wiggle_file, chrom_size_file, out_file):
    """
    convert wiggle file to bigwig file using the UCSC tool

    wiggle_file: wiggle file to convert
    chrom_size_file: chromosome sizes file given to wigToBigWig
    out_file: path of the bigwig file to produce

    returns ``out_file``; the conversion is skipped when it already exists
    raises SystemExit(1) if wigToBigWig cannot be found
    """
    PROGRAM = "wigToBigWig"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/"
                     "to download it." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    if file_exists(out_file):
        return out_file

    wigToBigWig = sh.Command(which(PROGRAM))
    # write to the transactional path handed out by file_transaction
    with file_transaction(out_file) as tx_out_file:
        wigToBigWig(wiggle_file, chrom_size_file, tx_out_file)
    return out_file
Exemple #4
0
def junction_annotation(in_file, config, out_prefix=None):
    """
    compile novel/known information about splice junctions

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.splice_junction.pdf``; the program
    is not run when that file already exists
    raises SystemExit(1) if junction_annotation.py cannot be found
    """
    PROGRAM = "junction_annotation.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "junction"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    junction_file = out_prefix + ".splice_junction.pdf"
    if file_exists(junction_file):
        return junction_file

    junction_run = sh.Command(which(PROGRAM))
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    junction_run(i=in_file, o=out_prefix, r=bed)
    return junction_file
Exemple #5
0
def RPKM_count(in_file, config, out_prefix=None):
    """
    produce RPKM

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>_read_count.xls``; the program is
    not run when that file already exists
    raises SystemExit(1) if RPKM_count.py cannot be found
    """
    PROGRAM = "RPKM_count.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "RPKM_count"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_count_file = out_prefix + "_read_count.xls"
    # check the cache before touching the annotation: the GTF/BED lookup
    # is only needed when the program actually has to run
    if file_exists(rpkm_count_file):
        return rpkm_count_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    RPKM_count_run = sh.Command(which(PROGRAM))
    RPKM_count_run(i=in_file, r=bed, o=out_prefix)
    return rpkm_count_file
Exemple #6
0
def genebody_coverage(in_file, config, out_prefix=None):
    """
    used to check the 5'/3' bias across transcripts

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.geneBodyCoverage.pdf``; the
    program is not run when that file already exists
    raises SystemExit(1) if geneBody_coverage.py cannot be found
    """
    PROGRAM = "geneBody_coverage.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "coverage"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    coverage_run(i=in_file, r=bed, o=out_prefix)
    return coverage_plot_file
Exemple #7
0
def bam_stat(in_file, config, out_prefix=None):
    """
    dump read mapping statistics from a SAM or BAM file to out_file

    in_file: SAM or BAM file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.txt``; the program is not run when
    that file already exists
    raises SystemExit(1) if bam_stat.py cannot be found
    """
    PROGRAM = "bam_stat.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "bam_stat"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    out_file = out_prefix + ".txt"
    if file_exists(out_file):
        return out_file

    bam_stat_run = sh.Command(which(PROGRAM))
    with file_transaction(out_file) as tx_out_file:
        # sh's ``_err`` keyword sends the program's stderr to the
        # transactional file, which is where the report is collected
        bam_stat_run(i=in_file, _err=tx_out_file)

    return out_file
Exemple #8
0
def wig2bigwig(wiggle_file, chrom_size_file, out_file):
    """
    convert wiggle file to bigwig file using the UCSC tool

    wiggle_file: wiggle file to convert
    chrom_size_file: chromosome sizes file given to wigToBigWig
    out_file: path of the bigwig file to produce

    returns ``out_file``; the conversion is skipped when it already exists
    raises SystemExit(1) if wigToBigWig cannot be found
    """
    PROGRAM = "wigToBigWig"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/"
                     "to download it." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    if file_exists(out_file):
        return out_file

    wigToBigWig = sh.Command(which(PROGRAM))
    # write to the transactional path handed out by file_transaction
    with file_transaction(out_file) as tx_out_file:
        cmd = str(wigToBigWig.bake(wiggle_file, chrom_size_file, tx_out_file))
        do.run(cmd, "Converting %s from wig to bigwig." % (wiggle_file), None)
    return out_file
Exemple #9
0
def count_overlaps(in_file, bed, out_file=None):
    """ calculates coverage across the features in the bedfile
    bed

    in_file: file given to coverageBed after ``-abam``
    bed: BED file given to coverageBed after ``-b``
    out_file: where coverageBed's stdout is written; when not given it is
        derived with ``replace_suffix(in_file, ".counts")``

    returns ``out_file``; coverageBed is not run when it already exists
    raises SystemExit(-1) if coverageBed cannot be found, and
    subprocess.CalledProcessError if coverageBed exits non-zero
    """

    if not which("coverageBed"):
        logger.error("Cannot find coverageBed. Make sure it is in your "
                     "path or install bedtools.")
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(-1)

    if not out_file:
        out_file = replace_suffix(in_file, ".counts")

    if os.path.exists(out_file):
        return out_file

    cmd = ["coverageBed", "-abam", in_file, "-b", bed]

    # A failed or interrupted run must not leave a truncated file behind:
    # the existence check above would return it as a finished result on
    # the next call.
    completed = False
    try:
        with open(out_file, "w") as out_handle:
            subprocess.check_call(cmd, stdout=out_handle)
        completed = True
    finally:
        if not completed and os.path.exists(out_file):
            os.remove(out_file)
    return out_file
Exemple #10
0
def count_overlaps(in_file, bed, out_file=None):
    """ calculates coverage across the features in the bedfile
    bed

    in_file: file given to coverageBed after ``-abam``
    bed: BED file given to coverageBed after ``-b``
    out_file: where coverageBed's stdout is written; when not given it is
        derived with ``replace_suffix(in_file, ".counts")``

    returns ``out_file``; coverageBed is not run when it already exists
    raises SystemExit(-1) if coverageBed cannot be found, and
    subprocess.CalledProcessError if coverageBed exits non-zero
    """

    if not which("coverageBed"):
        logger.error("Cannot find coverageBed. Make sure it is in your "
                     "path or install bedtools.")
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(-1)

    if not out_file:
        out_file = replace_suffix(in_file, ".counts")

    if os.path.exists(out_file):
        return out_file

    cmd = ["coverageBed", "-abam", in_file, "-b", bed]

    # A failed or interrupted run must not leave a truncated file behind:
    # the existence check above would return it as a finished result on
    # the next call.
    completed = False
    try:
        with open(out_file, "w") as out_handle:
            subprocess.check_call(cmd, stdout=out_handle)
        completed = True
    finally:
        if not completed and os.path.exists(out_file):
            os.remove(out_file)
    return out_file
Exemple #11
0
def junction_saturation(in_file, config, out_prefix=None):
    """
    check if splicing is deep enough to perform alternative splicing
    analysis

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.junctionSaturation_plot.pdf``; the
    program is not run when that file already exists
    raises SystemExit(1) if junction_saturation.py cannot be found
    """
    PROGRAM = "junction_saturation.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    saturation_file = out_prefix + ".junctionSaturation_plot.pdf"
    if file_exists(saturation_file):
        return saturation_file

    saturation_run = sh.Command(which(PROGRAM))
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    saturation_run(i=in_file, o=out_prefix, r=bed)
    return saturation_file
Exemple #12
0
def genebody_coverage(in_file, config, out_prefix=None):
    """
    used to check the 5'/3' bias across transcripts

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.geneBodyCoverage.pdf``; the
    program is not run when that file already exists
    raises SystemExit(1) if geneBody_coverage.py cannot be found
    """
    PROGRAM = "geneBody_coverage.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "coverage"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    # the baked command is rendered to a string and executed by do.run
    cmd = str(coverage_run.bake(i=in_file, r=bed, o=out_prefix))
    do.run(cmd, "Calculating coverage of %s." % (in_file), None)
    return coverage_plot_file
Exemple #13
0
def RPKM_saturation(in_file, config, out_prefix=None):
    """
    estimate the precision of RPKM calculation by resampling

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.saturation.pdf``; the program is
    not run when that file already exists
    raises SystemExit(1) if RPKM_saturation.py cannot be found
    """
    PROGRAM = "RPKM_saturation.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "RPKM_saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_saturation_file = out_prefix + ".saturation.pdf"
    # check the cache before touching the annotation: the GTF/BED lookup
    # is only needed when the program actually has to run
    if file_exists(rpkm_saturation_file):
        return rpkm_saturation_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    RPKM_saturation_run = sh.Command(which(PROGRAM))
    RPKM_saturation_run(i=in_file, r=bed, o=out_prefix)
    return rpkm_saturation_file
Exemple #14
0
def clipping_profile(in_file, config, out_prefix=None):
    """
    estimate the clipping profile of the reads

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.clipping_profile.pdf``; the
    program is not run when that file already exists
    raises SystemExit(1) if clipping_profile.py cannot be found
    """
    PROGRAM = "clipping_profile.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "clipping"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    clip_plot_file = out_prefix + ".clipping_profile.pdf"
    # single-argument call form prints the same line on Python 2 and 3
    print(clip_plot_file)
    if file_exists(clip_plot_file):
        return clip_plot_file

    clip_run = sh.Command(which(PROGRAM))
    # the baked command is rendered to a string and executed by do.run
    cmd = str(clip_run.bake(i=in_file, o=out_prefix))
    do.run(cmd, "Calculating 5' and 3' clipping profile of %s." % (in_file), None)

    return clip_plot_file
Exemple #15
0
def RPKM_count(in_file, config, out_prefix=None):
    """
    produce RPKM

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>_read_count.xls``; the program is
    not run when that file already exists
    raises SystemExit(1) if RPKM_count.py cannot be found
    """
    PROGRAM = "RPKM_count.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "RPKM_count"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_count_file = out_prefix + "_read_count.xls"
    # check the cache before touching the annotation: the GTF/BED lookup
    # is only needed when the program actually has to run
    if file_exists(rpkm_count_file):
        return rpkm_count_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    RPKM_count_run = sh.Command(which(PROGRAM))
    cmd = str(RPKM_count_run.bake(i=in_file, r=bed, o=out_prefix))
    do.run(cmd, "Calculating RPKM of %s using reference %s." % (in_file, bed),
           None)
    return rpkm_count_file
Exemple #16
0
def junction_annotation(in_file, config, out_prefix=None):
    """
    compile novel/known information about splice junctions

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.splice_junction.pdf``; the program
    is not run when that file already exists
    raises SystemExit(1) if junction_annotation.py cannot be found
    """
    PROGRAM = "junction_annotation.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "junction"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    junction_file = out_prefix + ".splice_junction.pdf"
    if file_exists(junction_file):
        return junction_file

    junction_run = sh.Command(which(PROGRAM))
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    # the baked command is rendered to a string and executed by do.run
    cmd = str(junction_run.bake(i=in_file, o=out_prefix, r=bed))
    do.run(cmd, "Calculating novel/known information about splice junctions of "
           "%s." % (in_file), None)
    return junction_file
Exemple #17
0
def bam_stat(in_file, config, out_prefix=None):
    """
    dump read mapping statistics from a SAM or BAM file to out_file

    in_file: SAM or BAM file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.txt``; the program is not run when
    that file already exists
    raises SystemExit(1) if bam_stat.py cannot be found
    """
    PROGRAM = "bam_stat.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "bam_stat"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    out_file = out_prefix + ".txt"
    if file_exists(out_file):
        return out_file

    bam_stat_run = sh.Command(which(PROGRAM))
    with file_transaction(out_file) as tx_out_file:
        # sh treats ``_err`` as a call-time option, so it does not appear in
        # str() of a baked command and the redirect into tx_out_file would
        # be lost; spell the stderr redirect out in the command string.
        # NOTE(review): assumes do.run executes string commands through a
        # shell and that the paths need no quoting -- confirm.
        cmd = "%s 2> %s" % (bam_stat_run.bake(i=in_file), tx_out_file)
        do.run(cmd, "Calculating BAM statistics from %s." % (in_file), None)

    return out_file
Exemple #18
0
def RPKM_saturation(in_file, config, out_prefix=None):
    """
    estimate the precision of RPKM calculation by resampling

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.saturation.pdf``; the program is
    not run when that file already exists
    raises SystemExit(1) if RPKM_saturation.py cannot be found
    """
    PROGRAM = "RPKM_saturation.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "RPKM_saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_saturation_file = out_prefix + ".saturation.pdf"
    # check the cache before touching the annotation: the GTF/BED lookup
    # is only needed when the program actually has to run
    if file_exists(rpkm_saturation_file):
        return rpkm_saturation_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    RPKM_saturation_run = sh.Command(which(PROGRAM))
    cmd = str(RPKM_saturation_run.bake(i=in_file, r=bed, o=out_prefix))
    do.run(cmd, "Calculating RPKM saturation of %s." % in_file, None)
    return rpkm_saturation_file
Exemple #19
0
    def check_program(self, config):
        """
        validate the 'program' section of ``config``

        Entries whose name is listed in ``directory_programs`` must point
        at an existing directory; every other entry must be resolvable by
        ``which``. Checking stops at the first entry that fails.

        returns True when the section is missing or empty (after a
        warning) or when every entry passes, False otherwise
        """
        directory_programs = ["picard"]
        section = config.get("program", None)
        if not section:
            self._field_missing_warning("program")
            return True

        for name, location in section.items():
            if name in directory_programs:
                if not os.path.isdir(location):
                    self._directory_missing_error(name, location)
                    return False
                logger.info("%s lives in %s." % (name, location))
                continue

            if not which(location):
                self._executable_missing_error(name, location)
                return False
            logger.info("%s will be run as %s." % (name, location))

        logger.info("The 'program' section of configuration seems valid.")
        return True
Exemple #20
0
def junction_saturation(in_file, config, out_prefix=None):
    """
    check if splicing is deep enough to perform alternative splicing
    analysis

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix and the GTF
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.junctionSaturation_plot.pdf``; the
    program is not run when that file already exists
    raises SystemExit(1) if junction_saturation.py cannot be found
    """
    PROGRAM = "junction_saturation.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    saturation_file = out_prefix + ".junctionSaturation_plot.pdf"
    if file_exists(saturation_file):
        return saturation_file

    saturation_run = sh.Command(which(PROGRAM))
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    # the baked command is rendered to a string and executed by do.run
    cmd = str(saturation_run.bake(i=in_file, o=out_prefix, r=bed))
    do.run(cmd, "Calculating junction saturation estimation of %s." % in_file,
           None)
    return saturation_file
Exemple #21
0
def clipping_profile(in_file, config, out_prefix=None):
    """
    estimate the clipping profile of the reads

    in_file: input file passed to the program as its ``i`` option
    config: configuration used to resolve the output prefix
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns the path ``<resolved prefix>.clipping_profile.pdf``; the
    program is not run when that file already exists
    raises SystemExit(1) if clipping_profile.py cannot be found
    """
    PROGRAM = "clipping_profile.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "clipping"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    clip_plot_file = out_prefix + ".clipping_profile.pdf"
    # single-argument call form prints the same line on Python 2 and 3
    print(clip_plot_file)
    if file_exists(clip_plot_file):
        return clip_plot_file

    clip_run = sh.Command(which(PROGRAM))
    clip_run(i=in_file, o=out_prefix)
    # NOTE(review): an earlier, disabled workaround moved
    # "clipping_profile.pdf" out of the script execution directory; confirm
    # the program really writes to the given output prefix.

    return clip_plot_file
Exemple #22
0
def bam2bigwig(in_file, config, out_prefix=None):
    """
    assumes the library preparation was not strand specific for now

    Runs bam2wig.py to create ``<resolved prefix>.wig`` unless that file
    already exists, then hands it to ``wig2bigwig``.

    in_file: input file passed to bam2wig.py as its ``i`` option
    config: configuration; ``config["annotation"]["chrom_size_file"]`` is
        used when set, otherwise ``_fetch_chrom_sizes(config)`` supplies it
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns whatever ``wig2bigwig`` returns for ``<resolved prefix>.bw``
    raises SystemExit(1) if bam2wig.py cannot be found
    """
    PROGRAM = "bam2wig.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "bigwig"
    chrom_size_file = config["annotation"].get("chrom_size_file", None)
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    if not chrom_size_file:
        chrom_size_file = _fetch_chrom_sizes(config)
    wiggle_file = out_prefix + ".wig"

    # an existing wiggle file is reused as-is
    if not file_exists(wiggle_file):
        bam2wig = sh.Command(which(PROGRAM))
        bam2wig(i=in_file, s=chrom_size_file, o=out_prefix)

    bigwig_file = out_prefix + ".bw"

    return wig2bigwig(wiggle_file, chrom_size_file, bigwig_file)
Exemple #23
0
def bam2bigwig(in_file, config, out_prefix=None):
    """
    assumes the library preparation was not strand specific for now

    Runs bam2wig.py to create ``<resolved prefix>.wig`` unless that file
    already exists, then hands it to ``wig2bigwig``.

    in_file: input file passed to bam2wig.py as its ``i`` option
    config: configuration; ``config["annotation"]["chrom_size_file"]`` is
        used when set, otherwise ``_fetch_chrom_sizes(config)`` supplies it
    out_prefix: optional output prefix, resolved by ``_get_out_prefix``

    returns whatever ``wig2bigwig`` returns for ``<resolved prefix>.bw``
    raises SystemExit(1) if bam2wig.py cannot be found
    """
    PROGRAM = "bam2wig.py"
    if not program_exists(PROGRAM):
        # a missing executable is fatal, so report it at error level
        logger.error("%s is not in the path or is not executable." % (PROGRAM))
        # the site-provided exit() is meant for interactive sessions
        raise SystemExit(1)

    prefix = "bigwig"
    chrom_size_file = config["annotation"].get("chrom_size_file", None)
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    if not chrom_size_file:
        chrom_size_file = _fetch_chrom_sizes(config)
    wiggle_file = out_prefix + ".wig"

    # an existing wiggle file is reused as-is
    if not file_exists(wiggle_file):
        bam2wig = sh.Command(which(PROGRAM))
        cmd = str(bam2wig.bake(i=in_file, s=chrom_size_file, o=out_prefix))
        do.run(cmd, "Converting %s from BAM to bigwig" % (in_file), None)

    bigwig_file = out_prefix + ".bw"

    return wig2bigwig(wiggle_file, chrom_size_file, bigwig_file)
Exemple #24
0
def program_exists(path):
    """
    look ``path`` up with ``which`` and hand back its result unchanged;
    callers test the result for truthiness
    """
    located = which(path)
    return located
Exemple #25
0
def _build_command(input_file, ref, novoalign_config):
    """
    assemble the novoalign command line as a flat list of strings

    input_file: value given after ``-f``
    ref: value given after ``-d``
    novoalign_config: options expanded by ``flatten_options``

    returns a list of ``str`` starting with the result of
    ``which("novoalign")``
    """
    novoalign = which("novoalign")
    user_options = flatten_options(novoalign_config)
    fixed_args = ["-o", "SAM", "-d", ref, "-f", input_file]
    # nested option lists are flattened before everything is stringified
    flat = flatten([novoalign, user_options] + fixed_args)
    return [str(token) for token in flat]