def genebody_coverage2(in_file, config, out_prefix=None):
    """
    Check the 5'/3' bias across transcripts using a bigwig track.

    The input BAM file is converted with bam2bigwig() and
    geneBody_coverage2.py is then run on the resulting bigwig, using a BED
    file derived from the GTF named in ``config``.

    NOTE: the ``out_prefix`` argument is accepted but ignored. Output always
    goes to a "coverage" directory next to the bigwig's directory, with the
    file prefix "wiggle".

    Returns the path of the expected ``.geneBodyCoverage.pdf`` plot. If that
    file already exists the tool is not run again. Exits with status 1 when
    geneBody_coverage2.py cannot be found.
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    in_bigwig = bam2bigwig(in_file, config)

    # <bigwig dir>/../coverage, created on demand
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir, "coverage")
    safe_makedir(out_dir)
    out_prefix = out_dir + "/wiggle"

    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    cmd = str(coverage_run.bake(i=in_bigwig, r=bed, o=out_prefix, t="pdf"))
    do.run(cmd, "Calculating coverage of %s." % (in_bigwig), None)
    return coverage_plot_file
def _build_command(input_file, ref, novoalign_config):
    """
    Assemble the novoalign command line as a flat list of strings.

    The executable located by which() comes first, followed by the options
    produced by flatten_options(novoalign_config), then the fixed arguments
    ``-o SAM -d <ref> -f <input_file>``.
    """
    executable = which("novoalign")
    user_options = flatten_options(novoalign_config)
    fixed_options = ["-o", "SAM", "-d", ref, "-f", input_file]
    nested = [executable, user_options] + fixed_options
    return [str(token) for token in flatten(nested)]
def wig2bigwig(wiggle_file, chrom_size_file, out_file):
    """
    Convert a wiggle file to a bigwig file using the UCSC wigToBigWig tool.

    wiggle_file: path of the wiggle file to convert
    chrom_size_file: path of the chromosome sizes file passed to the tool
    out_file: path of the bigwig file to produce

    Returns ``out_file``. If ``out_file`` already exists the tool is not run.
    Exits with status 1 when wigToBigWig cannot be found.
    """
    PROGRAM = "wigToBigWig"
    if not program_exists(PROGRAM):
        # The trailing space after the URL keeps it from running into
        # "to download it." in the logged message.
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/ "
                     "to download it." % (PROGRAM))
        exit(1)

    if file_exists(out_file):
        return out_file

    wigToBigWig = sh.Command(which(PROGRAM))
    # The tool writes to the path handed out by file_transaction rather
    # than to out_file directly.
    with file_transaction(out_file) as tx_out_file:
        wigToBigWig(wiggle_file, chrom_size_file, tx_out_file)
    return out_file
def junction_annotation(in_file, config, out_prefix=None):
    """
    Compile novel/known information about splice junctions.

    Runs junction_annotation.py on ``in_file`` with a BED file derived from
    the GTF named in ``config``. Returns the path of the expected
    ``.splice_junction.pdf`` file; when that file already exists the tool is
    not run. Exits with status 1 if the tool cannot be found.
    """
    tool = "junction_annotation.py"
    if not program_exists(tool):
        logger.info("%s is not in the path or is not executable." % (tool))
        exit(1)

    out_prefix = _get_out_prefix(in_file, config, out_prefix, "junction")
    junction_file = out_prefix + ".splice_junction.pdf"
    if not file_exists(junction_file):
        runner = sh.Command(which(tool))
        annotation_bed = _gtf2bed(_get_gtf(config))
        runner(i=in_file, o=out_prefix, r=annotation_bed)
    return junction_file
def RPKM_count(in_file, config, out_prefix=None):
    """
    Produce RPKM counts for ``in_file`` with RPKM_count.py.

    A BED file derived from the GTF named in ``config`` is used as the
    reference. Returns the path of the expected ``_read_count.xls`` file;
    when that file already exists nothing is run. Exits with status 1 if
    the tool cannot be found.
    """
    PROGRAM = "RPKM_count.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "RPKM_count"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_count_file = out_prefix + "_read_count.xls"

    # Check for existing output before converting the GTF, so that a rerun
    # with finished output does no extra work.
    if file_exists(rpkm_count_file):
        return rpkm_count_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    RPKM_count_run = sh.Command(which(PROGRAM))
    RPKM_count_run(i=in_file, r=bed, o=out_prefix)
    return rpkm_count_file
def genebody_coverage(in_file, config, out_prefix=None):
    """
    Check the 5'/3' bias across transcripts.

    Runs geneBody_coverage.py on ``in_file`` with a BED file derived from
    the GTF named in ``config``. Returns the path of the expected
    ``.geneBodyCoverage.pdf`` plot; when it already exists the tool is not
    run. Exits with status 1 if the tool cannot be found.
    """
    tool = "geneBody_coverage.py"
    if not program_exists(tool):
        logger.info("%s is not in the path or is not executable." % (tool))
        exit(1)

    out_prefix = _get_out_prefix(in_file, config, out_prefix, "coverage")
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if not file_exists(coverage_plot_file):
        annotation_bed = _gtf2bed(_get_gtf(config))
        runner = sh.Command(which(tool))
        runner(i=in_file, r=annotation_bed, o=out_prefix)
    return coverage_plot_file
def bam_stat(in_file, config, out_prefix=None):
    """
    Dump read mapping statistics from a SAM or BAM file to a text file.

    Runs bam_stat.py on ``in_file`` and captures the tool's stderr stream
    in ``<out_prefix>.txt``. Returns that path; when the file already
    exists the tool is not run. Exits with status 1 if the tool cannot be
    found.
    """
    tool = "bam_stat.py"
    if not program_exists(tool):
        logger.info("%s is not in the path or is not executable." % (tool))
        exit(1)

    out_prefix = _get_out_prefix(in_file, config, out_prefix, "bam_stat")
    out_file = out_prefix + ".txt"
    if not file_exists(out_file):
        runner = sh.Command(which(tool))
        with file_transaction(out_file) as tx_out_file:
            # stderr, not stdout, is what gets saved
            runner(i=in_file, _err=tx_out_file)
    return out_file
def wig2bigwig(wiggle_file, chrom_size_file, out_file):
    """
    Convert a wiggle file to a bigwig file using the UCSC wigToBigWig tool.

    wiggle_file: path of the wiggle file to convert
    chrom_size_file: path of the chromosome sizes file passed to the tool
    out_file: path of the bigwig file to produce

    The command is built with sh and executed through do.run(). Returns
    ``out_file``; if it already exists the tool is not run. Exits with
    status 1 when wigToBigWig cannot be found.
    """
    PROGRAM = "wigToBigWig"
    if not program_exists(PROGRAM):
        # The trailing space after the URL keeps it from running into
        # "to download it." in the logged message.
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/ "
                     "to download it." % (PROGRAM))
        exit(1)

    if file_exists(out_file):
        return out_file

    wigToBigWig = sh.Command(which(PROGRAM))
    # The tool writes to the path handed out by file_transaction rather
    # than to out_file directly.
    with file_transaction(out_file) as tx_out_file:
        cmd = str(wigToBigWig.bake(wiggle_file, chrom_size_file, tx_out_file))
        do.run(cmd, "Converting %s from wig to bigwig." % (wiggle_file), None)
    return out_file
def count_overlaps(in_file, bed, out_file=None):
    """
    Calculate coverage across the features in the BED file ``bed``.

    Runs ``coverageBed -abam in_file -b bed`` and writes its stdout to
    ``out_file``. When ``out_file`` is not given it is derived from
    ``in_file`` with the suffix replaced by ".counts".

    Returns ``out_file``. If it already exists the command is not run.
    Exits with status -1 when coverageBed cannot be found. If the command
    fails, the partially written output is removed and the error propagates
    (subprocess.CalledProcessError for a non-zero exit status).
    """
    if not which("coverageBed"):
        logger.error("Cannot find coverageBed. Make sure it is in your "
                     "path or install bedtools.")
        exit(-1)

    if not out_file:
        out_file = replace_suffix(in_file, ".counts")

    if os.path.exists(out_file):
        return out_file

    cmd = ["coverageBed", "-abam", in_file, "-b", bed]
    finished = False
    try:
        with open(out_file, "w") as out_handle:
            subprocess.check_call(cmd, stdout=out_handle)
        finished = True
    finally:
        # A leftover partial file would satisfy the os.path.exists() check
        # above on the next call and be returned as a valid result.
        if not finished and os.path.exists(out_file):
            os.remove(out_file)
    return out_file
def junction_saturation(in_file, config, out_prefix=None):
    """
    Check if splicing is deep enough to perform alternative splicing
    analysis.

    Runs junction_saturation.py on ``in_file`` with a BED file derived from
    the GTF named in ``config``. Returns the path of the expected
    ``.junctionSaturation_plot.pdf`` file; when it already exists the tool
    is not run. Exits with status 1 if the tool cannot be found.
    """
    tool = "junction_saturation.py"
    if not program_exists(tool):
        logger.info("%s is not in the path or is not executable." % (tool))
        exit(1)

    out_prefix = _get_out_prefix(in_file, config, out_prefix, "saturation")
    saturation_file = out_prefix + ".junctionSaturation_plot.pdf"
    if not file_exists(saturation_file):
        runner = sh.Command(which(tool))
        annotation_bed = _gtf2bed(_get_gtf(config))
        runner(i=in_file, o=out_prefix, r=annotation_bed)
    return saturation_file
def genebody_coverage(in_file, config, out_prefix=None):
    """
    Check the 5'/3' bias across transcripts.

    Builds a geneBody_coverage.py command for ``in_file`` with a BED file
    derived from the GTF named in ``config`` and executes it through
    do.run(). Returns the path of the expected ``.geneBodyCoverage.pdf``
    plot; when it already exists nothing is run. Exits with status 1 if
    the tool cannot be found.
    """
    tool = "geneBody_coverage.py"
    if not program_exists(tool):
        logger.info("%s is not in the path or is not executable." % (tool))
        exit(1)

    out_prefix = _get_out_prefix(in_file, config, out_prefix, "coverage")
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if not file_exists(coverage_plot_file):
        annotation_bed = _gtf2bed(_get_gtf(config))
        runner = sh.Command(which(tool))
        baked = runner.bake(i=in_file, r=annotation_bed, o=out_prefix)
        do.run(str(baked), "Calculating coverage of %s." % (in_file), None)
    return coverage_plot_file
def RPKM_saturation(in_file, config, out_prefix=None):
    """
    Estimate the precision of RPKM calculation by resampling.

    Runs RPKM_saturation.py on ``in_file`` with a BED file derived from the
    GTF named in ``config``. Returns the path of the expected
    ``.saturation.pdf`` file; when it already exists nothing is run. Exits
    with status 1 if the tool cannot be found.
    """
    PROGRAM = "RPKM_saturation.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "RPKM_saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_saturation_file = out_prefix + ".saturation.pdf"

    # Check for existing output before converting the GTF, so that a rerun
    # with finished output does no extra work.
    if file_exists(rpkm_saturation_file):
        return rpkm_saturation_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    RPKM_saturation_run = sh.Command(which(PROGRAM))
    RPKM_saturation_run(i=in_file, r=bed, o=out_prefix)
    return rpkm_saturation_file
def clipping_profile(in_file, config, out_prefix=None):
    """
    Estimate the clipping profile of the reads.

    Builds a clipping_profile.py command for ``in_file`` and executes it
    through do.run(). Returns the path of the expected
    ``.clipping_profile.pdf`` plot; when it already exists nothing is run.
    Exits with status 1 if the tool cannot be found.
    """
    PROGRAM = "clipping_profile.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "clipping"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    clip_plot_file = out_prefix + ".clipping_profile.pdf"
    # Report the target path through the logger rather than a bare print
    # statement, which wrote to stdout and is a syntax error on Python 3.
    logger.info("Clipping profile plot: %s" % (clip_plot_file))
    if file_exists(clip_plot_file):
        return clip_plot_file

    clip_run = sh.Command(which(PROGRAM))
    cmd = str(clip_run.bake(i=in_file, o=out_prefix))
    do.run(cmd, "Calculating 5' and 3' clipping profile of %s." % (in_file),
           None)
    return clip_plot_file
def RPKM_count(in_file, config, out_prefix=None):
    """
    Produce RPKM counts for ``in_file`` with RPKM_count.py.

    A BED file derived from the GTF named in ``config`` is used as the
    reference and the command is executed through do.run(). Returns the
    path of the expected ``_read_count.xls`` file; when that file already
    exists nothing is run. Exits with status 1 if the tool cannot be found.
    """
    PROGRAM = "RPKM_count.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "RPKM_count"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_count_file = out_prefix + "_read_count.xls"

    # Check for existing output before converting the GTF, so that a rerun
    # with finished output does no extra work.
    if file_exists(rpkm_count_file):
        return rpkm_count_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    RPKM_count_run = sh.Command(which(PROGRAM))
    cmd = str(RPKM_count_run.bake(i=in_file, r=bed, o=out_prefix))
    do.run(cmd, "Calculating RPKM of %s using reference %s." % (in_file, bed),
           None)
    return rpkm_count_file
def junction_annotation(in_file, config, out_prefix=None):
    """
    Compile novel/known information about splice junctions.

    Builds a junction_annotation.py command for ``in_file`` with a BED file
    derived from the GTF named in ``config`` and executes it through
    do.run(). Returns the path of the expected ``.splice_junction.pdf``
    file; when it already exists nothing is run. Exits with status 1 if the
    tool cannot be found.
    """
    tool = "junction_annotation.py"
    if not program_exists(tool):
        logger.info("%s is not in the path or is not executable." % (tool))
        exit(1)

    out_prefix = _get_out_prefix(in_file, config, out_prefix, "junction")
    junction_file = out_prefix + ".splice_junction.pdf"
    if not file_exists(junction_file):
        runner = sh.Command(which(tool))
        annotation_bed = _gtf2bed(_get_gtf(config))
        baked = runner.bake(i=in_file, o=out_prefix, r=annotation_bed)
        description = ("Calculating novel/known information about splice "
                       "junctions of "
                       "%s." % (in_file))
        do.run(str(baked), description, None)
    return junction_file
def bam_stat(in_file, config, out_prefix=None):
    """
    Dump read mapping statistics from a SAM or BAM file to a text file.

    Runs bam_stat.py on ``in_file`` and captures the tool's stderr stream
    in ``<out_prefix>.txt``. Returns that path; when the file already
    exists the tool is not run. Exits with status 1 if the tool cannot be
    found.
    """
    PROGRAM = "bam_stat.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "bam_stat"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    out_file = out_prefix + ".txt"
    if file_exists(out_file):
        return out_file

    bam_stat_run = sh.Command(which(PROGRAM))
    with file_transaction(out_file) as tx_out_file:
        # The command is invoked through sh directly. ``_err`` is an sh
        # special keyword argument, not a command-line option, so
        # str(cmd.bake(..., _err=...)) drops it; the resulting command
        # string would run without the redirect and the output file would
        # never be written.
        logger.info("Calculating BAM statistics from %s." % (in_file))
        bam_stat_run(i=in_file, _err=tx_out_file)
    return out_file
def RPKM_saturation(in_file, config, out_prefix=None):
    """
    Estimate the precision of RPKM calculation by resampling.

    Builds an RPKM_saturation.py command for ``in_file`` with a BED file
    derived from the GTF named in ``config`` and executes it through
    do.run(). Returns the path of the expected ``.saturation.pdf`` file;
    when it already exists nothing is run. Exits with status 1 if the tool
    cannot be found.
    """
    PROGRAM = "RPKM_saturation.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "RPKM_saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_saturation_file = out_prefix + ".saturation.pdf"

    # Check for existing output before converting the GTF, so that a rerun
    # with finished output does no extra work.
    if file_exists(rpkm_saturation_file):
        return rpkm_saturation_file

    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    RPKM_saturation_run = sh.Command(which(PROGRAM))
    cmd = str(RPKM_saturation_run.bake(i=in_file, r=bed, o=out_prefix))
    do.run(cmd, "Calculating RPKM saturation of %s." % in_file, None)
    return rpkm_saturation_file
def check_program(self, config):
    """
    Validate the 'program' section of ``config``.

    A missing or empty section only triggers a warning and counts as
    valid. Entries named in ``need_dirs`` must point at an existing
    directory; every other entry must be resolvable by which(). The first
    entry that fails is reported through the matching error helper and
    False is returned; otherwise True is returned.
    """
    programs = config.get("program", None)
    if not programs:
        self._field_missing_warning("program")
        return True

    # programs that are configured as a directory rather than an executable
    need_dirs = ["picard"]
    for program, loc in programs.items():
        if program in need_dirs:
            if not os.path.isdir(loc):
                self._directory_missing_error(program, loc)
                return False
            logger.info("%s lives in %s." % (program, loc))
        elif which(loc):
            logger.info("%s will be run as %s." % (program, loc))
        else:
            self._executable_missing_error(program, loc)
            return False

    logger.info("The 'program' section of configuration seems valid.")
    return True
def junction_saturation(in_file, config, out_prefix=None):
    """
    Check if splicing is deep enough to perform alternative splicing
    analysis.

    Builds a junction_saturation.py command for ``in_file`` with a BED file
    derived from the GTF named in ``config`` and executes it through
    do.run(). Returns the path of the expected
    ``.junctionSaturation_plot.pdf`` file; when it already exists nothing
    is run. Exits with status 1 if the tool cannot be found.
    """
    tool = "junction_saturation.py"
    if not program_exists(tool):
        logger.info("%s is not in the path or is not executable." % (tool))
        exit(1)

    out_prefix = _get_out_prefix(in_file, config, out_prefix, "saturation")
    saturation_file = out_prefix + ".junctionSaturation_plot.pdf"
    if not file_exists(saturation_file):
        runner = sh.Command(which(tool))
        annotation_bed = _gtf2bed(_get_gtf(config))
        baked = runner.bake(i=in_file, o=out_prefix, r=annotation_bed)
        description = ("Calculating junction saturation estimation of %s."
                       % in_file)
        do.run(str(baked), description, None)
    return saturation_file
def clipping_profile(in_file, config, out_prefix=None):
    """
    Estimate the clipping profile of the reads.

    Runs clipping_profile.py on ``in_file``. Returns the path of the
    expected ``.clipping_profile.pdf`` plot; when it already exists the
    tool is not run. Exits with status 1 if the tool cannot be found.
    """
    PROGRAM = "clipping_profile.py"
    if not program_exists(PROGRAM):
        logger.info("%s is not in the path or is not executable." % (PROGRAM))
        exit(1)

    prefix = "clipping"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    clip_plot_file = out_prefix + ".clipping_profile.pdf"
    # Report the target path through the logger rather than a bare print
    # statement, which wrote to stdout and is a syntax error on Python 3.
    logger.info("Clipping profile plot: %s" % (clip_plot_file))
    if file_exists(clip_plot_file):
        return clip_plot_file

    clip_run = sh.Command(which(PROGRAM))
    clip_run(i=in_file, o=out_prefix)
    # NOTE(review): a disabled workaround used to move "clipping_profile.pdf"
    # from the script execution directory to clip_plot_file, because the tool
    # saved its plot there. Confirm the tool now honours the -o prefix;
    # otherwise the returned path will not exist.
    return clip_plot_file
def bam2bigwig(in_file, config, out_prefix=None):
    """
    Convert a BAM file to bigwig; assumes the library preparation was not
    strand specific for now.

    bam2wig.py produces ``<out_prefix>.wig`` unless that file already
    exists, and wig2bigwig() then turns it into ``<out_prefix>.bw``. The
    chromosome sizes file comes from ``config["annotation"]`` when set
    there, otherwise from _fetch_chrom_sizes(config).

    Returns whatever wig2bigwig() returns. Exits with status 1 if
    bam2wig.py cannot be found.
    """
    tool = "bam2wig.py"
    if not program_exists(tool):
        logger.info("%s is not in the path or is not executable." % (tool))
        exit(1)

    chrom_size_file = config["annotation"].get("chrom_size_file", None)
    out_prefix = _get_out_prefix(in_file, config, out_prefix, "bigwig")
    if not chrom_size_file:
        chrom_size_file = _fetch_chrom_sizes(config)

    wiggle_file = out_prefix + ".wig"
    if not file_exists(wiggle_file):
        converter = sh.Command(which(tool))
        converter(i=in_file, s=chrom_size_file, o=out_prefix)

    return wig2bigwig(wiggle_file, chrom_size_file, out_prefix + ".bw")
def bam2bigwig(in_file, config, out_prefix=None):
    """
    Convert a BAM file to bigwig; assumes the library preparation was not
    strand specific for now.

    A bam2wig.py command producing ``<out_prefix>.wig`` is executed through
    do.run() unless that file already exists, and wig2bigwig() then turns
    it into ``<out_prefix>.bw``. The chromosome sizes file comes from
    ``config["annotation"]`` when set there, otherwise from
    _fetch_chrom_sizes(config).

    Returns whatever wig2bigwig() returns. Exits with status 1 if
    bam2wig.py cannot be found.
    """
    tool = "bam2wig.py"
    if not program_exists(tool):
        logger.info("%s is not in the path or is not executable." % (tool))
        exit(1)

    chrom_size_file = config["annotation"].get("chrom_size_file", None)
    out_prefix = _get_out_prefix(in_file, config, out_prefix, "bigwig")
    if not chrom_size_file:
        chrom_size_file = _fetch_chrom_sizes(config)

    wiggle_file = out_prefix + ".wig"
    if not file_exists(wiggle_file):
        converter = sh.Command(which(tool))
        baked = converter.bake(i=in_file, s=chrom_size_file, o=out_prefix)
        do.run(str(baked), "Converting %s from BAM to bigwig" % (in_file),
               None)

    return wig2bigwig(wiggle_file, chrom_size_file, out_prefix + ".bw")
def program_exists(path):
    """
    Look up ``path`` with which() and return whatever it finds.

    Callers in this file use the result only for its truthiness.
    """
    located = which(path)
    return located
def _build_command(input_file, ref, novoalign_config):
    """
    Build the novoalign invocation as a flat list of strings.

    Order of the pieces: the executable found by which(), the options from
    flatten_options(novoalign_config), then ``-o SAM -d <ref> -f
    <input_file>``. Every element is passed through str().
    """
    pieces = [which("novoalign"), flatten_options(novoalign_config)]
    pieces.extend(["-o", "SAM"])
    pieces.extend(["-d", ref])
    pieces.extend(["-f", input_file])
    return [str(piece) for piece in flatten(pieces)]