def _start_message(self, in_file, **kwargs):
    if kwargs:
        logger.info("Starting %s on %s with arguments %s." %
                    (self.stage, in_file, kwargs))
    else:
        logger.info("Starting %s on %s." % (self.stage, in_file))

def start_cluster(cluster_config):
    global cluster, view, client, direct_view
    cluster = Cluster(**cluster_config["cluster"])
    logger.info("Starting the cluster with %d nodes." % (cluster.n))
    cluster.start()
    sleep(cluster.delay)
    # only continue when the cluster is completely up
    slept = 0
    while not cluster.is_up():
        sleep(cluster.delay)
        slept = slept + cluster.delay
        if slept > cluster_config["cluster"].get("timeout",
                                                 DEFAULT_CLUSTER_TIMEOUT):
            logger.error("Cluster startup timed out.")
            cluster.stop()
            exit(-1)
    logger.info("Cluster up.")
    client = cluster.client()
    view = cluster.view()
    direct_view = cluster.direct_view()
    # push the config out to the engines and set up logging on each of them
    engine_config = cluster_config.copy()
    engine_config["engine_log"] = True
    direct_view["config"] = engine_config
    direct_view.execute("from bipy.log import setup_logging")
    direct_view.execute("setup_logging(config)")

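# Usage sketch for start_cluster (added example). The keys inside "cluster"
# are assumptions about what Cluster(**kwargs) accepts and are not taken
# from the code above; "timeout" is the only key read directly here.
def _example_start_cluster():
    cluster_config = {"cluster": {"profile": "default",  # hypothetical
                                  "n": 4,                # hypothetical
                                  "delay": 10,           # hypothetical
                                  "timeout": 300}}
    start_cluster(cluster_config)
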
def hard_clip(in_file, bases=8, right_side=True, quality_format="sanger",
              out_file=None):
    """
    hard clip a fastq file by removing N bases from each read

    bases is the number of bases to clip
    right_side is True to trim from the right side, False to trim
    from the left

    example: hard_clip(fastq_file, bases=4, right_side=False)
    """
    if right_side:
        logger.info("Hard clipping %d bases from the right side of "
                    "reads in %s." % (bases, in_file))
    else:
        logger.info("Hard clipping %d bases from the left side of "
                    "reads in %s." % (bases, in_file))
    quality_type = QUALITY_TYPE_HARD_TRIM[quality_format]
    if not out_file:
        out_file = append_stem(in_file, "clip")
    if file_exists(out_file):
        return out_file
    in_iterator = SeqIO.parse(in_file, quality_type)
    out_iterator = (_trim_read(record, bases, right_side) for record in
                    in_iterator)
    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            SeqIO.write(out_iterator, out_handle, quality_type)
    return out_file

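# Usage sketch for hard_clip (added example; "reads.fastq" is a
# hypothetical sanger-encoded input file). Clips 4 bases from the left
# end of every read and returns the path to the clipped fastq file.
def _example_hard_clip():
    return hard_clip("reads.fastq", bases=4, right_side=False)
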
def genebody_coverage2(in_file, config, out_prefix=None):
    """
    used to check the 5'/3' bias across transcripts; takes a bam file,
    converts it to bigwig and then runs geneBody_coverage2.py on that
    """
    PROGRAM = "geneBody_coverage2.py"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable." %
                     (PROGRAM))
        exit(1)
    in_bigwig = bam2bigwig(in_file, config)
    prefix = "coverage"
    out_dir = os.path.join(os.path.dirname(in_bigwig), os.pardir,
                           "coverage")
    safe_makedir(out_dir)
    out_prefix = out_dir + "/wiggle"
    #out_prefix = _get_out_prefix(in_bigwig, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    coverage_run(i=in_bigwig, r=bed, o=out_prefix, t="pdf")
    return coverage_plot_file

def filter_reads_by_length(fq1, fq2, min_length=30):
    """
    removes read pairs from a pair of fastq files where either read is
    shorter than min_length; mates that pass on their own are written
    to separate singles files
    """
    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    # just pick the first one if it can be multiple types
    quality_type = QUALITY_TYPE[DetectFastqFormat.run(fq1)[0]]
    fq1_out = append_stem(fq1, "fixed")
    fq2_out = append_stem(fq2, "fixed")
    fq1_single = append_stem(fq1, "singles")
    fq2_single = append_stem(fq2, "singles")
    if all(map(file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_type)
    fq2_in = SeqIO.parse(fq2, quality_type)

    with open(fq1_out, 'w') as fq1_out_handle, \
         open(fq2_out, 'w') as fq2_out_handle, \
         open(fq1_single, 'w') as fq1_single_handle, \
         open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if (len(fq1_record.seq) >= min_length and
                    len(fq2_record.seq) >= min_length):
                fq1_out_handle.write(fq1_record.format(quality_type))
                fq2_out_handle.write(fq2_record.format(quality_type))
            else:
                # keep mates that pass on their own in the singles files
                if len(fq1_record.seq) >= min_length:
                    fq1_single_handle.write(fq1_record.format(quality_type))
                if len(fq2_record.seq) >= min_length:
                    fq2_single_handle.write(fq2_record.format(quality_type))
    return [fq1_out, fq2_out]

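# Usage sketch for filter_reads_by_length (added example; the fastq paths
# are hypothetical). Drops pairs where either mate is shorter than 30
# bases; mates that pass on their own are written to the singles files.
def _example_filter_reads_by_length():
    return filter_reads_by_length("sample_1.fastq", "sample_2.fastq",
                                  min_length=30)
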
def _run_fastqc(curr_files, config):
    logger.info("Running fastqc on %s" % (str(curr_files)))
    nfiles = len(curr_files)
    fastqc_config = config["stage"]["fastqc"]
    out_files = view.map(fastqc.run, curr_files,
                         [fastqc_config] * nfiles, [config] * nfiles)
    return out_files

def _run_se(self, in_file):
    # cut polyA tails and adapters off
    logger.info("Running cutadapt in single end mode on %s." % (in_file))
    trimmed_file = self._cut_file(in_file)
    out_file = self._get_lf_file(trimmed_file)
    if file_exists(out_file):
        return out_file
    fastq.filter_single_reads_by_length(trimmed_file, self.length_cutoff)
    return out_file

def _run_pe(self, in_files):
    logger.info("Running cutadapt in paired end mode on %s." % (in_files))
    trimmed_files = map(self._cut_file, in_files)
    out_files = map(self._get_lf_file, trimmed_files)
    if all(map(file_exists, out_files)):
        return out_files
    fastq.filter_reads_by_length(trimmed_files[0], trimmed_files[1],
                                 self.length_cutoff)
    return out_files

def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    in_dir = config["dir"]["data"]
    id_file = config["id_file"]
    curr_files = input_files_from_dir(in_dir, id_file)
    logger.info("Running pipeline on %s." % (curr_files))

    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = fastqc.FastQC(config)
            view.map(stage_runner, curr_files, block=False)

        if stage == "cutadapt":
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = trim.Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "bowtie":
            logger.info("Running bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            curr_files = view.map(bowtie, curr_files)
            mapped = view.map(sam.only_mapped, curr_files)
            unmapped = view.map(sam.only_unmapped, curr_files)
            curr_files = mapped
            bam_files = view.map(sam.sam2bam, mapped)
            bam_sorted = view.map(sam.bamsort, bam_files)
            view.map(sam.bamindex, bam_sorted)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files,
                                 [ref] * nrun, [ribo] * nrun, out_files)

    stop_cluster()

def _cut_file(self, in_file):
    """ run cutadapt on a single file """
    adapters = self._get_adapters(self.chemistry)
    out_file = self.in2trimmed(in_file)
    if file_exists(out_file):
        return out_file
    cutadapt = sh.Command(self.stage_config.get("program", "cutadapt"))
    quality_format = self.quality_format
    if not quality_format:
        quality_format = self._detect_fastq_format(in_file)
    if quality_format == "sanger":
        logger.info("Quality format detected as sanger.")
        quality_base = 33
    elif quality_format == "illumina":
        logger.info("Quality format set to illumina 1.5/1.3")
        quality_base = 64
    else:
        logger.error("Quality format could not be detected. Quality "
                     "detected or set as %s. It should be illumina "
                     "or sanger." % (quality_format))
        exit(1)

    # if we want to trim the polya tails we have to first remove
    # the adapters and then trim the tail
    if self.stage_config.get("trim_polya", True):
        temp_cut = tempfile.NamedTemporaryFile(suffix=".fastq",
                                               dir=self.out_dir)
        # trim off adapters
        cutadapt(in_file, self.options, adapters,
                 quality_base=quality_base, _out=temp_cut.name)
        with file_transaction(out_file) as temp_out:
            polya = ADAPTERS.get("polya")
            # trim off polya
            cutadapt(temp_cut.name, self.options, "-a", polya, "-a",
                     self._rc_adapters(polya), quality_base=quality_base,
                     _out=temp_out)
        return out_file
    else:
        with file_transaction(out_file) as temp_out:
            cutadapt(in_file, self.options, adapters, _out=temp_out)
        return out_file

def test_cluster():
    with open(CONFIG_FILE) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    from bipy.log import logger
    start_cluster(config)
    from bipy.cluster import view

    logger.info("Serial result")
    serial_result = map(mappable_function, range(32))
    logger.info("Parallel result")
    parallel_result = view.map(mappable_function, range(32))
    assert (serial_result == parallel_result)

def __init__(self, config):
    self.config = config
    self.plugins = {}
    #self.scan(get_in(config, "dir", "plugins"))
    plugin_dir = get_in(config, ("dir", "plugins"))
    if plugin_dir:
        logger.info("Scanning %s for plugins." % plugin_dir)
        # create a synthetic "plugins" package rooted at plugin_dir so
        # plugin modules can be imported and scanned
        plugins = types.ModuleType("plugins")
        plugins.__path__ = [plugin_dir]
        sys.modules["plugins"] = plugins
        self.scan(plugin_dir)
    else:
        self.scan()

def _run_trim(curr_files, config):
    logger.info("Trimming poor quality ends from %s" % (str(curr_files)))
    nfiles = len(curr_files)
    min_length = str(config["stage"]["trim"].get("min_length", 20))
    pair = str(config["stage"]["trim"].get("pair", "se"))
    platform = str(config["stage"]["trim"].get("platform", "sanger"))
    out_dir = os.path.join(config["dir"]["results"], "trimmed")
    safe_makedir(out_dir)
    out_files = [append_stem(os.path.basename(x), "trim")
                 for x in curr_files]
    out_files = [os.path.join(out_dir, x) for x in out_files]
    out_files = view.map(sickle.run, curr_files, [pair] * nfiles,
                         [platform] * nfiles, [min_length] * nfiles,
                         out_files)
    return out_files

def is_up(self):
    """ returns True if the cluster is completely up and False otherwise """
    try:
        up = len(self.client().ids)
    except IOError:
        logger.info("Waiting for the controller to come up.")
        return False
    else:
        not_up = self.n - up
        if not_up > 0:
            logger.info("Waiting for %d engines to come up." % (not_up))
            return False
        else:
            return True

def run_with_config(input_file, config, control_file=None, stage=None):
    if stage is None:
        stage = "macs"
    if stage not in config["stage"]:
        logger.error("Cannot find the stage %s in the config." % (stage))
    stage_config = config["stage"][stage]
    options = stage_config.get("options", [])
    out_dir = os.path.join(config["dir"].get("results", None), stage)
    safe_makedir(out_dir)
    out_files = run(input_file, options, control_file, out_dir)
    return out_files

def _download_encode(input_file, config):
    """ download the encode files listed in input_file """
    NAME_FIELD = 0
    if not os.path.exists(input_file):
        logger.error("%s does not exist, aborting." % (input_file))
        exit(-1)

    with open(input_file) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        files = [x[NAME_FIELD] for x in reader]

    logger.info("Downloading %s." % (files))
    data_dir = config["dir"].get("data", "data")
    out_files = view.map(_download_ref, files, [data_dir] * len(files))
    return out_files

def annotate_table_with_biomart(in_file, join_column, filter_type,
                                organism, out_file=None):
    """
    join_column is the column to perform the lookups on
    filter_type describes the type of the join_column (see the getBM
    documentation in R for details)
    organism is the english name of the organism

    example: annotate_table_with_biomart(in_file, "id",
                                         "ensembl_gene_id", "human")
    """
    if organism not in ORG_TO_ENSEMBL:
        logger.error("organism not supported")
        exit(1)
    logger.info("Annotating %s." % (in_file))
    if not out_file:
        out_file = append_stem(in_file, "annotated")
    if os.path.exists(out_file):
        return out_file
    # use biomaRt to annotate the data file
    r = robjects.r
    r.assign('join_column', join_column)
    r.assign('in_file', in_file)
    r.assign('out_file', out_file)
    r.assign('ensembl_gene', ORG_TO_ENSEMBL[organism]["gene_ensembl"])
    r.assign('gene_symbol', ORG_TO_ENSEMBL[organism]["gene_symbol"])
    r.assign('filter_type', filter_type)
    r('''
    library(biomaRt)
    ensembl = useMart("ensembl", dataset = ensembl_gene)
    d = read.table(in_file, header=TRUE)
    a = getBM(attributes=c(filter_type, gene_symbol, "description"),
              filters=c(filter_type), values=d[,join_column],
              mart=ensembl)
    m = merge(d, a, by.x=join_column, by.y=filter_type)
    write.table(m, out_file, quote=FALSE, row.names=FALSE, sep="\t")
    ''')
    return out_file

def filter_single_reads_by_length(in_file, min_length=30):
    """
    removes reads from a fastq file which are below min_length in bases
    """
    logger.info("Removing reads in %s that are less than %d bases." %
                (in_file, min_length))
    quality_type = QUALITY_TYPE[DetectFastqFormat.run(in_file)[0]]
    out_file = append_stem(in_file, "fixed")
    if file_exists(out_file):
        return out_file
    in_iterator = SeqIO.parse(in_file, quality_type)
    out_iterator = (record for record in in_iterator
                    if len(record.seq) > min_length)
    with file_transaction(out_file) as tmp_out_file:
        with open(tmp_out_file, "w") as out_handle:
            SeqIO.write(out_iterator, out_handle, quality_type)
    return out_file

def junction_annotation(in_file, config, out_prefix=None):
    """ compile novel/known information about splice junctions """
    PROGRAM = "junction_annotation.py"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable." %
                     (PROGRAM))
        exit(1)
    prefix = "junction"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    junction_file = out_prefix + ".splice_junction.pdf"
    if file_exists(junction_file):
        return junction_file
    junction_run = sh.Command(which(PROGRAM))
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    junction_run(i=in_file, o=out_prefix, r=bed)
    return junction_file

def RPKM_count(in_file, config, out_prefix=None):
    """ produce RPKM counts with RPKM_count.py """
    PROGRAM = "RPKM_count.py"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable." %
                     (PROGRAM))
        exit(1)
    prefix = "RPKM_count"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_count_file = out_prefix + "_read_count.xls"
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    if file_exists(rpkm_count_file):
        return rpkm_count_file
    RPKM_count_run = sh.Command(which(PROGRAM))
    RPKM_count_run(i=in_file, r=bed, o=out_prefix)
    return rpkm_count_file

def genebody_coverage(in_file, config, out_prefix=None):
    """ used to check the 5'/3' bias across transcripts """
    PROGRAM = "geneBody_coverage.py"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable." %
                     (PROGRAM))
        exit(1)
    prefix = "coverage"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    coverage_plot_file = out_prefix + ".geneBodyCoverage.pdf"
    if file_exists(coverage_plot_file):
        return coverage_plot_file
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    coverage_run = sh.Command(which(PROGRAM))
    coverage_run(i=in_file, r=bed, o=out_prefix)
    return coverage_plot_file

def bam_stat(in_file, config, out_prefix=None):
    """ dump read mapping statistics from a SAM or BAM file to out_file """
    PROGRAM = "bam_stat.py"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable." %
                     (PROGRAM))
        exit(1)
    prefix = "bam_stat"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    out_file = out_prefix + ".txt"
    if file_exists(out_file):
        return out_file
    bam_stat_run = sh.Command(which(PROGRAM))
    with file_transaction(out_file) as tx_out_file:
        bam_stat_run(i=in_file, _err=tx_out_file)
    return out_file

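# Usage sketch for bam_stat (added example; "sample.bam" is a hypothetical
# input and config is assumed to be the parsed yaml pipeline config used
# by the other stages). Writes the bam_stat.py summary to <out_prefix>.txt
# and returns that path.
def _example_bam_stat(config):
    return bam_stat("sample.bam", config)
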
def RPKM_saturation(in_file, config, out_prefix=None):
    """ estimate the precision of the RPKM calculation by resampling """
    PROGRAM = "RPKM_saturation.py"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable." %
                     (PROGRAM))
        exit(1)
    prefix = "RPKM_saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    rpkm_saturation_file = out_prefix + ".saturation.pdf"
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    if file_exists(rpkm_saturation_file):
        return rpkm_saturation_file
    RPKM_saturation_run = sh.Command(which(PROGRAM))
    RPKM_saturation_run(i=in_file, r=bed, o=out_prefix)
    return rpkm_saturation_file

def junction_saturation(in_file, config, out_prefix=None):
    """
    check if sequencing is deep enough to perform alternative splicing
    analysis
    """
    PROGRAM = "junction_saturation.py"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable." %
                     (PROGRAM))
        exit(1)
    prefix = "saturation"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    saturation_file = out_prefix + ".junctionSaturation_plot.pdf"
    if file_exists(saturation_file):
        return saturation_file
    saturation_run = sh.Command(which(PROGRAM))
    gtf = _get_gtf(config)
    bed = _gtf2bed(gtf)
    saturation_run(i=in_file, o=out_prefix, r=bed)
    return saturation_file

def run(in_file, ref, blastn_config, config):
    logger.info("Preparing the reference file for %s." % (ref.get("name")))
    ref_file = prepare_ref_file(ref, config)
    logger.info("Preparing the blast database for %s." % (ref.get("name")))
    blast_db = prepare_blast_db(ref_file, "nucl")
    logger.info("Blasting %s against %s." % (in_file, ref.get("name")))
    results_dir = build_results_dir(blastn_config, config)
    utils.safe_makedir(results_dir)
    out_file = os.path.join(
        results_dir,
        replace_suffix(os.path.basename(in_file),
                       ref.get("name") + "hits.tsv"))
    tmp_out = out_file + ".tmp"
    blast_results = blast_search(in_file, blast_db, tmp_out)
    #logger.info("Filtering results for at least %f percent of the "
    #            "sequences covered." % (0.5 * 100))
    #filtered_results = filter_results_by_length(blast_results, 0.5)
    #logger.info("Filtered output file here: %s" % (filtered_results))
    with open(blast_results) as in_handle:
        reader = csv.reader(in_handle, delimiter="\t")
        with open(out_file, "w") as out_handle:
            writer = csv.writer(out_handle, delimiter="\t")
            writer.writerow(HEADER_FIELDS.split(" "))
            for line in reader:
                writer.writerow(line)
    return out_file

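# Usage sketch for run (added example). The layout of the ref dict is an
# assumption based only on the ref.get("name") calls above; the fasta and
# input file paths are hypothetical.
def _example_blastn_run(blastn_config, config):
    ref = {"name": "phix", "file": "phix.fa"}  # hypothetical ref entry
    return run("contigs.fasta", ref, blastn_config, config)
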
def clipping_profile(in_file, config, out_prefix=None):
    """ estimate the clipping profile of the reads """
    PROGRAM = "clipping_profile.py"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable." %
                     (PROGRAM))
        exit(1)
    prefix = "clipping"
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    clip_plot_file = out_prefix + ".clipping_profile.pdf"
    if file_exists(clip_plot_file):
        return clip_plot_file
    clip_run = sh.Command(which(PROGRAM))
    clip_run(i=in_file, o=out_prefix)
    # hack to get around the fact that clipping_profile saves the file in
    # the script execution directory
    #sh.mv("clipping_profile.pdf", clip_plot_file)
    return clip_plot_file

def bam2bigwig(in_file, config, out_prefix=None):
    """
    assumes the library preparation was not strand specific for now
    """
    PROGRAM = "bam2wig.py"
    if not program_exists(PROGRAM):
        logger.error("%s is not in the path or is not executable." %
                     (PROGRAM))
        exit(1)
    prefix = "bigwig"
    chrom_size_file = config["annotation"].get("chrom_size_file", None)
    out_prefix = _get_out_prefix(in_file, config, out_prefix, prefix)
    if not chrom_size_file:
        chrom_size_file = _fetch_chrom_sizes(config)
    wiggle_file = out_prefix + ".wig"
    if not file_exists(wiggle_file):
        bam2wig = sh.Command(which(PROGRAM))
        bam2wig(i=in_file, s=chrom_size_file, o=out_prefix)
    bigwig_file = out_prefix + ".bw"
    return wig2bigwig(wiggle_file, chrom_size_file, bigwig_file)

def __call__(self, in_file):
    self._start_message(in_file)
    if isinstance(in_file, basestring):
        logger.info("Detected %s as non-paired." % in_file)
        out_file = run_with_config(in_file, None, self.ref,
                                   self.stage, self.config)
    elif is_pair(in_file):
        logger.info("Detected %s as a pair." % in_file)
        out_file = run_with_config(in_file[0], in_file[1],
                                   self.ref, self.stage, self.config)
    else:
        logger.info("Detected %s as non-paired." % in_file)
        out_file = run_with_config(in_file[0], None, self.ref,
                                   self.stage, self.config)
    self._end_message(in_file)
    return out_file

def _end_message(self, in_file):
    logger.info("%s complete on %s." % (self.stage, in_file))

def _start_message(self, in_file):
    logger.info("Starting %s on %s." % (self.stage, in_file))

def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for thesis pipeline
    input_dirs = config["input_dirs"]
    results_dir = config["dir"].get("results", "results")
    input_files = _find_input_files(config)
    conditions = _group_input_by_condition(input_files)
    logger.info("Input files: %s" % (input_files))
    logger.info("Condition groups: %s" % (conditions))
    htseq_outdict = {}

    for condition, curr_files in conditions.items():
        condition_dir = os.path.join(results_dir, condition)
        safe_makedir(condition_dir)
        config["dir"]["results"] = condition_dir

        for stage in config["run"]:
            if stage == "fastqc":
                logger.info("Running fastqc on %s." % (curr_files))
                stage_runner = FastQC(config)
                view.map(stage_runner, curr_files)

            if stage == "cutadapt":
                logger.info("Running cutadapt on %s." % (curr_files))
                stage_runner = Cutadapt(config)
                curr_files = view.map(stage_runner, curr_files)

            if stage == "tophat":
                logger.info("Running tophat on %s." % (curr_files))
                stage_runner = Tophat(config)
                tophat_outputs = view.map(stage_runner, curr_files)
                bamfiles = view.map(sam.sam2bam, tophat_outputs)
                bamsort = view.map(sam.bamsort, bamfiles)
                view.map(sam.bamindex, bamsort)
                final_bamfiles = bamsort
                curr_files = tophat_outputs

            if stage == "htseq-count":
                _emit_stage_message(stage, curr_files)
                htseq_config = _get_stage_config(config, stage)
                htseq_args = zip(*product(curr_files, [config], [stage]))
                htseq_outputs = view.map(htseq_count.run_with_config,
                                         *htseq_args)
                htseq_outdict[condition] = htseq_outputs

            if stage == "coverage":
                logger.info("Calculating RNASeq metrics on %s." %
                            (curr_files))
                nrun = len(curr_files)
                ref = prepare_ref_file(config["stage"][stage]["ref"],
                                       config)
                ribo = config["stage"][stage]["ribo"]
                picard = BroadRunner(config["program"]["picard"])
                out_dir = os.path.join(results_dir, stage)
                safe_makedir(out_dir)
                out_files = [replace_suffix(os.path.basename(x), "metrics")
                             for x in curr_files]
                out_files = [os.path.join(out_dir, x) for x in out_files]
                out_files = view.map(picardrun.picard_rnaseq_metrics,
                                     [picard] * nrun, curr_files,
                                     [ref] * nrun, [ribo] * nrun,
                                     out_files)

            if stage == "rseqc":
                _emit_stage_message(stage, curr_files)
                rseqc_config = _get_stage_config(config, stage)
                rseq_args = zip(*product(curr_files, [config]))
                view.map(rseqc.bam_stat, *rseq_args)
                view.map(rseqc.genebody_coverage, *rseq_args)
                view.map(rseqc.junction_annotation, *rseq_args)
                view.map(rseqc.junction_saturation, *rseq_args)
                RPKM_args = zip(*product(final_bamfiles, [config]))
                RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
                RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                            RPKM_count_out)
                """
                annotate_args = zip(*product(RPKM_count_fixed,
                                             ["gene_id"],
                                             ["ensembl_gene_id"],
                                             ["human"]))
                view.map(annotate.annotate_table_with_biomart,
                         *annotate_args)
                """
                view.map(rseqc.RPKM_saturation, *rseq_args)
                curr_files = tophat_outputs

    # combine htseq-count files and run deseq on them
    conditions, htseq_files = dict_to_vectors(htseq_outdict)
    deseq_config = _get_stage_config(config, "deseq")
    cell_types = _group_input_by_cell_type(htseq_files)
    for cell_type, files in cell_types.items():
        for comparison in deseq_config["comparisons"]:
            comparison_name = "_vs_".join(comparison)
            deseq_dir = os.path.join(results_dir, "deseq", cell_type,
                                     comparison_name)
            safe_makedir(deseq_dir)
            out_file = os.path.join(deseq_dir,
                                    comparison_name + ".counts.txt")
            files_by_condition = _group_input_by_condition(files)
            _emit_stage_message("deseq", files_by_condition)
            c, f = dict_to_vectors(files_by_condition)
            combined_out = htseq_count.combine_counts(f, None, out_file)
            deseq_out = os.path.join(deseq_dir, comparison_name)
            logger.info("Running deseq on %s with conditions %s "
                        "and writing to %s" % (combined_out, c, deseq_out))
            deseq_out = view.map(deseq.run, [combined_out], [c],
                                 [deseq_out])
            annotate.annotate_table_with_biomart(deseq_out[0], "id",
                                                 "ensembl_gene_id",
                                                 "human")
    # end gracefully
    stop_cluster()

def main(config_file):
    """
    this assumes that we are keeping the same order of the files
    throughout
    """
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    input_dict = config["input"]
    curr_files = _make_current_files(input_dict.keys())
    input_meta = input_dict.values()

    for stage in config["run"]:
        if stage == "fastqc":
            _emit_stage_message(stage, curr_files)
            fastqc_config = _get_stage_config(config, stage)
            fastqc_args = zip(*product(curr_files, [fastqc_config],
                                       [config]))
            view.map(fastqc.run, *fastqc_args)

        if stage == "cutadapt":
            _emit_stage_message(stage, curr_files)
            cutadapt_config = _get_stage_config(config, stage)
            cutadapt_args = zip(*product(curr_files, [cutadapt_config],
                                         [config]))
            cutadapt_outputs = view.map(cutadapt_tool.run, *cutadapt_args)
            curr_files = _make_current_files(cutadapt_outputs)

        if stage == "tophat":
            _emit_stage_message(stage, curr_files)
            tophat_config = _get_stage_config(config, stage)
            tophat_args = zip(*product(curr_files, [None], [config["ref"]],
                                       ["tophat"], [config]))
            tophat_outputs = view.map(tophat.run_with_config, *tophat_args)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "htseq-count":
            _emit_stage_message(stage, curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            combined_out = os.path.join(config["dir"]["results"], stage,
                                        "all_combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs, None,
                                                      out_file=combined_out)

        if stage == "rseqc":
            _emit_stage_message(stage, curr_files)
            rseqc_config = _get_stage_config(config, stage)
            rseq_args = zip(*product(curr_files, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
                                         ["ensembl_transcript_id"],
                                         ["mouse"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            view.map(rseqc.RPKM_saturation, *RPKM_args)

        if stage == "coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = prepare_ref_file(config["stage"][stage]["ref"], config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"])
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)

        if stage == "deseq":
            _emit_stage_message(stage, curr_files)
            deseq_config = _get_stage_config(config, stage)
            out_dir = os.path.join(config["dir"]["results"], stage)
            safe_makedir(out_dir)
            for test in deseq_config["tests"]:
                indexes = [_find_file_index_for_test(input_meta, condition)
                           for condition in test]
                files = [htseq_outputs[x] for x in indexes]
                conditions = [input_meta[x]["condition"] for x in indexes]
                combined_out = os.path.join(out_dir,
                                            "_".join(conditions) +
                                            "_combined.counts")
                logger.info("Combining %s to %s." % (files, combined_out))
                count_file = htseq_count.combine_counts(
                    files, None, out_file=combined_out)
                out_file = os.path.join(out_dir, "_".join(conditions) +
                                        "_deseq.txt")
                logger.info("Running deseq on %s with conditions %s "
                            "and writing to %s" % (count_file, conditions,
                                                   out_file))
                view.map(deseq.run, [count_file], [conditions], [out_file])
                #deseq.run(count_file, conditions, out_file=out_file)

    # end gracefully
    stop_cluster()

def mappable_function(x):
    logger.error("This is an error.")
    logger.info("This is info.")
    return x ** 10

def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    input_dir = config["input_dir"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files,
                         [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files

    logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)
            logger.info("Output of cutadapt: %s." % (curr_files))

        if stage == "bowtie":
            logger.info("Running Bowtie on %s." % (curr_files))
            bowtie = Bowtie(config)
            bowtie_outputs = view.map(bowtie, curr_files)
            bamfiles = view.map(sam.sam2bam, bowtie_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            # keep the sorted bam files around for the rseqc stage below
            final_bamfiles = bamsort
            curr_files = bowtie_outputs

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (curr_files))
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
            annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
                                         ["ensembl_gene_id"], ["human"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = bowtie_outputs

    # end gracefully
    stop_cluster()

def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files,
                         [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files

    logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            sortsam = view.map(sam.coordinate_sort_sam, tophat_outputs,
                               [config] * len(tophat_outputs))
            bamfiles = view.map(sam.sam2bam, sortsam)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            down_args = zip(*product(final_bamfiles, [40000000]))
            down_bam = view.map(sam.downsample_bam, *down_args)
            view.map(rseqc.genebody_coverage, down_bam,
                     [config] * len(down_bam))
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
            annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
                                         ["ensembl_gene_id"], ["human"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs

    # end gracefully
    stop_cluster()

def _emit_stage_message(stage, curr_files):
    logger.info("Running %s on %s" % (stage, curr_files))

def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    setup_logging(config)
    start_cluster(config)
    # after the cluster is up, import the view
    from bipy.cluster import view

    input_files = config["input"]
    results_dir = config["dir"]["results"]

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    curr_files = input_files

    ## qc steps
    for stage in config["run"]:
        if stage == "fastqc":
            # run the basic fastqc
            logger.info("Running %s on %s" % (stage, str(curr_files)))
            fastqc_config = config["stage"][stage]
            fastqc_outputs = view.map(fastqc.run, curr_files,
                                      [fastqc_config] * len(curr_files),
                                      [config] * len(curr_files))
            # this does nothing for now, not implemented yet
            summary_file = _combine_fastqc(fastqc_outputs)

        if stage == "trim":
            logger.info("Trimming poor quality ends "
                        "from %s" % (str(curr_files)))
            nlen = len(curr_files)
            min_length = str(config["stage"][stage].get("min_length", 20))
            # trim low quality ends of reads
            # do this dirty for now
            out_dir = os.path.join(results_dir, "trimmed")
            safe_makedir(out_dir)
            out_files = [append_stem(os.path.basename(x), "trim")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            # XXX remove the magic number for the length of the
            # minimum read to keep
            out_files = view.map(sickle.run, curr_files,
                                 ["se"] * nlen,
                                 ["sanger"] * nlen,
                                 [min_length] * nlen,
                                 out_files)
            curr_files = out_files

        if stage == "tagdust":
            input_files = curr_files
            # remove tags matching the other miRNA tested
            logger.info("Running %s on %s." % (stage, input_files))
            tagdust_config = config["stage"][stage]
            tagdust_outputs = view.map(tagdust.run, input_files,
                                       [tagdust_config] * len(input_files),
                                       [config] * len(input_files))
            curr_files = [x[0] for x in tagdust_outputs]

        if stage == "filter_length":
            # filter out reads below or above a certain length
            filter_config = config["stage"][stage]
            min_length = filter_config.get("min_length", 0)
            max_length = filter_config.get("max_length", MAX_READ_LENGTH)

            # length predicate
            def length_filter(x):
                return min_length < len(x.seq) < max_length

            # filter the input reads based on length
            # parallelizing this doesn't seem to work
            # ipython can't accept closures as an argument to view.map()
            """
            filtered_fastq = view.map(filter_seqio, tagdust_outputs,
                                      [lf] * len(tagdust_outputs),
                                      ["filt"] * len(tagdust_outputs),
                                      ["fastq"] * len(tagdust_outputs))"""
            out_files = [append_stem(os.path.basename(input_file[0]),
                                     "filt")
                         for input_file in tagdust_outputs]
            out_dir = os.path.join(config["dir"]["results"],
                                   "length_filtered")
            safe_makedir(out_dir)
            out_files = [os.path.join(out_dir, x) for x in out_files]
            filtered_fastq = [filter_seqio(x[0], length_filter, y, "fastq")
                              for x, y in zip(tagdust_outputs, out_files)]
            curr_files = filtered_fastq

        if stage == "count_ends":
            logger.info("Compiling nucleotide counts at 3' and 5' ends.")

            # count the nucleotide at the end of each read
            def count_ends(x, y):
                """ keeps a running count of an arbitrary set of keys
                during the reduce step """
                x[y] = x.get(y, 0) + 1
                return x

            def get_3prime_end(x):
                return str(x.seq[-1])

            def get_5prime_end(x):
                return str(x.seq[0])

            def output_counts(end_function, count_file):
                # if the count_file already exists, skip
                outdir = os.path.join(config["dir"]["results"], stage)
                safe_makedir(outdir)
                count_file = os.path.join(outdir, count_file)
                if os.path.exists(count_file):
                    return count_file
                # outputs a tab file of the counts at the end
                # of the fastq files
                counts = [reduce(count_ends,
                                 apply_seqio(x, end_function,
                                             kind="fastq"), {})
                          for x in curr_files]
                df = pd.DataFrame(counts,
                                  index=map(_short_name, curr_files))
                df = df.astype(float)
                total = df.sum(axis=1)
                df = df.div(total, axis=0)
                df["total"] = total
                df.to_csv(count_file, sep="\t")
                return count_file

            output_counts(get_3prime_end, "3prime_counts.tsv")
            output_counts(get_5prime_end, "5prime_counts.tsv")

        if stage == "tophat":
            tophat_config = config["stage"][stage]
            logger.info("Running tophat on %s" % (str(curr_files)))
            nlen = len(curr_files)
            pair_file = None
            ref_file = tophat_config["annotation"]
            out_base = os.path.join(results_dir, "mirna")
            align_dir = os.path.join(results_dir, "tophat")
            tophat_files = view.map(tophat.align,
                                    curr_files,
                                    [pair_file] * nlen,
                                    [ref_file] * nlen,
                                    [out_base] * nlen,
                                    [align_dir] * nlen,
                                    [config] * nlen)
            curr_files = tophat_files

        if stage == "novoalign":
            logger.info("Running novoalign on %s" % (str(curr_files)))
            # align
            ref = config["genome"]["file"]
            novoalign_config = config["stage"][stage]
            aligned_outputs = view.map(novoalign.run, curr_files,
                                       [ref] * len(curr_files),
                                       [novoalign_config] * len(curr_files),
                                       [config] * len(curr_files))
            # convert sam to bam, sort and index
            picard = BroadRunner(config["program"]["picard"], None, {})
            bamfiles = view.map(picardrun.picard_formatconverter,
                                [picard] * len(aligned_outputs),
                                aligned_outputs)
            sorted_bf = view.map(picardrun.picard_sort,
                                 [picard] * len(bamfiles),
                                 bamfiles)
            view.map(picardrun.picard_index, [picard] * len(sorted_bf),
                     sorted_bf)
            # these files are the new starting point for the downstream
            # analyses, so copy them over into the data dir and set
            # them to read only
            #data_dir = os.path.join(config["dir"]["data"], stage)
            #safe_makedir(data_dir)
            #view.map(shutil.copy, sorted_bf, [data_dir] * len(sorted_bf))
            #new_files = [os.path.join(data_dir, x) for x in
            #             map(os.path.basename, sorted_bf)]
            #[os.chmod(x, stat.S_IREAD | stat.S_IRGRP) for x in new_files]
            # index the bam files for later use
            #view.map(picardrun.picard_index, [picard] * len(new_files),
            #         new_files)
            curr_files = sorted_bf

        if stage == "new_coverage":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            nrun = len(curr_files)
            ref = blastn.prepare_ref_file(config["stage"][stage]["ref"],
                                          config)
            ribo = config["stage"][stage]["ribo"]
            picard = BroadRunner(config["program"]["picard"], None, {})
            out_dir = os.path.join(results_dir, "new_coverage")
            safe_makedir(out_dir)
            out_files = [replace_suffix(os.path.basename(x), "metrics")
                         for x in curr_files]
            out_files = [os.path.join(out_dir, x) for x in out_files]
            out_files = view.map(picardrun.picard_rnaseq_metrics,
                                 [picard] * nrun, curr_files, [ref] * nrun,
                                 [ribo] * nrun, out_files)
            curr_files = out_files

        if stage == "coverage":
            gtf = blastn.prepare_ref_file(config["annotation"], config)
            logger.info("Calculating coverage of features in %s for %s" %
                        (gtf, str(sorted_bf)))
            out_files = [replace_suffix(x, "counts.bed")
                         for x in sorted_bf]
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            logger.info(out_files)
            out_files = [os.path.join(out_dir, os.path.basename(x))
                         for x in out_files]
            logger.info(out_files)
            view.map(bedtools.count_overlaps, sorted_bf,
                     [gtf] * len(sorted_bf), out_files)

        if stage == "htseq-count":
            nfiles = len(curr_files)
            htseq_config = _get_stage_config(config, stage)
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     aligned_outputs,
                                     [config] * nfiles,
                                     [stage] * nfiles)
            column_names = _get_short_names(input_files)
            logger.info("Column names: %s" % (column_names))
            out_file = os.path.join(config["dir"]["results"], stage,
                                    "combined.counts")
            combined_out = htseq_count.combine_counts(htseq_outputs,
                                                      column_names,
                                                      out_file)

        if stage == "bedtools_intersect":
            bedfiles = config["stage"]["bedtools_intersect"].get("bed",
                                                                 None)
            out_dir = os.path.join(results_dir, stage)
            safe_makedir(out_dir)
            for bedfile in bedfiles:
                bedbase, bedext = os.path.splitext(bedfile)
                out_files = [remove_suffix(x) for x in sorted_bf]
                out_files = [os.path.join(out_dir, os.path.basename(x))
                             for x in out_files]
                out_files = ["_vs_".join([x, os.path.basename(bedbase)])
                             for x in out_files]
                out_files = [".".join([x, "bam"]) for x in out_files]
                test_out = map(bedtools.intersectbam2bed, sorted_bf,
                               [bedfile] * len(sorted_bf),
                               [False] * len(sorted_bf), out_files)
                count_files = [replace_suffix(x, "stats")
                               for x in out_files]
                map(write_ratios, sorted_bf, out_files, count_files)

        if stage == "piranha":
            piranha_runner = piranha.PiranhaStage(config)
            out_files = view.map(piranha_runner, curr_files)

    stop_cluster()

def main(config_file):
    # load yaml config file
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # setup logging
    setup_logging(config)
    from bipy.log import logger
    # start cluster
    start_cluster(config)
    from bipy.cluster import view

    found = sh.find(config["dir"]["data"], "-name", "Variations")
    var_dirs = [str(x).strip() for x in found]
    logger.info("Var_dirs: %s" % (var_dirs))
    in_dirs = map(os.path.dirname, var_dirs)
    logger.info("in_dirs: %s" % (in_dirs))
    # XXX for testing, only load a few directories
    #curr_files = in_dirs[0:5]
    curr_files = in_dirs

    # run the illumina fixer
    logger.info("Running illumina fixer on %s." % (curr_files))
    illf_class = STAGE_LOOKUP.get("illumina_fixer")
    illf = illf_class(config)
    curr_files = view.map(illf, curr_files)

    # sort the vcf files
    def sort_vcf(in_file):
        from bipy.utils import append_stem
        from bcbio.distributed.transaction import file_transaction
        from bcbio.utils import file_exists
        import sh

        out_file = append_stem(in_file, "sorted")
        if file_exists(out_file):
            return out_file
        with file_transaction(out_file) as tmp_out_file:
            sh.vcf_sort(in_file, _out=tmp_out_file)
        return out_file

    # combine
    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]

    # break the VCF files up by chromosome for speed
    logger.info("Breaking up %s by chromosome." % (curr_files))
    breakvcf_class = STAGE_LOOKUP.get("breakvcf")
    breakvcf = breakvcf_class(config)
    curr_files = view.map(breakvcf, curr_files)

    # run VEP on the separate files in parallel
    logger.info("Running VEP on %s." % (curr_files))
    vep_class = STAGE_LOOKUP.get("vep")
    vep = vep_class(config)
    curr_files = view.map(vep, list(flatten(curr_files)))
    curr_files = filter(file_exists, curr_files)

    # sort the vcf files
    logger.info("Sorting %s." % (curr_files))
    curr_files = view.map(sort_vcf, curr_files)

    # don't run the rest of this in parallel, so take the cluster down
    stop_cluster()

    out_file = os.path.join(config["dir"].get("results", "results"),
                            "geminiloader", "all_combined.vep.vcf")
    logger.info("Combining files %s into %s." % (curr_files, out_file))
    if file_exists(out_file):
        curr_files = [out_file]
    else:
        curr_files = [genotype.combine_variant_files(curr_files, out_file,
                                                     config["ref"]["fasta"],
                                                     config)]

    # load the files into gemini, not in parallel
    logger.info("Loading %s into gemini." % (curr_files))
    gemini_class = STAGE_LOOKUP.get("geminiloader")
    geminiloader = gemini_class(config)
    curr_files = map(geminiloader, curr_files)
    logger.info("Run complete.")

def main(config_file):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)

    # make the needed directories
    map(safe_makedir, config["dir"].values())

    # specific for project
    input_dir = config["dir"]["data"]
    logger.info("Loading files from %s" % (input_dir))
    input_files = list(locate("*.fq", input_dir))
    input_files += list(locate("*.fastq", input_dir))
    logger.info("Input files: %s" % (input_files))

    results_dir = config["dir"]["results"]
    safe_makedir(results_dir)

    # make the stage repository
    repository = StageRepository(config)
    logger.info("Stages found: %s" % (repository.plugins))

    if config.get("test_pipeline", False):
        logger.info("Running a test pipeline on a subset of the reads.")
        results_dir = os.path.join(results_dir, "test_pipeline")
        config["dir"]["results"] = results_dir
        safe_makedir(results_dir)
        curr_files = map(make_test, input_files,
                         [config] * len(input_files))
        logger.info("Converted %s to %s. " % (input_files, curr_files))
    else:
        curr_files = input_files

    logger.info("Running RNASeq alignment pipeline on %s." % (curr_files))
    for stage in config["run"]:
        if stage == "fastqc":
            logger.info("Running fastqc on %s." % (curr_files))
            stage_runner = FastQC(config)
            view.map(stage_runner, curr_files)

        if stage == "cutadapt":
            curr_files = combine_pairs(curr_files)
            logger.info("Running cutadapt on %s." % (curr_files))
            stage_runner = Cutadapt(config)
            curr_files = view.map(stage_runner, curr_files)

        if stage == "tophat":
            logger.info("Running Tophat on %s." % (curr_files))
            #tophat = repository["tophat"](config)
            tophat = Tophat(config)
            tophat_outputs = view.map(tophat, curr_files)
            bamfiles = view.map(sam.sam2bam, tophat_outputs)
            bamsort = view.map(sam.bamsort, bamfiles)
            view.map(sam.bamindex, bamsort)
            final_bamfiles = bamsort
            curr_files = tophat_outputs

        if stage == "disambiguate":
            logger.info("Disambiguating %s." % (curr_files))
            disambiguate = repository[stage](config)
            view.map(disambiguate, curr_files)

        if stage == "htseq-count":
            logger.info("Running htseq-count on %s." % (bamfiles))
            name_sorted = view.map(sam.bam_name_sort, bamfiles)
            curr_files = view.map(sam.bam2sam, name_sorted)
            htseq_args = zip(*product(curr_files, [config], [stage]))
            htseq_outputs = view.map(htseq_count.run_with_config,
                                     *htseq_args)
            htseq_count.combine_counts(htseq_outputs)

        if stage == "rnaseq_metrics":
            logger.info("Calculating RNASeq metrics on %s." % (curr_files))
            #coverage = repository[stage](config)
            coverage = RNASeqMetrics(config)
            view.map(coverage, curr_files)

        if stage == "rseqc":
            logger.info("Running rseqc on %s." % (curr_files))
            #rseq_args = zip(*product(curr_files, [config]))
            rseq_args = zip(*product(final_bamfiles, [config]))
            view.map(rseqc.bam_stat, *rseq_args)
            view.map(rseqc.genebody_coverage, *rseq_args)
            view.map(rseqc.junction_annotation, *rseq_args)
            view.map(rseqc.junction_saturation, *rseq_args)
            RPKM_args = zip(*product(final_bamfiles, [config]))
            RPKM_count_out = view.map(rseqc.RPKM_count, *RPKM_args)
            RPKM_count_fixed = view.map(rseqc.fix_RPKM_count_file,
                                        RPKM_count_out)
            """
            annotate_args = zip(*product(RPKM_count_fixed, ["gene_id"],
                                         ["ensembl_gene_id"], ["human"]))
            view.map(annotate.annotate_table_with_biomart, *annotate_args)
            """
            view.map(rseqc.RPKM_saturation, *rseq_args)
            curr_files = tophat_outputs

    # end gracefully
    stop_cluster()

def __call__(self, in_file, MAX_RECORDS=1000000):
    logger.info("Detecting format of %s" % (in_file))
    quality = self.run(in_file, MAX_RECORDS)
    logger.info("Detected quality format of %s in %s." % (quality,
                                                          in_file))
    return quality

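# Usage sketch for DetectFastqFormat (added example). Assumes the class
# can be instantiated without arguments, which is not shown above;
# "reads.fastq" is a hypothetical input file.
def _example_detect_fastq_format():
    detector = DetectFastqFormat()
    return detector("reads.fastq")
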