def start_cluster(cluster_config):
    """
    Start the cluster described by ``cluster_config["cluster"]`` and publish
    the module-level ``cluster``, ``client``, ``view`` and ``direct_view``
    handles.

    After starting, ``cluster.is_up()`` is polled with a pause of
    ``cluster.delay`` between checks. Once the accumulated pause exceeds the
    ``"timeout"`` entry of the cluster section (``DEFAULT_CLUSTER_TIMEOUT``
    when absent) the cluster is stopped and the process exits with -1.

    When the cluster is up, a shallow copy of ``cluster_config`` with
    ``"engine_log"`` set to True is pushed to the direct view as ``config``
    and logging is set up there through ``bipy.log.setup_logging``.
    """
    global cluster, view, client, direct_view

    cluster = Cluster(**cluster_config["cluster"])
    startup_message = "Starting the cluster with %d nodes." % (cluster.n)
    logger.info(startup_message)
    cluster.start()
    sleep(cluster.delay)

    # keep polling until the cluster reports that it is up, or give up once
    # the accumulated wait passes the configured timeout
    waited = 0
    while not cluster.is_up():
        sleep(cluster.delay)
        waited += cluster.delay
        timeout = cluster_config["cluster"].get("timeout",
                                                DEFAULT_CLUSTER_TIMEOUT)
        if waited > timeout:
            logger.error("Cluster startup timed out.")
            cluster.stop()
            exit(-1)

    logger.info("Cluster up.")
    client = cluster.client()
    view = cluster.view()
    direct_view = cluster.direct_view()

    # hand the configuration to the direct view, flagged for engine logging
    engine_config = cluster_config.copy()
    engine_config["engine_log"] = True
    direct_view['config'] = engine_config
    for statement in ('from bipy.log import setup_logging',
                      'setup_logging(config)'):
        direct_view.execute(statement)
def _get_gtf(config):
    """
    Return the GTF path stored under ``config["annotation"]["file"]``.

    If the entry is missing/empty or ``file_exists`` rejects it, an error is
    logged and the process exits with status 1.
    """
    gtf = config["annotation"].get("file")
    if gtf and file_exists(gtf):
        return gtf
    logger.error("genebody_coverage needs a GTF file passed to it.")
    exit(1)
    return gtf
def start_cluster(cluster_config):
    """
    Bring up the cluster configured in ``cluster_config["cluster"]`` and set
    the module globals ``cluster``, ``client``, ``view`` and ``direct_view``.

    Startup is awaited by checking ``cluster.is_up()`` and pausing for
    ``cluster.delay`` between checks. If the summed pauses exceed the
    ``"timeout"`` value of the cluster section (falling back to
    ``DEFAULT_CLUSTER_TIMEOUT``), the cluster is stopped and the process
    exits with -1.

    Afterwards a shallow copy of ``cluster_config`` carrying
    ``"engine_log": True`` is assigned to ``direct_view["config"]`` and
    ``setup_logging(config)`` from ``bipy.log`` is executed on the direct
    view.
    """
    global cluster, view, client, direct_view

    cluster_settings = cluster_config["cluster"]
    cluster = Cluster(**cluster_settings)
    logger.info("Starting the cluster with %d nodes." % (cluster.n))
    cluster.start()
    sleep(cluster.delay)

    # wait for cluster.is_up(); abort if it takes longer than the timeout
    elapsed = 0
    while True:
        if cluster.is_up():
            break
        sleep(cluster.delay)
        elapsed = elapsed + cluster.delay
        limit = cluster_config["cluster"].get("timeout",
                                              DEFAULT_CLUSTER_TIMEOUT)
        if elapsed > limit:
            logger.error("Cluster startup timed out.")
            cluster.stop()
            exit(-1)

    logger.info("Cluster up.")
    client = cluster.client()
    view = cluster.view()
    direct_view = cluster.direct_view()

    # share the configuration with the direct view and set up logging there
    engine_config = cluster_config.copy()
    engine_config["engine_log"] = True
    direct_view["config"] = engine_config
    direct_view.execute("from bipy.log import setup_logging")
    direct_view.execute("setup_logging(config)")
def _make_current_files(curr_files):
    """
    Check every entry of ``curr_files`` with ``file_exists`` and return the
    list unchanged.

    The first entry that fails the check is reported through the logger and
    the process exits with status 1.
    """
    for path in curr_files:
        if file_exists(path):
            continue
        logger.error("%s does not exist or is size 0. Aborting." % (path))
        exit(1)
    return curr_files
def _add_entry(d, v):
    """
    File the path ``v`` under the "PbN" or "Pb" key of ``d`` and return ``d``.

    The group is chosen from the part of ``v``'s basename that precedes the
    first ``delimiter`` (a name defined outside this function). "PbN" is
    tested before "Pb" because the latter is a substring of the former. If
    neither label matches, an error is logged and the process exits with -1.
    """
    prefix = os.path.basename(v).split(delimiter)[0]
    for label in ("PbN", "Pb"):
        if label in prefix:
            # build a new list instead of appending in place, so a list
            # already stored under the key is left untouched
            d[label] = d.get(label, []) + [v]
            return d
    logger.error("Error grouping by cell type")
    exit(-1)
    return d
def _cut_file(self, in_file):
    """
    Run cutadapt on a single file and return the path of the trimmed file.

    The output path comes from ``self.in2trimmed(in_file)``; if
    ``file_exists`` already accepts it, it is returned without running
    anything. The quality format is taken from ``self.quality_format`` or,
    when that is unset, from ``self._detect_fastq_format(in_file)``.
    "sanger" maps to quality base 33 and "illumina" to 64; any other value
    is logged as an error and the process exits with status 1.

    When the stage option ``trim_polya`` is true (the default), adapters are
    trimmed into a temporary file first and the polyA tail (and the result
    of ``self._rc_adapters`` on it) is trimmed in a second pass; otherwise a
    single adapter-trimming pass is run.
    """
    adapters = self._get_adapters(self.chemistry)
    out_file = self.in2trimmed(in_file)
    if file_exists(out_file):
        return out_file
    cutadapt = sh.Command(self.stage_config.get("program", "cutadapt"))

    quality_format = self.quality_format
    if not quality_format:
        quality_format = self._detect_fastq_format(in_file)
    if quality_format == "sanger":
        logger.info("Quality format detected as sanger.")
        quality_base = 33
    elif quality_format == "illumina":
        logger.info("Quality format set to illumina 1.5/1.3")
        quality_base = 64
    else:
        # the message has a %s placeholder, so the offending value must be
        # supplied for it to be reported
        logger.error("Quality format could not be detected. Quality "
                     "Detected or set as %s. It should be illumina "
                     "or sanger.", quality_format)
        exit(1)

    # if we want to trim the polya tails we have to first remove
    # the adapters and then trim the tail
    if self.stage_config.get("trim_polya", True):
        # the context manager removes the intermediate file as soon as both
        # passes are done, even if one of them raises
        with tempfile.NamedTemporaryFile(suffix=".fastq",
                                         dir=self.out_dir) as temp_cut:
            # trim off adapters
            cutadapt(in_file, self.options, adapters,
                     quality_base=quality_base, _out=temp_cut.name)
            with file_transaction(out_file) as temp_out:
                polya = ADAPTERS.get("polya")
                # trim off polya
                cutadapt(temp_cut.name, self.options, "-a", polya,
                         "-a", self._rc_adapters(polya),
                         quality_base=quality_base, _out=temp_out)
        return out_file

    with file_transaction(out_file) as temp_out:
        # pass the quality base here as well, so this path treats qualities
        # the same way as the polyA path above
        cutadapt(in_file, self.options, adapters,
                 quality_base=quality_base, _out=temp_out)
    return out_file
def _fetch_chrom_sizes(config):
    """
    Return the path of the chromosome size file for the configured genome,
    creating it with the UCSC ``fetchChromSizes`` program when needed.

    The genome name is read from ``config["annotation"]["name"]`` and the
    file is placed in ``_results_dir(config)`` as ``<name>.sizes``. If
    ``file_exists`` already accepts that file, it is returned as is.

    An error is logged and the process exits with status 1 when the program
    is not available, when ``annotation`` or its ``name`` entry is missing
    from the configuration, or when the size file is still missing after
    the program has run.
    """
    PROGRAM = "fetchChromSizes"

    if not program_exists(PROGRAM):
        # a space follows the URL so it is not fused with the next words
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/ "
                     "to download it." % (PROGRAM))
        exit(1)

    if "annotation" not in config:
        logger.error("'annotation' must be in the yaml file. See example "
                     " configuration files")
        exit(1)

    if "name" not in config["annotation"]:
        logger.error("'name' must be in the yaml file under "
                     " 'annotation'. See example configuration files.")
        exit(1)

    genome = config["annotation"]["name"]
    chrom_size_file = os.path.join(_results_dir(config), genome + ".sizes")
    if file_exists(chrom_size_file):
        return chrom_size_file

    with file_transaction(chrom_size_file) as tmp_chrom_size_file:
        sh.fetchChromSizes(genome, _out=tmp_chrom_size_file)

    # check the result explicitly rather than assume the run produced it
    if not file_exists(chrom_size_file):
        logger.error("chromosome size file does not exist. Check "
                     "'annotation': 'name' to make sure it is valid.")
        exit(1)
    return chrom_size_file
def _validate_config(in_file, stage_config, config):
    """
    Validate the assumptions about the config file needed to run the
    program.

    A missing ``ref`` entry is logged and the process exits with status 1.
    The remaining checks -- the reference FASTA (``config["ref"] + ".fa"``),
    ``in_file``, ``config["gtf"]`` and ``stage_config["program"]`` -- only
    log an error when ``file_exists`` rejects the path; they do not stop the
    process.
    """
    if "ref" not in config:
        logger.error("ref: must appear in the config file")
        exit(1)

    # report the path that was actually checked; the config is only known to
    # contain "ref" at this point, so indexing another key here could raise
    # KeyError instead of logging the problem
    ref_fasta = config["ref"] + ".fa"
    if not file_exists(ref_fasta):
        logger.error("%s not found, aborting." % (ref_fasta))

    if not file_exists(in_file):
        logger.error("%s not found, aborting." % (in_file))

    if not file_exists(config["gtf"]):
        logger.error("%s not found, aborting." % (config["gtf"]))

    if not file_exists(stage_config["program"]):
        logger.error("%s not found, aborting." % (stage_config["program"]))
def run_with_config(input_file, config, stage, out_file=None):
    """
    Run the stage named ``stage`` on ``input_file`` using settings from
    ``config`` and return the value returned by ``run``.

    Options are read from ``config["stage"][stage]["options"]`` (an empty
    list when absent). When ``out_file`` is not given, it is placed in a
    directory named after the stage under ``config["dir"]["results"]`` and
    that directory is created. The reference is obtained from
    ``prepare_ref_file``; if the configuration has no ``annotation``
    section, an error is logged and the process exits with status 1.
    """
    options = config["stage"][stage].get("options", [])

    if out_file is None:
        results_root = config["dir"].get("results")
        out_dir = os.path.join(results_root, stage)
        out_file = os.path.join(out_dir, _get_outfilename(input_file))
        safe_makedir(out_dir)

    if "annotation" not in config:
        logger.error("annotation must appear in the config file, see example "
                     "configuration files.")
        exit(1)

    reference = prepare_ref_file(config["annotation"], config)
    return run(input_file, reference, options, out_file)
def setup_pipeline(config):
    """
    Create the output directories and perform some minor validation on the
    configuration.

    If ``config`` has no ``dir`` section, an error is logged and the process
    exits with -1. Otherwise the configuration is passed through
    ``_setup_config``, ``safe_makedir`` is called on every value of its
    ``dir`` section, the configuration is written out with
    ``_write_config``, and the updated configuration is returned.
    """
    # make initial directories
    if "dir" not in config:
        logger.error("'dir' must be in config file, see example "
                     " configurations.")
        exit(-1)
    config = _setup_config(config)
    # loop explicitly: map() is lazy on Python 3, so an unconsumed
    # map(safe_makedir, ...) would never create the directories
    for directory in config["dir"].values():
        safe_makedir(directory)
    _write_config(config)
    return config
def annotate_table_with_biomart(in_file, join_column, filter_type,
                                organism, out_file=None):
    """
    Annotate the table in ``in_file`` through the R biomaRt package and
    return the path of the annotated table.

    join_column is the column of the table the lookups are performed on,
    filter_type describes the type of join_column (see the getBM
    documentation in R for details) and organism is the english name of the
    organism; it must be a key of ORG_TO_ENSEMBL, otherwise an error is
    logged and the process exits with status 1.

    When out_file is not given, it is derived from in_file with
    append_stem(in_file, "annotated"). An already existing out_file is
    returned without doing any work.

    example:
    annotate_table_with_biomart(in_file, "id", "ensembl_gene_id", "human")
    """
    if organism not in ORG_TO_ENSEMBL:
        logger.error("organism not supported")
        exit(1)

    logger.info("Annotating %s." % (organism))
    out_file = out_file or append_stem(in_file, "annotated")
    if os.path.exists(out_file):
        return out_file

    # expose the inputs to R under the names the script below refers to
    r = robjects.r
    r_bindings = (
        ('join_column', join_column),
        ('in_file', in_file),
        ('out_file', out_file),
        ('ensembl_gene', ORG_TO_ENSEMBL[organism]["gene_ensembl"]),
        ('gene_symbol', ORG_TO_ENSEMBL[organism]["gene_symbol"]),
        ('filter_type', filter_type),
    )
    for r_name, value in r_bindings:
        r.assign(r_name, value)

    # use biomaRt to look up the annotation and merge it into the table
    r('''
    library(biomaRt)
    ensembl = useMart("ensembl", dataset = ensembl_gene)
    d = read.table(in_file, header=TRUE)
    a = getBM(attributes=c(filter_type, gene_symbol, "description"),
              filters=c(filter_type), values=d[,join_column],
              mart=ensembl)
    m = merge(d, a, by.x=join_column, by.y=filter_type)
    write.table(m, out_file, quote=FALSE, row.names=FALSE, sep="\t")
    ''')
    return out_file
def wig2bigwig(wiggle_file, chrom_size_file, out_file):
    """
    Convert a wiggle file to a bigwig file using the UCSC ``wigToBigWig``
    tool and return ``out_file``.

    If the tool is not available, an error is logged and the process exits
    with status 1. If ``file_exists`` already accepts ``out_file``, it is
    returned without running the conversion.
    """
    PROGRAM = "wigToBigWig"
    if not program_exists(PROGRAM):
        # a space follows the URL so it is not fused with the next words
        logger.error("%s is not in the path or is not executable. Make sure "
                     "it is installed or go to "
                     "http://hgdownload.cse.ucsc.edu/admin/exe/ "
                     "to download it." % (PROGRAM))
        exit(1)

    if file_exists(out_file):
        return out_file

    wigToBigWig = sh.Command(which(PROGRAM))
    with file_transaction(out_file) as tx_out_file:
        wigToBigWig(wiggle_file, chrom_size_file, tx_out_file)
    return out_file
def scan(self, plugin_dir=None):
    """
    Import every ``.py`` file found in ``PluginDirectory`` (and in
    ``plugin_dir``, when given) and pass each imported module to
    ``self.scan_module``.

    ``plugin_dir`` is added to ``sys.path`` before its files are listed. A
    module that raises while being imported is reported through the logger
    together with a printed traceback, and scanning continues with the next
    one.
    """
    files = os.listdir(PluginDirectory)
    if plugin_dir:
        # repeated scans of the same directory should not keep growing
        # sys.path
        if plugin_dir not in sys.path:
            sys.path.append(plugin_dir)
        files += os.listdir(plugin_dir)

    # module name is the part of the file name before the first dot
    mods = [fn.split('.')[0] for fn in files if fn.endswith('.py')]

    # build the map of plugins
    for modname in mods:
        try:
            mod = import_(modname, PackagePrefix)
        except Exception:
            # a broken plugin must not stop the scan, but KeyboardInterrupt
            # and SystemExit should still propagate
            logger.error("Error loading plugin: %s" % modname)
            traceback.print_exc()
            continue
        self.scan_module(mod)
def mappable_function(x):
    """
    Log one message at error level and one at info level, then return ``x``
    raised to the tenth power.
    """
    logger.error("This is an error.")
    logger.info("This is info.")
    tenth_power = x ** 10
    return tenth_power
def mappable_function(x):
    """
    Emit an error-level and an info-level log message and return ``x ** 10``.
    """
    for emit, message in ((logger.error, "This is an error."),
                          (logger.info, "This is info.")):
        emit(message)
    return x ** 10