def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config, data):
    """Retrieve reference genome file from Galaxy *.loc file.

    Reads from tool_data_table_conf.xml information for the index if it
    exists, otherwise uses heuristics to find line based on most common setups.

    Args:
        name: Tool name; used as the key into ``alignment.TOOLS``.
        genome_build: Genome build (dbkey) to look up in the location file.
        loc_file: Path of the Galaxy ``*.loc`` file to scan.
        galaxy_dt: Data table description handed to ``_galaxy_loc_iter``.
        need_remap: Forwarded to ``_galaxy_loc_iter`` only; whether the
            result is remapped depends on the tool's ``remap_index_fn``.
        galaxy_config: Mapping providing ``"tool_data_path"``.
        data: Not used by this implementation; kept so the signature
            stays compatible with callers.

    Returns:
        The last matching reference listed for ``genome_build``, passed
        through the tool's ``remap_index_fn`` when one is defined.

    Raises:
        ValueError: If the location file has no entry for ``genome_build``.
    """
    refs = [ref for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
            if dbkey == genome_build]
    remap_fn = alignment.TOOLS[name].remap_index_fn
    if not refs:
        raise ValueError("Did not find genome build %s in bcbio installation: %s" %
                         (genome_build, os.path.normpath(loc_file)))
    # Allow multiple references in a file and use the most recently added.
    cur_ref = refs[-1]
    # Find genome directory and check for packed wf tarballs
    cur_ref_norm = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
    base_dir_i = cur_ref_norm.find("/%s/" % genome_build)
    # str.find returns -1 when the build directory is not part of the path;
    # slicing with -1 would silently chop the last character and point at a
    # nonsense directory, so only look for tarballs on a real match.
    if base_dir_i >= 0:
        base_dir = os.path.join(cur_ref_norm[:base_dir_i], genome_build)
        for tarball in glob.glob(os.path.join(base_dir, "*-wf.tar.gz")):
            cwlutils.unpack_tarballs(tarball, {"dirs": {"work": base_dir}}, use_subdir=False)
    if remap_fn is not None:
        cur_ref = remap_fn(os.path.abspath(cur_ref_norm))
    return cur_ref
def _get_full_paths(config, config_file):
    """Resolve the Galaxy and configuration directories to full paths.

    The ``galaxy_config`` entry of ``config`` is resolved relative to the
    directory containing ``config_file``.

    Returns:
        A ``(galaxy_dir, config_dir)`` tuple, where ``galaxy_dir`` is the
        directory holding the resolved Galaxy configuration file.
    """
    base_dir = utils.add_full_path(os.path.dirname(config_file))
    galaxy_ini = utils.add_full_path(config["galaxy_config"], base_dir)
    galaxy_dir = os.path.dirname(galaxy_ini)
    return galaxy_dir, base_dir
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config, data):
    """Retrieve reference genome file from Galaxy *.loc file.

    Reads from tool_data_table_conf.xml information for the index if it
    exists, otherwise uses heuristics to find line based on most common setups.
    When the build is not listed, the reference is fetched with
    ``download_prepped_genome`` instead.

    Args:
        name: Tool name; used as the key into ``alignment.TOOLS``.
        genome_build: Genome build (dbkey) to look up in the location file.
        loc_file: Path of the Galaxy ``*.loc`` file to scan.
        galaxy_dt: Data table description handed to ``_galaxy_loc_iter``.
        need_remap: Forwarded to ``_galaxy_loc_iter``; afterwards it is
            recomputed from whether the tool defines ``remap_index_fn``.
        galaxy_config: Mapping providing ``"tool_data_path"``.
        data: Forwarded to ``download_prepped_genome`` when a download is
            needed.

    Returns:
        The located (or downloaded) reference, passed through the tool's
        ``remap_index_fn`` when one is defined.
    """
    refs = [ref for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
            if dbkey == genome_build]
    remap_fn = alignment.TOOLS[name].remap_index_fn
    need_remap = remap_fn is not None
    if refs:
        # allow multiple references in a file and use the most recently added
        cur_ref = refs[-1]
    else:
        logger.info("Downloading %s %s from AWS" % (genome_build, name))
        cur_ref = download_prepped_genome(genome_build, data, name, need_remap)
    # Find genome directory and check for packed wf tarballs
    cur_ref_norm = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
    base_dir_i = cur_ref_norm.find("/%s/" % genome_build)
    # A miss yields -1; using it as a slice end would drop the final
    # character of the path and produce a bogus directory, so skip the
    # tarball scan unless the build directory really is in the path.
    if base_dir_i != -1:
        base_dir = os.path.join(cur_ref_norm[:base_dir_i], genome_build)
        for tarball in glob.glob(os.path.join(base_dir, "*-wf.tar.gz")):
            cwlutils.unpack_tarballs(tarball, {"dirs": {"work": base_dir}}, use_subdir=False)
    if need_remap:
        cur_ref = remap_fn(os.path.abspath(cur_ref_norm))
    return cur_ref
def _get_full_paths(fastq_dir, config, config_file):
    """Expand the fastq, Galaxy and configuration directories to full paths.

    ``fastq_dir`` is only expanded when it is truthy. The Galaxy
    configuration file name comes from ``config["galaxy_config"]`` and falls
    back to ``universe_wsgi.ini``; it is resolved relative to the directory
    of ``config_file``.

    Returns:
        A ``(fastq_dir, galaxy_dir, config_dir)`` tuple.
    """
    full_fastq = utils.add_full_path(fastq_dir) if fastq_dir else fastq_dir
    config_dir = utils.add_full_path(os.path.dirname(config_file))
    galaxy_name = config.get("galaxy_config", "universe_wsgi.ini")
    galaxy_path = utils.add_full_path(galaxy_name, config_dir)
    return full_fastq, os.path.dirname(galaxy_path), config_dir
def _get_full_paths(fastq_dir, config, config_file):
    """Expand the fastq, Galaxy and configuration directories to full paths.

    The Galaxy configuration file named by ``config["galaxy_config"]`` is
    resolved relative to the directory of ``config_file``.

    Returns:
        A ``(fastq_dir, galaxy_dir, config_dir)`` tuple.
    """
    full_fastq = utils.add_full_path(fastq_dir)
    base_dir = utils.add_full_path(os.path.dirname(config_file))
    galaxy_path = utils.add_full_path(config["galaxy_config"], base_dir)
    galaxy_dir = os.path.dirname(galaxy_path)
    return full_fastq, galaxy_dir, base_dir
def _get_full_paths(fastq_dir, config, config_file):
    """Return full paths for the fastq, Galaxy and configuration directories.

    A falsy ``fastq_dir`` is passed through unchanged. The Galaxy
    configuration file defaults to ``universe_wsgi.ini`` when ``config`` has
    no ``galaxy_config`` entry, and is looked up relative to the directory
    containing ``config_file``.
    """
    if fastq_dir:
        fastq_dir = utils.add_full_path(fastq_dir)
    base_dir = utils.add_full_path(os.path.dirname(config_file))
    galaxy_ini = utils.add_full_path(
        config.get("galaxy_config", "universe_wsgi.ini"), base_dir)
    galaxy_dir = os.path.dirname(galaxy_ini)
    return fastq_dir, galaxy_dir, base_dir
def get_genome_ref(genome_build, aligner, galaxy_base):
    """Find reference locations for the aligner and samtools in Galaxy loc files.

    Each tool's location file under ``<galaxy_base>/tool-data`` is scanned
    for the first non-comment line whose leading field (after an optional
    ``index`` field) equals ``genome_build``; the last field of that line is
    the reference. The tool's ``remap_index_fn`` is applied when set.

    Returns:
        ``(aligner_ref, samtools_ref)``; the aligner slot is ``None`` when no
        aligner is given, and ``(None, None)`` when ``genome_build`` is empty.

    Raises:
        IndexError: If a location file does not list ``genome_build``.
    """
    if not genome_build:
        return (None, None)
    tool_data_dir = os.path.join(galaxy_base, "tool-data")
    located = []
    for tool in (aligner, "samtools"):
        if tool:
            loc_path = os.path.join(tool_data_dir, _tools[tool].galaxy_loc_file)
            match = None
            with open(loc_path) as loc_handle:
                for raw_line in loc_handle:
                    stripped = raw_line.strip()
                    # skip blank lines and comments
                    if not stripped or raw_line.startswith("#"):
                        continue
                    fields = stripped.split()
                    if fields[0] == "index":
                        fields = fields[1:]
                    if fields[0] == genome_build:
                        match = fields[-1]
                        break
            if match is None:
                raise IndexError("Genome %s not found in %s" % (genome_build, loc_path))
            remap_fn = _tools[tool].remap_index_fn
            if remap_fn:
                match = remap_fn(match)
            located.append(utils.add_full_path(match, tool_data_dir))
        else:
            located.append(None)
    if len(located) == 2:
        return tuple(located)
    raise ValueError("Did not find genome reference for %s %s" % (genome_build, aligner))
def get_refs(genome_build, aligner, galaxy_base):
    """Collect reference base files and their indexes from Galaxy configuration.

    Returns a dictionary keyed by tool name (``samtools`` is stored as
    ``fasta``); each value holds the normalized ``base`` path and the list of
    ``indexes`` globbed alongside it, excluding the base itself. The result
    is empty when ``genome_build`` is not set.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for tool in [x for x in (aligner, "samtools") if x]:
        galaxy_dt = _get_galaxy_data_table(
            tool, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(
            tool, galaxy_dt, galaxy_config["tool_data_path"], galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(tool, genome_build, loc_file,
                                           galaxy_dt, need_remap, galaxy_config)
        base = os.path.normpath(
            utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        # A directory contributes everything inside it; a file contributes
        # everything sharing its extension-less prefix.
        if os.path.isdir(base):
            pattern = os.path.join(base, "*")
        else:
            pattern = "%s*" % utils.splitext_plus(base)[0]
        indexes = glob.glob(pattern)
        try:
            indexes.remove(base)
        except ValueError:
            pass
        out[name_remap.get(tool, tool)] = {"base": base, "indexes": indexes}
    return out
def get_refs(genome_build, aligner, galaxy_base):
    """Look up aligner and samtools reference locations from Galaxy configuration.

    Returns:
        ``(aligner_ref, samtools_ref)`` as full paths; the aligner slot is
        ``None`` when no aligner is given, and ``(None, None)`` is returned
        when ``genome_build`` is empty.
    """
    if not genome_build:
        return (None, None)
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    located = []
    for tool in (aligner, "samtools"):
        if tool:
            galaxy_dt = _get_galaxy_data_table(
                tool, galaxy_config["tool_data_table_config_path"])
            loc_file, need_remap = _get_galaxy_loc_file(
                tool, galaxy_dt, galaxy_config["tool_data_path"], galaxy_base)
            cur_ref = _get_ref_from_galaxy_loc(tool, genome_build, loc_file,
                                               galaxy_dt, need_remap)
            located.append(
                utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        else:
            located.append(None)
    if len(located) == 2:
        return tuple(located)
    raise ValueError("Did not find genome reference for %s %s" % (genome_build, aligner))
def get_refs(genome_build, aligner, galaxy_base, data):
    """Collect reference files and indexes from Galaxy configuration.

    Returns a dictionary keyed by tool name (``samtools`` is stored as
    ``fasta``). An entry gets ``base`` when the reference is an existing
    file and ``indexes`` when any were globbed; samtools file references
    carry no indexes. When a fasta base is present, an ``rtg`` path is
    derived from it. The result is empty when ``genome_build`` is not set.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for tool in [x for x in ("samtools", aligner) if x]:
        galaxy_dt = _get_galaxy_data_table(tool, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(tool, galaxy_dt,
                                                    galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(tool, genome_build, loc_file, galaxy_dt,
                                           need_remap, galaxy_config, data)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        if os.path.isdir(base):
            indexes = glob.glob(os.path.join(base, "*"))
        elif tool == "samtools":
            indexes = []
        else:
            indexes = glob.glob("%s*" % utils.splitext_plus(base)[0])
        entry = {}
        out[name_remap.get(tool, tool)] = entry
        if os.path.exists(base) and os.path.isfile(base):
            entry["base"] = base
        if indexes:
            entry["indexes"] = indexes
    # add additional indices relative to the base
    if tz.get_in(["fasta", "base"], out):
        ref_dir, ref_filebase = os.path.split(out["fasta"]["base"])
        sdf_name = "%s.sdf" % (os.path.splitext(ref_filebase)[0])
        out["rtg"] = os.path.normpath(os.path.join(ref_dir, os.path.pardir, "rtg", sdf_name))
    return out
def move_to_storage(lane, bc_id, fc_dir, select_files, cur_galaxy_files,
                    config, config_file, fname_out=None):
    """Copy selected files into the long term storage directory for Galaxy.

    The storage location sits under the ``library_import_dir`` read from the
    ``app:main`` section of the Galaxy configuration. Files whose names are
    already present in ``cur_galaxy_files`` are skipped; the others are
    copied unless they already exist in storage.

    Returns:
        The storage directory when at least one file is not yet in Galaxy,
        otherwise ``None``.

    Raises:
        ValueError: If the Galaxy configuration lacks ``library_import_dir``.
    """
    galaxy_config_file = utils.add_full_path(config["galaxy_config"],
                                             os.path.dirname(config_file))
    galaxy_conf = ConfigParser.SafeConfigParser({'here': ''})
    galaxy_conf.read(galaxy_config_file)
    try:
        lib_import_dir = galaxy_conf.get("app:main", "library_import_dir")
    except (ConfigParser.NoOptionError, ConfigParser.NoSectionError):
        raise ValueError("Galaxy config %s needs library_import_dir to be set."
                         % galaxy_config_file)
    storage_root = os.path.join(lib_import_dir, "storage")
    storage_dir = _get_storage_dir(fc_dir, lane, bc_id, storage_root, fname_out)
    in_galaxy = [os.path.basename(info['name']) for info in cur_galaxy_files]
    need_upload = False
    for source, target_name in select_files:
        if target_name in in_galaxy:
            continue
        target = os.path.join(storage_dir, target_name)
        if not os.path.exists(target):
            shutil.copy(source, target)
        need_upload = True
    if need_upload:
        return storage_dir
    return None
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config, data):
    """Find the reference for a genome build in a Galaxy *.loc file.

    Entries come from ``_galaxy_loc_iter``; when several match the build the
    most recently added (last) one wins. If none match, the reference is
    fetched with ``download_prepped_genome``. Tools that define a
    ``remap_index_fn`` have it applied to the absolute reference path.
    """
    matches = [path for build, path in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
               if build == genome_build]
    remap_fn = alignment.TOOLS[name].remap_index_fn
    need_remap = remap_fn is not None
    if matches:
        ref_path = matches[-1]
    else:
        logger.info("Downloading %s %s from AWS" % (genome_build, name))
        ref_path = download_prepped_genome(genome_build, data, name, need_remap)
    if not need_remap:
        return ref_path
    assert remap_fn is not None, "%s requires remapping function from base location file" % name
    full_ref = os.path.normpath(utils.add_full_path(ref_path, galaxy_config["tool_data_path"]))
    return remap_fn(os.path.abspath(full_ref))
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config, data):
    """Retrieve reference genome file from Galaxy *.loc file.

    Reads from tool_data_table_conf.xml information for the index if it
    exists, otherwise uses heuristics to find line based on most common setups.
    When the build is not listed and an S3 connection can be opened with
    boto, the reference is fetched with ``_download_prepped_genome``.

    Args:
        name: Tool name; used as the key into ``alignment.TOOLS``.
        genome_build: Genome build (dbkey) to look up in the location file.
        loc_file: Path of the Galaxy ``*.loc`` file to scan.
        galaxy_dt: Data table description handed to ``_galaxy_loc_iter``.
        need_remap: Forwarded to ``_galaxy_loc_iter``; afterwards it is
            recomputed from whether the tool defines ``remap_index_fn``.
        galaxy_config: Mapping providing ``"tool_data_path"``.
        data: Forwarded to ``_download_prepped_genome`` when downloading.

    Returns:
        The located (or downloaded) reference, passed through the tool's
        ``remap_index_fn`` when one is defined.

    Raises:
        ValueError: If the build is not listed and boto cannot be imported
            or the S3 connection fails.
    """
    refs = [ref for dbkey, ref in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
            if dbkey == genome_build]
    remap_fn = alignment.TOOLS[name].remap_index_fn
    need_remap = remap_fn is not None
    if not refs:
        # if we have an S3 connection, try to download
        try:
            import boto
            boto.connect_s3()
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit are not turned into a ValueError.
        except Exception:
            raise ValueError("Could not find reference genome file %s %s" % (genome_build, name))
        logger.info("Downloading %s %s from AWS" % (genome_build, name))
        cur_ref = _download_prepped_genome(genome_build, data, name, need_remap)
    else:
        # allow multiple references in a file and use the most recently added
        cur_ref = refs[-1]
    if need_remap:
        cur_ref = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        cur_ref = remap_fn(os.path.abspath(cur_ref))
    return cur_ref
def move_to_storage(lane, bc_id, fc_dir, select_files, cur_galaxy_files,
                    config, config_file):
    """Copy selected files into the long term storage directory for Galaxy.

    The storage location sits under the ``library_import_dir`` read from the
    ``app:main`` section of the Galaxy configuration. Processing stops at
    the first file whose name is already present in ``cur_galaxy_files``,
    and no upload is requested in that case.

    Returns:
        The storage directory when an upload is needed, otherwise ``None``.

    Raises:
        ValueError: If the Galaxy configuration lacks ``library_import_dir``.
    """
    galaxy_config_file = utils.add_full_path(config["galaxy_config"],
                                             os.path.dirname(config_file))
    galaxy_conf = ConfigParser.SafeConfigParser({'here' : ''})
    galaxy_conf.read(galaxy_config_file)
    try:
        lib_import_dir = galaxy_conf.get("app:main", "library_import_dir")
    except (ConfigParser.NoOptionError, ConfigParser.NoSectionError):
        raise ValueError("Galaxy config %s needs library_import_dir to be set."
                         % galaxy_config_file)
    storage_root = os.path.join(lib_import_dir, "storage")
    storage_dir = _get_storage_dir(fc_dir, lane, bc_id, storage_root)
    in_galaxy = [os.path.basename(info['name']) for info in cur_galaxy_files]
    need_upload = False
    for source, target_name in select_files:
        if target_name in in_galaxy:
            # something is already in Galaxy: stop and skip the upload
            need_upload = False
            break
        target = os.path.join(storage_dir, target_name)
        if not os.path.exists(target):
            shutil.copy(source, target)
        need_upload = True
    if need_upload:
        return storage_dir
    return None
def get_rseqc_graphs(self):
    """Return ``(path, caption, size)`` for each entry of ``GRAPHS`` whose file exists.

    File names from ``self.GRAPHS`` are joined onto ``self._dir`` and
    expanded with ``add_full_path`` before the existence check.
    """
    # Lazy generator keeps the resolve-then-check order per graph.
    resolved = ((add_full_path(os.path.join(self._dir, fname)), caption, size)
                for fname, caption, size in self.GRAPHS)
    return [entry for entry in resolved if file_exists(entry[0])]
def get_refs(genome_build, aligner, galaxy_base, data):
    """Collect reference files and indexes from Galaxy configuration.

    Returns a dictionary keyed by tool name (``samtools`` is stored as
    ``fasta``). An entry gets ``base`` when the reference is an existing
    file and a sorted ``indexes`` list when any were globbed. A ``fastagz``
    entry is added when a ``.gz`` companion of the fasta base exists, and
    ``rtg`` / ``twobit`` paths are derived from the fasta base (``twobit``
    only when the file exists). Empty when ``genome_build`` is not set.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for tool in [x for x in ("samtools", aligner) if x]:
        galaxy_dt = _get_galaxy_data_table(tool, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(tool, galaxy_dt,
                                                    galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(tool, genome_build, loc_file, galaxy_dt,
                                           need_remap, galaxy_config, data)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        if os.path.isdir(base):
            indexes = sorted(glob.glob(os.path.join(base, "*")))
        elif tool == "samtools":
            indexes = []
        else:
            indexes = sorted(glob.glob("%s*" % utils.splitext_plus(base)[0]))
        key = name_remap.get(tool, tool)
        entry = {}
        out[key] = entry
        if os.path.exists(base) and os.path.isfile(base):
            entry["base"] = base
        if indexes:
            entry["indexes"] = indexes
        # For references, add compressed inputs and indexes if they exist
        if key == "fasta" and "base" in entry and os.path.exists(entry["base"] + ".gz"):
            fasta_base = entry["base"]
            candidates = [fasta_base + ".gz.fai",
                          fasta_base + ".gz.gzi",
                          utils.splitext_plus(fasta_base)[0] + ".dict"]
            out[key + "gz"] = {"base": fasta_base + ".gz",
                               "indexes": [x for x in candidates if os.path.exists(x)]}
    # add additional indices relative to the base
    if tz.get_in(["fasta", "base"], out):
        ref_dir, ref_filebase = os.path.split(out["fasta"]["base"])
        out["rtg"] = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "rtg",
                         "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        twobit = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "ucsc",
                         "%s.2bit" % (os.path.splitext(ref_filebase)[0])))
        if os.path.exists(twobit):
            out["twobit"] = twobit
    return out
def add_multiplex_across_lanes(run_items, fastq_dir, fc_name):
    """Add multiplex information to control and non-multiplexed lanes.

    Illumina runs include barcode reads for non-multiplex lanes, and the
    control, when run on a multiplexed flow cell. This checks for this
    situation and adds details to trim off the extra bases.

    Lanes with a single item and no ``barcode_id`` whose fastq size equals
    the size seen on multiplexed lanes are marked with a ``trim`` barcode
    and an all-``N`` sequence of the tag length; the item is updated in
    place. ``run_items`` is returned untouched when no lane is multiplexed.
    """
    fastq_dir = utils.add_full_path(fastq_dir)
    # determine if we have multiplexes and collect expected size
    multiplex_sizes = []
    barcode_lengths = []
    has_barcodes = False
    for lane_items in run_items:
        if len(lane_items) <= 1:
            continue
        has_barcodes = True
        barcode_lengths.extend([len(item["sequence"]) for item in lane_items])
        multiplex_sizes.append(_get_fastq_size(lane_items[0], fastq_dir, fc_name))
    if not has_barcodes:
        # nothing to worry about
        return run_items
    # discard 0 sizes to handle the case where lane(s) are empty or failed
    unique_sizes = set(multiplex_sizes)
    unique_sizes.discard(0)
    fastq_sizes = list(unique_sizes)
    tag_sizes = list(set(barcode_lengths))
    final_items = []
    for lane_items in run_items:
        if len(lane_items) == 1 and lane_items[0]["barcode_id"] is None:
            assert len(fastq_sizes) == 1, \
                "Multi and non-multiplex reads with multiple sizes"
            expected_size = fastq_sizes[0]
            assert len(tag_sizes) == 1, \
                "Expect identical tag size for a flowcell"
            tag_size = tag_sizes[0]
            this_size = _get_fastq_size(lane_items[0], fastq_dir, fc_name)
            if this_size != expected_size:
                assert this_size == expected_size - tag_size, \
                    "Unexpected non-multiplex sequence"
            else:
                item = lane_items[0]
                item["barcode_id"] = "trim"
                item["sequence"] = "N" * tag_size
                lane_items = [item]
        final_items.append(lane_items)
    return final_items
def main(system_config_file, cur_config_file):
    """Run iterative trimming with an aligner, then align, count and rank.

    The two configuration files are merged; fastq, alignment and work
    directories are resolved to full paths. ``trim_with_aligner`` is then
    run in parallel once per position in the first value of
    ``config["algorithm"]["simple_trims"]``, feeding the unaligned files of
    each round into the next. Aligned output from every round is combined,
    aligned, counted and passed to ``combine.identify_top_ranked``.

    Args:
        system_config_file: System-wide configuration file; its directory
            is also used to locate the Galaxy configuration.
        cur_config_file: Experiment-specific configuration file.
    """
    config = utils.merge_config_files([system_config_file, cur_config_file])
    run_module = "bcbio.hbc.linker"
    trim_vals = config["algorithm"]["simple_trims"]
    fastq_dir = utils.add_full_path(config["dir"]["fastq"])
    cur_files = [
        os.path.join(fastq_dir, x["file"]) for x in config["experiments"]
    ]
    dirs = {
        "config": utils.add_full_path(os.path.dirname(system_config_file)),
        "work": os.getcwd(),
        "align": utils.add_full_path(config["dir"]["align"])
    }
    dirs["galaxy"] = os.path.dirname(
        utils.add_full_path(config["galaxy_config"], dirs["config"]))
    config["dir"]["trim"] = utils.add_full_path(config["dir"]["work_trim"])
    config["dir"]["fastq"] = fastq_dir
    config["dir"]["work_fastq"] = utils.add_full_path(
        config["dir"]["work_fastq"])
    run_parallel = parallel_runner(run_module, dirs, config,
                                   system_config_file)
    aligned = []
    # list() keeps the indexing valid on Python 3, where dict.values()
    # returns a view that cannot be subscripted.
    num_rounds = len(list(trim_vals.values())[0])
    for i in range(num_rounds):
        # print() with one argument behaves the same on Python 2 and 3.
        print(cur_files)
        in_args = [(f, i, trim_vals, config) for f in cur_files]
        align_trimmed_files = run_parallel("trim_with_aligner", in_args)
        # only files that still have unaligned reads go to the next round
        cur_files = [
            x["unaligned"] for x in align_trimmed_files if x["unaligned"]
        ]
        aligned.append([x["aligned"] for x in align_trimmed_files])
    trimmed_fastq = combine_aligned(aligned, config)
    align_bams = do_alignment(trimmed_fastq, config, dirs, run_parallel)
    count_files = count_targets(align_bams, config)
    combine.identify_top_ranked(count_files, config)
def main(system_config_file, cur_config_file):
    """Trim reads iteratively with an aligner, then align, count and rank.

    Merges the two configuration files and resolves the fastq, alignment
    and work directories to full paths. ``trim_with_aligner`` runs in
    parallel once per position in the first value of
    ``config["algorithm"]["simple_trims"]``; files left unaligned after a
    round are the input of the next one. The aligned pieces are combined,
    aligned, counted and handed to ``combine.identify_top_ranked``.

    Args:
        system_config_file: System-wide configuration file; its directory
            is also used to locate the Galaxy configuration.
        cur_config_file: Experiment-specific configuration file.
    """
    config = utils.merge_config_files([system_config_file, cur_config_file])
    run_module = "bcbio.hbc.linker"
    trim_vals = config["algorithm"]["simple_trims"]
    fastq_dir = utils.add_full_path(config["dir"]["fastq"])
    cur_files = [os.path.join(fastq_dir, x["file"]) for x in config["experiments"]]
    dirs = {"config": utils.add_full_path(os.path.dirname(system_config_file)),
            "work" : os.getcwd(),
            "align": utils.add_full_path(config["dir"]["align"])}
    dirs["galaxy"] = os.path.dirname(utils.add_full_path(config["galaxy_config"], dirs["config"]))
    config["dir"]["trim"] = utils.add_full_path(config["dir"]["work_trim"])
    config["dir"]["fastq"] = fastq_dir
    config["dir"]["work_fastq"] = utils.add_full_path(config["dir"]["work_fastq"])
    run_parallel = parallel_runner(run_module, dirs, config, system_config_file)
    aligned = []
    # dict.values() is a non-indexable view on Python 3; materialize it so
    # the first trim list can be picked on both Python 2 and 3.
    first_trims = list(trim_vals.values())[0]
    for i in range(len(first_trims)):
        # Function-call form of print is valid on both Python 2 and 3.
        print(cur_files)
        in_args = [(f, i, trim_vals, config) for f in cur_files]
        align_trimmed_files = run_parallel("trim_with_aligner", in_args)
        # carry only the still-unaligned files into the next round
        cur_files = [x["unaligned"] for x in align_trimmed_files if x["unaligned"]]
        aligned.append([x["aligned"] for x in align_trimmed_files])
    trimmed_fastq = combine_aligned(aligned, config)
    align_bams = do_alignment(trimmed_fastq, config, dirs, run_parallel)
    count_files = count_targets(align_bams, config)
    combine.identify_top_ranked(count_files, config)
def combine_aligned(aligned, config):
    """Concatenate per-round aligned files into one trimmed fastq per experiment.

    ``aligned`` holds one list per round, indexed in the same order as
    ``config["experiments"]``. Output files are written to the configured
    ``final`` directory as ``<input stem>-trim.fastq``; outputs that already
    exist are left alone.

    Returns:
        The list of output file names, one per experiment.
    """
    final_dir = utils.safe_makedir(utils.add_full_path(config["dir"]["final"]))
    sample_files = [exp["file"] for exp in config["experiments"]]
    combined = []
    for idx, sample_file in enumerate(sample_files):
        stem = os.path.splitext(os.path.basename(sample_file))[0]
        target = os.path.join(final_dir, "{0}-trim.fastq".format(stem))
        if not utils.file_exists(target):
            # write to output file
            with open(target, "w") as writer:
                round_files = [per_round[idx] for per_round in aligned]
                for round_file in round_files:
                    with open(round_file) as reader:
                        writer.writelines(reader)
        combined.append(target)
    return combined
def get_refs(genome_build, aligner, galaxy_base, data):
    """Collect reference files and indexes from Galaxy configuration.

    Returns a dictionary keyed by tool name (``samtools`` is stored as
    ``fasta``). An entry gets ``base`` when the reference is an existing
    file and a sorted ``indexes`` list when any were globbed. A ``fastagz``
    entry is added when a ``.gz`` companion of the fasta base exists. From
    the fasta base an ``rtg`` entry (``mainIndex`` base plus the other files
    of the sdf directory) and, when the file exists, a ``twobit`` path are
    derived. Empty when ``genome_build`` is not set.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for tool in [x for x in ("samtools", aligner) if x]:
        galaxy_dt = _get_galaxy_data_table(tool, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(tool, galaxy_dt,
                                                    galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(tool, genome_build, loc_file, galaxy_dt,
                                           need_remap, galaxy_config, data)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        # Expand directories unless we are an aligner like minimap2 that uses the seq directory
        if os.path.isdir(base) and (not need_remap or os.path.basename(base) != "seq"):
            indexes = sorted(glob.glob(os.path.join(base, "*")))
        elif tool == "samtools":
            indexes = []
        else:
            indexes = sorted(glob.glob("%s*" % utils.splitext_plus(base)[0]))
        key = name_remap.get(tool, tool)
        entry = {}
        out[key] = entry
        if os.path.exists(base) and os.path.isfile(base):
            entry["base"] = base
        if indexes:
            entry["indexes"] = indexes
        # For references, add compressed inputs and indexes if they exist
        if key == "fasta" and "base" in entry and os.path.exists(entry["base"] + ".gz"):
            fasta_base = entry["base"]
            candidates = [fasta_base + ".gz.fai",
                          fasta_base + ".gz.gzi",
                          utils.splitext_plus(fasta_base)[0] + ".dict"]
            out[key + "gz"] = {"base": fasta_base + ".gz",
                               "indexes": [x for x in candidates if os.path.exists(x)]}
    # add additional indices relative to the base
    if tz.get_in(["fasta", "base"], out):
        ref_dir, ref_filebase = os.path.split(out["fasta"]["base"])
        rtg_dir = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "rtg",
                         "%s.sdf" % (os.path.splitext(ref_filebase)[0])))
        rtg_files = glob.glob(os.path.join(rtg_dir, "*"))
        out["rtg"] = {"base": os.path.join(rtg_dir, "mainIndex"),
                      "indexes": [x for x in rtg_files if not x.endswith("/mainIndex")]}
        twobit = os.path.normpath(
            os.path.join(ref_dir, os.path.pardir, "ucsc",
                         "%s.2bit" % (os.path.splitext(ref_filebase)[0])))
        if os.path.exists(twobit):
            out["twobit"] = twobit
    return out
def combine_aligned(aligned, config):
    """Merge the aligned pieces of every round into one fastq per experiment.

    For the experiment at position ``i``, the ``i``-th file of each round in
    ``aligned`` is appended, in round order, to ``<input stem>-trim.fastq``
    inside the configured ``final`` directory. Existing outputs are not
    rewritten.

    Returns:
        Output file names in the order of ``config["experiments"]``.
    """
    out_dir = utils.safe_makedir(utils.add_full_path(config["dir"]["final"]))
    experiment_files = [experiment["file"] for experiment in config["experiments"]]
    outputs = []
    for position, in_name in enumerate(experiment_files):
        base_name = os.path.splitext(os.path.basename(in_name))[0]
        out_name = os.path.join(out_dir, "{0}-trim.fastq".format(base_name))
        if not utils.file_exists(out_name):
            with open(out_name, "w") as sink:
                pieces = [round_files[position] for round_files in aligned]
                for piece in pieces:
                    with open(piece) as source:
                        sink.writelines(source)
        outputs.append(out_name)
    return outputs
def get_refs(genome_build, aligner, galaxy_base):
    """Collect reference base files and their indexes from Galaxy configuration.

    Returns a dictionary keyed by tool name (``samtools`` is stored as
    ``fasta``); each value holds the normalized ``base`` path and the
    ``indexes`` sharing its extension-less prefix, excluding the base
    itself. The result is empty when ``genome_build`` is not set.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for tool in [x for x in (aligner, "samtools") if x]:
        galaxy_dt = _get_galaxy_data_table(tool, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(tool, galaxy_dt,
                                                    galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(tool, genome_build, loc_file, galaxy_dt, need_remap)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        prefix = utils.splitext_plus(base)[0]
        indexes = glob.glob("%s*" % prefix)
        try:
            indexes.remove(base)
        except ValueError:
            pass
        out[name_remap.get(tool, tool)] = {"base": base, "indexes": indexes}
    return out
def _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt, need_remap,
                             galaxy_config):
    """Find the reference for a genome build in a Galaxy *.loc file.

    Entries come from ``_galaxy_loc_iter``; when several match the build the
    most recently added (last) one wins. With ``need_remap`` set, the
    tool's ``remap_index_fn`` is applied to the absolute reference path.

    Raises:
        IndexError: If the location file has no entry for ``genome_build``.
    """
    matches = [path for build, path in _galaxy_loc_iter(loc_file, galaxy_dt, need_remap)
               if build == genome_build]
    if not matches:
        raise IndexError("Genome %s not found in %s" % (genome_build, loc_file))
    ref_path = matches[-1]
    if not need_remap:
        return ref_path
    remap_fn = alignment.TOOLS[name].remap_index_fn
    full_ref = os.path.normpath(utils.add_full_path(ref_path, galaxy_config["tool_data_path"]))
    assert remap_fn is not None, "%s requires remapping function from base location file" % name
    return remap_fn(os.path.abspath(full_ref))
def get_refs(genome_build, aligner, galaxy_base, data):
    """Collect reference files and indexes from Galaxy configuration.

    Returns a dictionary keyed by tool name (``samtools`` is stored as
    ``fasta``). Every entry has an ``indexes`` list; ``base`` is added only
    when the reference is an existing file. The result is empty when
    ``genome_build`` is not set.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    for tool in [x for x in ("samtools", aligner) if x]:
        galaxy_dt = _get_galaxy_data_table(tool, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(tool, galaxy_dt,
                                                    galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(tool, genome_build, loc_file, galaxy_dt,
                                           need_remap, galaxy_config, data)
        base = os.path.normpath(utils.add_full_path(cur_ref, galaxy_config["tool_data_path"]))
        # A directory contributes its contents; a file contributes whatever
        # shares its extension-less prefix.
        if os.path.isdir(base):
            pattern = os.path.join(base, "*")
        else:
            pattern = "%s*" % utils.splitext_plus(base)[0]
        entry = {"indexes": glob.glob(pattern)}
        out[name_remap.get(tool, tool)] = entry
        if os.path.exists(base) and os.path.isfile(base):
            entry["base"] = base
    return out
def get_genome_ref(genome_build, aligner, galaxy_base):
    """Look up aligner and samtools reference locations from Galaxy configuration.

    Returns:
        ``(aligner_ref, samtools_ref)`` as full paths; the aligner slot is
        ``None`` when no aligner is given, and ``(None, None)`` is returned
        when ``genome_build`` is empty.
    """
    if not genome_build:
        return (None, None)
    galaxy_config = _get_galaxy_tool_info(galaxy_base)

    def _resolve(tool):
        # Locate the tool's loc file and pull out the reference for the build.
        galaxy_dt = _get_galaxy_data_table(tool, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(tool, galaxy_dt,
                                                    galaxy_config["tool_data_path"],
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(tool, genome_build, loc_file, galaxy_dt, need_remap)
        return utils.add_full_path(cur_ref, galaxy_config["tool_data_path"])

    out_info = [_resolve(tool) if tool else None for tool in (aligner, "samtools")]
    if len(out_info) != 2:
        raise ValueError("Did not find genome reference for %s %s" % (genome_build, aligner))
    return tuple(out_info)