def test_index_and_collect_contigs__fai_files(tmp_path):
    """Indexing the same FASTA twice must leave the .fai file unchanged."""
    fasta_path = tmp_path / "test.fasta"
    with fasta_path.open("wt") as out:
        _TEST_FASTA_1_A.write(out)

    index_path = tmp_path / "test.fasta.fai"

    # The .fai file should be created once, and then not modified; the stat
    # result taken after each call is therefore expected to be identical
    snapshots = []
    for _ in range(2):
        FASTA.index_and_collect_contigs(fasta_path)
        snapshots.append(index_path.stat())

    first, second = snapshots
    assert first == second
def _collect_fasta_contigs(filename, cache={}): if filename in cache: return cache[filename] if not os.path.exists(filename + ".fai"): print_info(" - Index does not exist for %r; this may " "take a while ..." % (filename,)) cache[filename] = contigs = dict(FASTA.index_and_collect_contigs(filename)) return contigs
def _collect_fasta_contigs(filename, cache={}): if filename in cache: return cache[filename] if not os.path.exists(filename + ".fai"): print_info(" - Index does not exist for %r; this may " "take a while ..." % (filename,)) cache[filename] = contigs = dict(FASTA.index_and_collect_contigs(filename)) return contigs
def _collect_fasta_contigs(filename, cache={}): if filename in cache: return cache[filename] if not os.path.exists(filename + ".fai"): log = logging.getLogger(__name__) log.info("Indexing %r; this may take a while", filename) cache[filename] = contigs = FASTA.index_and_collect_contigs(filename) return contigs
def _validate_prefixes(makefiles):
    """Validate the prefixes of every makefile and select their index formats.

    For each prefix, the FASTA file is indexed if needed and checked using
    `_do_validate_hg_prefix`, an implementation of the checks included in
    GATK, which require that the FASTA for the human genome is ordered
    1 .. 23. These checks are only fatal if the "RealignedBAM" feature is
    enabled for the makefile. Every regions-of-interest BED file is then read
    through once using the contigs of the FASTA.

    The "IndexFormat" of a prefix is set to ".csi" if any contig is longer
    than `_BAM_MAX_SEQUENCE_LENGTH`, and to ".bai" otherwise, including when
    the FASTA file does not exist (which only results in a warning).

    Raises MakefileError if the FASTA cannot be indexed, or if a
    regions-of-interest file cannot be read.
    """
    print_info(" - Validating prefixes ...")

    # Validated prefixes by FASTA path; a path used by several prefixes is
    # only processed once, and later prefixes copy the selected index format
    validated = {}
    for makefile in makefiles:
        gatk_enabled = makefile["Options"]["Features"]["RealignedBAM"]

        for prefix in makefile["Prefixes"].itervalues():
            fasta_path = prefix["Path"]
            if fasta_path in validated:
                prefix["IndexFormat"] = validated[fasta_path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(fasta_path):
                print_warn(
                    " - Reference FASTA file does not exist:\n %r" % (fasta_path,)
                )
                continue

            if not os.path.exists(fasta_path + ".fai"):
                print_info(
                    " - Index does not exist for %r; this may take a while ..."
                    % (fasta_path,)
                )

            try:
                contigs = FASTA.index_and_collect_contigs(fasta_path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=gatk_enabled)

            contigs = dict(contigs)
            regions = prefix.get("RegionsOfInterest", {})
            for name, bed_path in regions.iteritems():
                try:
                    # read_bed_file returns an iterator, which is consumed
                    # here while discarding the records
                    for _ in bedtools.read_bed_file(bed_path, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError(
                        "Error reading regions-of-interest %r for prefix %r:\n%s"
                        % (name, prefix["Name"], error)
                    )

            longest = max(contigs.itervalues())
            if longest > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn(
                    " - FASTA file %r contains sequences longer "
                    "than %i! CSI index files will be used instead "
                    "of BAI index files." % (fasta_path, _BAM_MAX_SEQUENCE_LENGTH)
                )
                prefix["IndexFormat"] = ".csi"

            validated[fasta_path] = prefix
def _validate_prefixes(makefiles):
    """Check the reference prefixes of all makefiles, setting "IndexFormat".

    The FASTA file of each prefix is indexed if needed and passed to
    `_do_validate_hg_prefix`, which implements the checks included in GATK
    requiring that the FASTA for the human genome is ordered 1 .. 23; a
    failure is only fatal if the makefile enables the "RealignedBAM" feature.
    Regions-of-interest BED files are read through once using the contigs
    collected from the FASTA.

    "IndexFormat" is ".bai" unless a contig is longer than
    `_BAM_MAX_SEQUENCE_LENGTH`, in which case it is set to ".csi". A missing
    FASTA file results in a warning and the ".bai" default.

    Raises MakefileError on FASTA indexing errors and on unreadable
    regions-of-interest files.
    """

    def _read_regions_of_interest(prefix, contigs):
        # Reads every regions-of-interest file of the prefix, discarding the
        # records; read_bed_file returns an iterator
        regions_of_interest = prefix.get("RegionsOfInterest", {})
        for (roi_name, roi_path) in regions_of_interest.iteritems():
            try:
                for _ in bedtools.read_bed_file(roi_path, contigs=contigs):
                    pass
            except (bedtools.BEDError, IOError) as error:
                message = "Error reading regions-of-interest %r for prefix %r:\n%s"
                raise MakefileError(message % (roi_name, prefix["Name"], error))

    done = {}
    print_info(" - Validating prefixes ...")
    for makefile in makefiles:
        fatal = makefile["Options"]["Features"]["RealignedBAM"]

        for prefix in makefile["Prefixes"].itervalues():
            reference = prefix["Path"]
            if reference in done:
                # Already processed for another prefix; reuse its format
                prefix["IndexFormat"] = done[reference]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(reference):
                message = " - Reference FASTA file does not exist:\n %r"
                print_warn(message % (reference,))
                continue
            elif not os.path.exists(reference + ".fai"):
                message = " - Index does not exist for %r; this may take a while ..."
                print_info(message % (reference,))

            try:
                contigs = FASTA.index_and_collect_contigs(reference)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=fatal)

            contigs = dict(contigs)
            _read_regions_of_interest(prefix, contigs)

            if max(contigs.itervalues()) > _BAM_MAX_SEQUENCE_LENGTH:
                message = (
                    " - FASTA file %r contains sequences longer "
                    "than %i! CSI index files will be used instead "
                    "of BAI index files."
                )
                print_warn(message % (reference, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            done[reference] = prefix
def test_index_and_collect_contigs__duplicate_names(tmp_path):
    """Check the contigs collected from a FASTA built from three records.

    The records _TEST_FASTA_1_A, _TEST_FASTA_2 and _TEST_FASTA_1_B are written
    in that order; presumably the first and last share the name "seq1".
    """
    fasta_path = tmp_path / "test.fasta"
    with fasta_path.open("wt") as out:
        for record in (_TEST_FASTA_1_A, _TEST_FASTA_2, _TEST_FASTA_1_B):
            record.write(out)

    expected = {
        "seq1": 25,
        "seq2": 7,
    }
    assert FASTA.index_and_collect_contigs(fasta_path) == expected
def _stat_prefixes(cls, prefixes): """Returns (size, number of contigs) for a set of BWA prefix.""" genomes = {} for prefix in prefixes: contigs = FASTA.index_and_collect_contigs(prefixes[prefix]["Reference"]) genomes[prefix] = { "Size": sum(contigs.values()), "NContigs": len(contigs), } return genomes
def _validate_prefixes(makefiles): logger = logging.getLogger(__name__) already_validated = {} logger.info("Validating FASTA files") for makefile in makefiles: for prefix in makefile["Prefixes"].values(): path = prefix["Path"] if path in already_validated: prefix["IndexFormat"] = already_validated[path]["IndexFormat"] continue # Must be set to a valid value, even if FASTA file does not exist prefix["IndexFormat"] = ".bai" if not os.path.exists(path): logger.warn("Reference FASTA file does not exist: %r", path) continue elif not os.path.exists(path + ".fai"): logger.info("Indexing FASTA at %r", path) try: contigs = FASTA.index_and_collect_contigs(path) except FASTAError as error: raise MakefileError("Error indexing FASTA:\n %s" % (error, )) regions_of_interest = prefix.get("RegionsOfInterest", {}) for (name, fpath) in regions_of_interest.items(): try: # read_bed_file returns iterator for _ in bedtools.read_bed_file(fpath, contigs=contigs): pass except (bedtools.BEDError, IOError) as error: raise MakefileError("Error reading regions-of-" "interest %r for prefix %r:\n%s" % (name, prefix["Name"], error)) if max(contigs.values()) > _BAM_MAX_SEQUENCE_LENGTH: logger.warn( "FASTA file %r contains sequences longer " "than %i! CSI index files will be used instead " "of BAI index files.", path, _BAM_MAX_SEQUENCE_LENGTH, ) prefix["IndexFormat"] = ".csi" already_validated[path] = prefix