def _collect_sequence_names(bed_file, fasta_file, min_columns=6):
    """Return the names of all regions in a BED file as a frozenset.

    The records of 'bed_file' are read with read_bed_file, using the contigs
    collected from 'fasta_file'. All records that share a name must be
    located on the same contig and on the same strand; the first record
    seen with a given name serves as the reference for later records.

    Raises MakefileError if two records with the same name are found on
    different contigs or on different strands.
    """
    contigs = _collect_fasta_contigs(fasta_file)
    sequences = {}

    # NOTE(review): 'min_columns' is accepted but not forwarded; the reader
    # is always asked for 6 columns, presumably because both the name and
    # the strand are read below -- confirm before forwarding the parameter.
    for record in read_bed_file(bed_file, min_columns=6, contigs=contigs):
        current = (record.contig, record.strand)
        # Stores 'current' for names not seen before, otherwise returns the
        # (contig, strand) pair of the first record with this name
        reference = sequences.setdefault(record.name, current)

        if current[0] != reference[0]:
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different contigs (%r and "
                                "%r); note that PALEOMIX assumes that "
                                "regions with the same name constitute "
                                "parts of a single consecutive sequence, "
                                "which must therefore be located on one "
                                "strand of a single sequence. Please "
                                "rename one or more of these regions to "
                                "continue." % (bed_file, record.name,
                                               current[0], reference[0]))
        elif current[1] != reference[1]:
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different strands; note "
                                "that PALEOMIX assumes that regions with "
                                "the same name constitute parts of a "
                                "single consecutive sequence, and that "
                                "these must therefore be located on the "
                                "same strand." % (bed_file, record.name,))

    return frozenset(sequences)
def _validate_prefixes(makefiles):
    """Validate the prefixes and regions-of-interest of the makefiles.

    This includes an implementation of the checks carried out by GATK, which
    requires that the FASTA for the human genome is ordered 1 .. 23, since
    GATK will not run with human genomes in a different order.

    The "IndexFormat" of every prefix is set as a side effect: ".bai" by
    default, or ".csi" if any sequence in the FASTA file is longer than
    _BAM_MAX_SEQUENCE_LENGTH. Prefixes sharing a FASTA path that has already
    been validated simply reuse the index format determined earlier.

    Raises MakefileError if the FASTA file cannot be indexed or if a
    regions-of-interest file cannot be read.
    """
    checked_prefixes = {}
    print_info(" - Validating prefixes ...")

    for makefile in makefiles:
        gatk_is_used = makefile["Options"]["Features"]["RealignedBAM"]

        for prefix in makefile["Prefixes"].itervalues():
            fasta_path = prefix["Path"]
            if fasta_path in checked_prefixes:
                earlier = checked_prefixes[fasta_path]
                prefix["IndexFormat"] = earlier["IndexFormat"]
                continue

            # A valid value is required even when the FASTA file is missing
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(fasta_path):
                print_warn(" - Reference FASTA file does not exist:\n %r"
                           % (fasta_path,))
                continue

            if not os.path.exists(fasta_path + ".fai"):
                print_info(" - Index does not exist for %r; this may take "
                           "a while ..." % (fasta_path,))

            try:
                contigs = FASTA.index_and_collect_contigs(fasta_path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs,
                                   fatal=gatk_is_used)

            contig_lengths = dict(contigs)
            rois = prefix.get("RegionsOfInterest", {})
            for (roi_name, roi_path) in rois.iteritems():
                try:
                    # read_bed_file is lazy; exhaust it to validate the file
                    for _ in bedtools.read_bed_file(roi_path,
                                                    contigs=contig_lengths):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-interest "
                                        "%r for prefix %r:\n%s"
                                        % (roi_name, prefix["Name"], error))

            longest_sequence = max(contig_lengths.itervalues())
            if longest_sequence > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn(" - FASTA file %r contains sequences longer than "
                           "%i! CSI index files will be used instead of BAI "
                           "index files."
                           % (fasta_path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            checked_prefixes[fasta_path] = prefix
def _validate_prefixes(makefiles):
    """Check reference prefixes and their regions-of-interest.

    Also implements the checks included in GATK, which require that the
    FASTA for the human genome is ordered 1 .. 23; this is needed because
    GATK will not run with human genomes in a different order.

    Each prefix gets an "IndexFormat" entry: ".bai" unless the FASTA file
    contains a sequence longer than _BAM_MAX_SEQUENCE_LENGTH, in which case
    ".csi" is used. A FASTA path is only validated once; later prefixes
    with the same path copy the index format of the first.

    Raises MakefileError on failure to index the FASTA file or to read a
    regions-of-interest BED file.
    """
    validated_by_path = {}
    print_info(" - Validating prefixes ...")

    for makefile in makefiles:
        features = makefile["Options"]["Features"]
        realign_with_gatk = features["RealignedBAM"]

        for prefix in makefile["Prefixes"].itervalues():
            reference = prefix["Path"]

            if reference in validated_by_path:
                index_format = validated_by_path[reference]["IndexFormat"]
                prefix["IndexFormat"] = index_format
                continue

            # Must hold a valid value even if the FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(reference):
                message = (" - Reference FASTA file does not exist:\n"
                           " %r")
                print_warn(message % (reference,))
                continue
            elif not os.path.exists(reference + ".fai"):
                message = (" - Index does not exist for %r; this may "
                           "take a while ...")
                print_info(message % (reference,))

            try:
                contigs = FASTA.index_and_collect_contigs(reference)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs,
                                   fatal=realign_with_gatk)

            contigs = dict(contigs)
            regions = prefix.get("RegionsOfInterest", {})
            for (region_name, bed_path) in regions.iteritems():
                try:
                    # The reader returns an iterator; consume it fully so
                    # that every record is checked
                    records = bedtools.read_bed_file(bed_path,
                                                     contigs=contigs)
                    for _ in records:
                        pass
                except (bedtools.BEDError, IOError) as error:
                    message = ("Error reading regions-of-"
                               "interest %r for prefix %r:\n%s")
                    raise MakefileError(message % (region_name,
                                                   prefix["Name"],
                                                   error))

            if max(contigs.itervalues()) > _BAM_MAX_SEQUENCE_LENGTH:
                message = (" - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files.")
                print_warn(message % (reference, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            validated_by_path[reference] = prefix
def collect_bed_regions(filename):
    """Read every record of a BED file and return them as a list.

    Records with fewer than 4 fields are given a name built from their
    contig ("<contig>*"). Each record is stored as the result of calling
    its freeze() method.

    Contig and region names are passed through a cache so that records with
    equal contigs/names share a single string object instead of each
    holding their own copy.
    """
    regions = []
    # Maps each distinct string to the first instance of it that was seen
    name_cache = {}
    for record in read_bed_file(filename):
        if len(record) < 4:
            record.name = "%s*" % (record.contig,)

        # setdefault (rather than get) is required for the cache to ever be
        # filled; with get the lookups would always fall back to the record's
        # own strings and nothing would be shared
        record.contig = name_cache.setdefault(record.contig, record.contig)
        record.name = name_cache.setdefault(record.name, record.name)

        regions.append(record.freeze())

    return regions
def collect_bed_regions(filename):
    """Return the records of the BED file 'filename' as a list.

    A record with fewer than 4 fields has its name set to "<contig>*".
    Every record is appended in frozen form (the result of record.freeze()).

    The contig and name of each record are replaced by the first equal
    string encountered, so that repeated values are stored only once.
    """
    regions = []
    # First-seen instance of every contig / region name
    name_cache = {}
    for record in read_bed_file(filename):
        if len(record) < 4:
            record.name = "%s*" % (record.contig, )

        # The cache must be populated on a miss (setdefault); a plain get()
        # against a dict that is never written to would have no effect
        record.contig = name_cache.setdefault(record.contig, record.contig)
        record.name = name_cache.setdefault(record.name, record.name)

        regions.append(record.freeze())

    return regions
def collect_regions(bedfile, bam_input_handle):
    """Determine the regions to be genotyped / pileup'd.

    If no BED file is given, one region of the form (contig, start, end) is
    returned per reference sequence of the BAM handle, spanning the whole
    sequence; start is zero-based and end is open.

    Otherwise the records of the BED file are read, sorted by
    sort_bed_by_bamfile using the BAM handle, and the result of passing
    them to merge_bed_regions is returned.
    """
    if bedfile is None:
        # No restrictions; cover every reference sequence in full
        return [
            (name, 0, length)
            for (name, length) in zip(bam_input_handle.references,
                                      bam_input_handle.lengths)
        ]

    bed_records = list(read_bed_file(bedfile))
    sort_bed_by_bamfile(bam_input_handle, bed_records)
    return merge_bed_regions(bed_records)
def _validate_prefixes(makefiles):
    """Validate the reference FASTA files and regions-of-interest of makefiles.

    For every prefix the "IndexFormat" key is set: ".bai" by default, or
    ".csi" if the FASTA file contains a sequence that is longer than
    _BAM_MAX_SEQUENCE_LENGTH. FASTA files that do not exist only result in
    a logged warning. Each FASTA path is validated once; further prefixes
    using the same path reuse the index format found the first time.

    Raises MakefileError if a FASTA file cannot be indexed, or if a
    regions-of-interest file cannot be read.
    """
    logger = logging.getLogger(__name__)

    already_validated = {}
    logger.info("Validating FASTA files")
    for makefile in makefiles:
        for prefix in makefile["Prefixes"].values():
            path = prefix["Path"]
            if path in already_validated:
                prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                # Logger.warn is a deprecated alias; use Logger.warning
                logger.warning("Reference FASTA file does not exist: %r", path)
                continue
            elif not os.path.exists(path + ".fai"):
                logger.info("Indexing FASTA at %r", path)

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            regions_of_interest = prefix.get("RegionsOfInterest", {})
            for (name, fpath) in regions_of_interest.items():
                try:
                    # read_bed_file returns iterator; it has to be exhausted
                    # for all records to be checked
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s"
                                        % (name, prefix["Name"], error))

            if max(contigs.values()) > _BAM_MAX_SEQUENCE_LENGTH:
                logger.warning(
                    "FASTA file %r contains sequences longer "
                    "than %i! CSI index files will be used instead "
                    "of BAI index files.",
                    path,
                    _BAM_MAX_SEQUENCE_LENGTH,
                )
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix
def _run(self, config, temp):
    """Write a padded copy of the input BED file to the temporary folder.

    Contig lengths are read from the first two tab-separated columns of the
    FAI file. The start of every BED record is lowered by the padding
    amount and its end is raised by the same amount, limited to the range
    from 0 to the length of the contig the record is located on.

    Raises NodeError if the FAI file lists the same contig name twice.
    """
    contig_lengths = {}
    with open(self._fai_file) as fai_handle:
        for fai_line in fai_handle:
            contig_name, contig_length, _ = fai_line.split('\t', 2)
            if contig_name in contig_lengths:
                raise NodeError('Reference genome contains multiple '
                                'identically named contigs (%r)!'
                                % (contig_name,))

            contig_lengths[contig_name] = int(contig_length)

    destination = reroot_path(temp, self._outfile)
    with open(destination, 'w') as out_handle:
        bed_records = read_bed_file(self._infile, contigs=contig_lengths)
        for record in bed_records:
            upper_bound = contig_lengths[record.contig]
            # Pad both ends without leaving the contig
            record.start = max(0, record.start - self._amount)
            record.end = min(record.end + self._amount, upper_bound)

            out_handle.write('%s\n' % (record,))
def _run(self, config, temp):
    """Pad the records of the input BED file and write them to 'temp'.

    The length of each contig is taken from the FAI file (name and length
    are its first two tab-separated fields). Every record is extended by
    the padding amount in both directions, but never below position 0 and
    never past the end of its contig, and is then written on its own line.

    Raises NodeError if a contig name occurs more than once in the FAI file.
    """
    lengths_by_contig = {}
    with open(self._fai_file) as index_file:
        for entry in index_file:
            name, length, _ = entry.split('\t', 2)
            if name not in lengths_by_contig:
                lengths_by_contig[name] = int(length)
                continue

            raise NodeError('Reference genome contains multiple '
                            'identically named contigs (%r)!' % (name,))

    with open(reroot_path(temp, self._outfile), 'w') as output_file:
        for record in read_bed_file(self._infile,
                                    contigs=lengths_by_contig):
            contig_end = lengths_by_contig[record.contig]

            # Clamp the padded coordinates to the extent of the contig
            record.start = max(0, record.start - self._amount)
            record.end = min(record.end + self._amount, contig_end)

            output_file.write('%s\n' % (record,))
def _run(self, config, temp):
    """Pad and merge the input BED records, writing the result to 'temp'.

    Contig lengths are collected from the first two tab-separated fields of
    each line in the FAI file. All BED records are loaded, handed to
    pad_bed_records together with the padding amount and the contig
    lengths, and the records produced by merge_bed_records are written one
    per line.

    Raises NodeError if a contig name is listed twice in the FAI file.
    """
    contig_sizes = {}
    with open(self._fai_file) as fai_handle:
        for fai_line in fai_handle:
            contig, size, _ = fai_line.split("\t", 2)
            if contig in contig_sizes:
                raise NodeError("Reference genome contains multiple "
                                "identically named contigs (%r)!"
                                % (contig,))

            contig_sizes[contig] = int(size)

    output_path = reroot_path(temp, self._outfile)
    with open(output_path, "w") as out_handle:
        bed_records = list(read_bed_file(self._infile, contigs=contig_sizes))
        # The return value is unused, so the records are presumably padded
        # in place -- confirm against pad_bed_records
        pad_bed_records(records=bed_records,
                        padding=self._amount,
                        max_sizes=contig_sizes)

        out_handle.writelines("%s\n" % (merged,)
                              for merged in merge_bed_records(bed_records))