def _collect_and_validate_regions(regions):
    """Validate the BED file of a regions entry against its reference contigs.

    Every line of the file at regions["BED"] that is neither blank nor a
    '#' comment is parsed as a BEDRecord and must

      1. contain at least 6 fields,
      2. name a contig present in _collect_fasta_contigs(regions), and
      3. satisfy 0 <= start < end <= length of that contig.

    Raises:
        MakefileError: if a line cannot be parsed, or fails any of the checks
            listed above. Line numbers in the messages are 1-based.
    """
    # Used below as a mapping of contig name -> contig length.
    contigs = _collect_fasta_contigs(regions)
    bed_path = regions["BED"]

    # NOTE(review): the record names are collected here, but nothing in the
    # visible function returns or reads them -- confirm whether a trailing
    # return statement was lost.
    sequences = set()
    with open(bed_path) as bedhandle:
        # Enumerate from 1 so that every message reports the same, 1-based
        # line number for a given line.
        for (line_num, line) in enumerate(bedhandle, 1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = BEDRecord(line)
            except ValueError as error:
                raise MakefileError(
                    ("Error parsing line %i in regions file:\n"
                     " Path = %r\n Line = %r\n\n%s")
                    % (line_num, bed_path, line, error))

            if len(bed) < 6:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                # The name is the 4th field, so it only exists for records
                # with more than 3 fields.
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise MakefileError(("Region at line #%i (%s) does not "
                                     "contain the expected number of fields; "
                                     "the first 6 fields are required. C.f. "
                                     "defination at\n %s\n\nPath = %r")
                                    % (line_num, name, url, bed_path))

            contig_len = contigs.get(bed.contig)
            if contig_len is None:
                raise MakefileError(("Regions file contains contig not found "
                                     "in reference:\n Path = %r\n Contig = "
                                     "%r\n\nPlease ensure that all contig "
                                     "names match the reference names!")
                                    % (bed_path, bed.contig))

            if not (0 <= bed.start < bed.end <= contig_len):
                raise MakefileError(("Regions file contains invalid region:\n"
                                     " Path = %r\n Contig = %r\n"
                                     " Start = %s\n End = %s\n\n"
                                     "Expected 0 <= Start < End <= %i!")
                                    % (bed_path, bed.contig, bed.start,
                                       bed.end, contig_len))

            sequences.add(bed.name)
def _stat_areas_of_interest(cls, prefixes):
    """Summarize the regions-of-interest BED files of every prefix.

    Args:
        prefixes: Mapping of prefix name -> prefix settings. The optional
            "Label" key overrides the prefix name in generated labels, and the
            optional "RegionsOfInterest" key maps ROI names to BED filenames.
            "Name" is required for any prefix that has regions of interest.

    Returns:
        A dict keyed by (prefix name, ROI name). Each value is a dict with:
          "Size":       sum of (end - start) over all records,
          "NFeatures":  number of distinct record names; records with fewer
                        than 4 fields are named after their contig plus "*",
          "NIntervals": number of records,
          "Genome":     the "Name" of the prefix,
          "Name":       the ROI name,
          "Label":      "<prefix label>:<ROI name>",
          "Path":       the BED filename.
    """
    areas_of_interest = {}
    for (prefix_name, prefix) in prefixes.items():
        prefix_label = prefix.get("Label", prefix_name)
        regions_of_interest = prefix.get("RegionsOfInterest", {})

        for (roi_name, roi_filename) in regions_of_interest.items():
            count, names, size = 0, set(), 0
            with open(roi_filename) as handle:
                for line in handle:
                    # Blank lines and '#' comments carry no interval; skip
                    # them the same way the regions validation does.
                    line = line.strip()
                    if not line or line.startswith("#"):
                        continue

                    bed = BEDRecord(line)
                    # Records without a name field are grouped per contig.
                    if len(bed) >= 4:
                        names.add(bed.name)
                    else:
                        names.add(bed.contig + "*")
                    size += (bed.end - bed.start)
                    count += 1

            areas_of_interest[(prefix_name, roi_name)] = {
                "Size": size,
                "NFeatures": len(names),
                "NIntervals": count,
                "Genome": prefix["Name"],
                "Name": roi_name,
                "Label": "%s:%s" % (prefix_label, roi_name),
                "Path": roi_filename,
            }

    return areas_of_interest