def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
    """Check a human-genome prefix for lexically ordered chromosomes.

    GATK requires human chromosomes in numerical order; if the prefix looks
    mis-ordered, raise MakefileError when 'fatal' is set (GATK is enabled in
    the makefile), otherwise only print a warning.
    """
    if not _is_invalid_hg_prefix(contigs):
        return

    # Template takes (makefile path, prefix path, details) in that order
    message = \
        "Prefix appears to be a human genome, but chromosomes are ordered\n" \
        "lexically (chr1, chr10, chr11, ...), rather than numerically\n" \
        "(chr1, chr2, chr3, ...):\n\n" \
        " Makefile = %s\n" \
        " Prefix = %s\n\n" \
        "GATK requires that human chromosomes are ordered numerically;\n%s\n" \
        "See the documentation at the GATK website for more information:\n " \
        "http://www.broadinstitute.org/gatk/guide/article?id=1204\n"

    if fatal:
        details = "Either disable GATK in the makefile, or fix the prefix."
    else:
        details = \
            "You will not be able to use the resulting BAM file with GATK."

    message %= (makefile["Statistics"]["Filename"], prefix["Path"], details)

    if fatal:
        raise MakefileError(message)

    print_warn("\nWARNING:\n", message, sep="")
def _validate_makefiles_duplicate_files(makefiles):
    """Detect input files referenced by more than one record.

    A file included multiple times in the same target is a fatal error;
    a file shared between different targets only triggers a warning.
    """
    # realpath -> list of (target, sample, library, barcode) using that file
    paths_to_records = collections.defaultdict(list)
    for makefile in makefiles:
        for (target, sample, library, barcode, record) \
                in _iterate_over_records(makefile):
            if record["Type"] == "Raw":
                # "Raw" records map keys to lists of filenames
                files = []
                for fnames in record["Data"].itervalues():
                    files.extend(fnames)
            else:
                files = list(record["Data"].values())

            # realpath collapses symlinks, catching the same file referenced
            # through different paths
            for path in files:
                key = os.path.realpath(path)
                paths_to_records[key].append((target, sample, library,
                                              barcode))

    # Keep only files seen by more than one record, de-duplicating records
    has_overlap = {}
    for (fname, recs) in paths_to_records.iteritems():
        if len(recs) > 1:
            has_overlap[fname] = list(set(recs))

    # Sort (records, filename) pairs so groupby collects every file shared
    # by the same set of records into a single report
    ordered = sorted(zip(has_overlap.values(), has_overlap.keys()))
    for (records, group) in itertools.groupby(ordered, lambda item: item[0]):
        group = list(group)
        description = _describe_files_in_multiple_records(records, group)

        # Repeated target names imply duplication within a single target
        if len(set(rec[0] for rec in records)) != len(records):
            raise MakefileError("Path included multiple times in target:\n"
                                + description)

        print_warn("WARNING: Path included in multiple targets:\n%s\n"
                   % (description, ))
def build_mito_nodes(config, root, bamfile, dependencies=()):
    """Build the mitochondrial analysis sub-pipeline for one BAM file:
    consensus sequence, RAxML rapid-bootstrap phylogeny, and a tree figure.

    Returns a 1-tuple with the final (DrawPhylogeny) node, or an empty tuple
    if the zonkey database contains no mitochondrial sequences.
    """
    if config.database.mitochondria is None:
        print_warn("WARNING: Zonkey database %r does not contain "
                   "mitochondrial sequences; cannot analyze MT BAM %r!\n"
                   % (config.tablefile, bamfile))
        return ()

    mito_results = os.path.join(root, "results", "mitochondria")
    samples = os.path.join(root, "figures", "samples.txt")
    mt_prefix = os.path.join(mito_results, "sequences")
    raxml_template = os.path.join(mito_results, "raxml_%s")
    output_prefix = os.path.join(root, "figures", "mitochondria",
                                 "mito_phylo")

    alignment = mitochondria.MitoConsensusNode(database=config.tablefile,
                                               bamfile=bamfile,
                                               output_prefix=mt_prefix,
                                               dependencies=dependencies)

    phylo = RAxMLRapidBSNode.customize(input_alignment=mt_prefix + ".phy",
                                       output_template=raxml_template,
                                       dependencies=(alignment,))
    # 100 rapid-bootstrap replicates under the GTR+Gamma model
    phylo.command.set_option("-N", 100)
    phylo.command.set_option("-m", "GTRGAMMA")
    phylo = phylo.build_node()

    trees = mitochondria.DrawPhylogenyNode(
        samples=samples,
        treefile=raxml_template % ("bestTree",),
        bootstraps=raxml_template % ("bootstrap",),
        output_prefix=output_prefix,
        dependencies=(phylo,))

    return (trees,)
def _validate_makefiles_duplicate_files(makefiles):
    """Detect input files referenced by more than one record; duplicates
    within a single target are fatal, duplicates across targets warn only."""
    # Map from canonical (real) path -> records using that file
    filenames = collections.defaultdict(list)
    for makefile in makefiles:
        iterator = _iterate_over_records(makefile)
        for (target, sample, library, barcode, record) in iterator:
            current_filenames = []
            if record["Type"] == "Raw":
                # "Raw" records map keys to lists of filenames
                for raw_filenames in record["Data"].itervalues():
                    current_filenames.extend(raw_filenames)
            else:
                current_filenames.extend(record["Data"].values())

            # realpath collapses symlinks, so the same file is detected even
            # when referenced through different paths
            for realpath in map(os.path.realpath, current_filenames):
                filenames[realpath].append((target, sample, library, barcode))

    # Keep only files seen by more than one record (de-duplicated)
    has_overlap = {}
    for (filename, records) in filenames.iteritems():
        if len(records) > 1:
            has_overlap[filename] = list(set(records))

    # Sorting by (records, filename) is required for groupby below to gather
    # all files shared by the same set of records into one report
    by_records = sorted(zip(has_overlap.values(), has_overlap.keys()))
    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
        pairs = list(pairs)
        description = _describe_files_in_multiple_records(records, pairs)

        # Repeated target names imply the same file occurs multiple times
        # within a single target -> fatal
        if len(set(record[0] for record in records)) != len(records):
            message = "Path included multiple times in target:\n"
            raise MakefileError(message + description)
        else:
            print_warn("WARNING: Path included in multiple targets:\n%s\n"
                       % (description,))
def _update_filtering(mkfile):
    """Validate and expand the 'FilterSingletons' section of the makefile.

    Group references are resolved to concrete sample sets via
    _select_samples, and the result replaces the original section.

    Raises MakefileError if a filter target is a group, an unknown sample,
    or would be filtered exclusively against itself.

    Fix: corrected typo "specifed" -> "specified" in the unknown-sample
    error message.
    """
    samples = mkfile["Project"]["Samples"]
    groups = mkfile["Project"]["Groups"]

    filtering = {}
    for (target, filter_by) in mkfile["Project"]["FilterSingletons"].iteritems():
        if target.startswith("<") and target.endswith(">"):
            raise MakefileError("Singleton-filtering must be specified per "
                                "sample, not by groups: %r" % (target,))
        elif target not in samples:
            raise MakefileError("Unknown/Invalid sample specified for "
                                "singleton filtering: %r" % (target,))
        elif target in filter_by:
            raise MakefileError("Attempting to filter singleton in sample "
                                "using itself as comparison: %r" % (target,))

        path = "Project:FilterSingletons:%s" % (target,)
        filtering[target] = _select_samples(filter_by, groups, samples, path)

        # Implicit inclusion is allowed, since that is useful in some cases,
        # where we want to filter a sample based on the group it is a member of
        if target in filtering[target]:
            # The target itself must be excluded, as including it is invalid
            filtering[target] = filtering[target] - set((target,))
            print_warn("Warning: Sample %r is singleton-filtered using a "
                       "group it is also a member of; this may be by mistake."
                       % (target,))

        if not filtering[target]:
            raise MakefileError("No samples specified by which to "
                                "singleton-filter by for %r" % (target,))

    mkfile["Project"]["FilterSingletons"] = filtering
def _check_genders(mkfile):
    """Validate sample genders against the genders for which homozygous
    contigs are declared in the regions of interest.

    Raises MakefileError if regions disagree on the set of genders, if no
    genders are declared at all, or if any sample has an unlisted gender.
    Warns if 'HomozygousContigs' names contigs absent from every FASTA.

    Fix: corrected typo "assosiated" -> "associated" in the no-genders
    error message.
    """
    all_contigs = set()
    contigs_genders = set()
    regions_genders = set()
    for regions in mkfile["Project"]["Regions"].itervalues():
        all_contigs.update(_collect_fasta_contigs(regions["FASTA"]))

        for contigs in regions["HomozygousContigs"].itervalues():
            contigs_genders.update(contigs)

        # Every regions-of-interest entry must declare the same genders
        current_genders = set(regions["HomozygousContigs"])
        if not regions_genders:
            regions_genders = current_genders
        elif regions_genders != current_genders:
            raise MakefileError("List of genders for regions %r does not "
                                "match other regions" % (regions["Name"],))

    if not regions_genders:
        raise MakefileError("No genders have been specified in makefile; "
                            "please list all sample genders and associated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].itervalues():
        if sample["Gender"] not in regions_genders:
            genders = ", ".join(map(repr, regions_genders))
            message = "Sample %r has unknown gender %r; known genders are %s" \
                % (sample["Name"], sample["Gender"], genders)
            raise MakefileError(message)

    # Contigs declared homozygous but found in no reference FASTA
    unknown_contigs = contigs_genders - all_contigs
    if unknown_contigs:
        print_warn("WARNING: Unknown contig(s) in 'HomozygousContigs':\n - "
                   + "\n - ".join(unknown_contigs))
        print_warn("Please verify that the list(s) of contigs is correct!")
def _validate_prefixes(makefiles):
    """Validates prefixes and regions-of-interest, including an implementation
    of the checks included in GATK, which require that the FASTA for the human
    genome is ordered 1 .. 23. This is required since GATK will not run with
    human genomes in a different order.
    """
    # Cache of validated prefixes, keyed by FASTA path, so each reference is
    # only indexed / checked once even if used by several makefiles
    already_validated = {}
    print_info(" - Validating prefixes ...")
    for makefile in makefiles:
        uses_gatk = makefile["Options"]["Features"]["RealignedBAM"]
        for prefix in makefile["Prefixes"].itervalues():
            path = prefix["Path"]
            if path in already_validated:
                # Re-use the IndexFormat decided for this path earlier
                prefix["IndexFormat"] = already_validated[path]["IndexFormat"]
                continue

            # Must be set to a valid value, even if FASTA file does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                print_warn(" - Reference FASTA file does not exist:\n"
                           " %r" % (path, ))
                continue
            elif not os.path.exists(path + ".fai"):
                print_info(" - Index does not exist for %r; this may "
                           "take a while ..." % (path, ))

            try:
                # Returns (name, length) pairs for every contig in the FASTA
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError, error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error, ))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=uses_gatk)

            contigs = dict(contigs)
            regions_of_interest = prefix.get("RegionsOfInterest", {})
            for (name, fpath) in regions_of_interest.iteritems():
                try:
                    # read_bed_file returns iterator; consume it fully so
                    # every record is validated against the known contigs
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError), error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s"
                                        % (name, prefix["Name"], error))

            # BAI indexes cannot address very long sequences; fall back to CSI
            if max(contigs.itervalues()) > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn(" - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files."
                           % (path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix
def _validate_prefixes(makefiles):
    """Validates prefixes and regions-of-interest, including an implementation
    of the checks included in GATK, which require that the FASTA for the human
    genome is ordered 1 .. 23. This is required since GATK will not run with
    human genomes in a different order.
    """
    # Prefixes already checked, keyed by FASTA path; each path is validated
    # at most once, regardless of how many makefiles reference it
    already_validated = {}
    print_info(" - Validating prefixes ...")
    for makefile in makefiles:
        uses_gatk = makefile["Options"]["Features"]["RealignedBAM"]
        for prefix in makefile["Prefixes"].itervalues():
            path = prefix["Path"]
            cached = already_validated.get(path)
            if cached is not None:
                # Re-use the IndexFormat decided for this path earlier
                prefix["IndexFormat"] = cached["IndexFormat"]
                continue

            # Must be set to a valid value, even if the FASTA does not exist
            prefix["IndexFormat"] = ".bai"

            if not os.path.exists(path):
                print_warn(" - Reference FASTA file does not exist:\n"
                           " %r" % (path,))
                continue

            if not os.path.exists(path + ".fai"):
                print_info(" - Index does not exist for %r; this may "
                           "take a while ..." % (path,))

            try:
                contigs = FASTA.index_and_collect_contigs(path)
            except FASTAError as error:
                raise MakefileError("Error indexing FASTA:\n %s" % (error,))

            # Implementation of GATK checks for the human genome
            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=uses_gatk)

            contigs = dict(contigs)
            for (name, fpath) in prefix.get("RegionsOfInterest",
                                            {}).iteritems():
                try:
                    # read_bed_file returns iterator; exhaust it so that all
                    # records are validated against the known contigs
                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
                        pass
                except (bedtools.BEDError, IOError) as error:
                    raise MakefileError("Error reading regions-of-"
                                        "interest %r for prefix %r:\n%s"
                                        % (name, prefix["Name"], error))

            # BAI indexes cannot address very long sequences; use CSI instead
            if max(contigs.itervalues()) > _BAM_MAX_SEQUENCE_LENGTH:
                print_warn(" - FASTA file %r contains sequences longer "
                           "than %i! CSI index files will be used instead "
                           "of BAI index files."
                           % (path, _BAM_MAX_SEQUENCE_LENGTH))
                prefix["IndexFormat"] = ".csi"

            already_validated[path] = prefix
def _check_sexes(mkfile):
    """Validate sample sexes against the sexes for which homozygous contigs
    are declared in the regions of interest.

    Handles migration from the deprecated 'Gender' field to 'Sex': a lone
    'Gender' value is moved into 'Sex'; specifying both is an error.

    Raises MakefileError if regions disagree on the set of sexes, if no sexes
    are declared, if a sample has neither field or both fields, or if a
    sample's sex is not listed. Warns if 'HomozygousContigs' names contigs
    absent from every FASTA.

    Fixes: corrected typos in error messages ("assosiated" -> "associated";
    "specified sample" -> "specified for sample") and made the single-value
    '%' interpolations use proper 1-tuples.
    """
    all_contigs = set()
    contigs_sexes = set()
    regions_sexes = set()
    for regions in mkfile["Project"]["Regions"].itervalues():
        all_contigs.update(_collect_fasta_contigs(regions["FASTA"]))

        for contigs in regions["HomozygousContigs"].itervalues():
            contigs_sexes.update(contigs)

        # Every regions-of-interest entry must declare the same sexes
        current_sexes = set(regions["HomozygousContigs"])
        if not regions_sexes:
            regions_sexes = current_sexes
        elif regions_sexes != current_sexes:
            raise MakefileError("List of sexes for regions %r does not "
                                "match other regions" % (regions["Name"],))

    if not regions_sexes:
        raise MakefileError("No sexes have been specified in makefile; "
                            "please list all sample sexes and associated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].itervalues():
        if sample.get("Sex") is None:
            if sample.get("Gender") is None:
                raise MakefileError("Please specify a sex for sample %r, or "
                                    "'NA' if not applicable."
                                    % (sample["Name"],))

            # Migrate the deprecated 'Gender' field to 'Sex'
            sample["Sex"] = sample.pop("Gender")
        elif sample.get("Gender") is not None:
            raise MakefileError("Both a Sex and a Gender has been specified "
                                "for sample %r; the Gender field is "
                                "deprecated, please only use the Sex field."
                                % (sample["Name"],))

        if sample["Sex"] not in regions_sexes:
            sexes = ", ".join(map(repr, regions_sexes))
            message = "Sample %r has unknown sex %r; known sexes are %s" \
                % (sample["Name"], sample["Sex"], sexes)
            raise MakefileError(message)

    # Contigs declared homozygous but found in no reference FASTA
    unknown_contigs = contigs_sexes - all_contigs
    if unknown_contigs:
        print_warn("WARNING: Unknown contig(s) in 'HomozygousContigs':\n"
                   " - " + "\n - ".join(unknown_contigs))
        print_warn("Please verify that the list(s) of contigs is correct!")
def _validate_makefile_adapters(makefile):
    """Checks for the default adapter sequences specified in the wrong
    orientation for AdapterRemoval, which is a typical mistake when using
    the --pcr2 option.
    """
    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"

    tests = {
        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
        "--pcr2": adapter_2,
        # --adapter2 (AdapterRemoval v2) expects the regular sequence
        "--adapter2": sequences.reverse_complement(adapter_2)
    }

    results = dict.fromkeys(tests, False)

    def _flag_suspect_values(options):
        # Mark any option whose value equals the known-wrong orientation
        for key, value in tests.iteritems():
            if options.get(key) == value:
                results[key] = True

    # Check per-record options as well as the makefile-wide defaults
    for (_, _, _, _, record) in _iterate_over_records(makefile):
        _flag_suspect_values(record.get("Options", {})
                             .get("AdapterRemoval", {}))

    _flag_suspect_values(makefile.get("Options", {})
                         .get("AdapterRemoval", {}))

    if any(results.itervalues()):
        print_warn(
            "WARNING: An adapter specified for AdapterRemoval "
            "corresponds to the default sequence, but is reverse "
            "complemented. Please make sure that this is intended! ",
            end="")

        if results["--pcr2"]:
            print_warn("For --pcr2, the sequence given should be the "
                       "reverse complement of the sequence observed in the "
                       "mate 2 FASTQ file.\n")

        if results["--adapter2"]:
            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
                       "should be exactly as observed in the FASTQ reads.\n")
def _validate_makefile_adapters(makefile): """Checks for the default adapter sequences specified in the wrong orientation for AdapterRemoval, which is a typical mistake when using the --pcr2 option. """ # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT" tests = { # --pcr2 expects the reverse complement of the mate 2 adapter seq. "--pcr2": adapter_2, # --adapter2 (AdapterRemoval v2) expects the regular sequence "--adapter2": sequences.reverse_complement(adapter_2) } def check_options(options, results): for key, value in tests.iteritems(): if options.get(key) == value: results[key] = True results = dict.fromkeys(tests, False) for (_, _, _, _, record) in _iterate_over_records(makefile): adapterrm_opt = record.get("Options", {}).get("AdapterRemoval", {}) check_options(adapterrm_opt, results) adapterrm_opt = makefile.get("Options", {}).get("AdapterRemoval", {}) check_options(adapterrm_opt, results) if any(results.itervalues()): print_warn("WARNING: An adapter specified for AdapterRemoval " "corresponds to the default sequence, but is reverse " "complemented. Please make sure that this is intended! ", end="") if results["--pcr2"]: print_warn("For --pcr2, the sequence given should be the " "reverse complement of the sequence observed in the " "mate 2 FASTQ file.\n") if results["--adapter2"]: print_warn("For --adapter2 (AdapterRemoval v2, only) the value " "should be exactly as observed in the FASTQ reads.\n")
# NOTE(review): fragment of a larger function whose 'def' (and the loop over
# 'cand_sample', plus the initial value of 'max_depth') lies outside this
# view -- presumably the 'MaxDepth: auto' resolution; confirm in full file.
name = cand_sample.split('.', 1)[0]
name_mapping[name] = cand_sample
name_counts[name] = name_counts.get(name, 0) + 1

# NOTE(review): 'name_mapping' values are sample-name strings (assigned
# above), so comparing against the integer 1 can never be true; this looks
# like it was meant to be 'name_counts.get(sample) == 1' -- verify upstream
# before changing.
if name_mapping.get(sample) == 1:
    # Sample name (with some extensions) found
    # This is typical if 'paleomix depths' has been run manually.
    max_depth = max_depths[name_mapping[sample]]
elif len(max_depths) == 1:
    # Just one sample in the depth histogram; even though it does not
    # match, we assume that this is the correct table. This is because
    # manually generating files / renaming files would otherwise cause
    # failure when using 'MaxDepth: auto'.
    (cand_sample, max_depth), = max_depths.items()
    print_warn(" - Name in depths file not as expected; "
               "found %r, not %r:" % (cand_sample, sample))

if max_depth is None:
    raise MakefileError("MaxDepth for %r not found in depth-histogram: %r"
                        % (sample, filename))
elif max_depth == "NA":
    # Depth table explicitly marks the value as not calculated
    raise MakefileError("MaxDepth is not calculated for sample %r; "
                        "cannot determine MaxDepth values automatically."
                        % (filename,))
elif not max_depth.isdigit():
    raise MakefileError("MaxDepth is not a valid for sample %r in %r; "
                        "expected integer, found %r."
                        % (sample, filename, max_depth))

max_depth = int(max_depth)