def _build_table_rows(args): rows = [] for filename in args.makefile: for (target, sample, library, lane, path) in _parse_makefile(filename): if isinstance(path, dict): ui.print_err("WARNING: Found pre-processed data " "at %s:%s:%s:%s; cannot collect raw " "FASTQ data." % (target, sample, library, lane)) continue row = { "sample_alias": "*", "instrument_model": "*", "library_source": "GENOMIC", "library_selection": "RANDOM", "library_strategy": "WGS", "design_description": "", "library_construction_protocol": "", "insert_size": "0", "MAKEFILE_TARGET": target, "MAKEFILE_SAMPLE": sample, "MAKEFILE_LIBRARY": library, "MAKEFILE_LANE": lane, "MAKEFILE_PATH": path } rows.append(row) return rows
def _read_sample_table(config, filename):
    """Read a tab-separated table of samples into `config.samples`.

    Blank lines and lines starting with '#' are skipped. Every other row
    must contain 2 or 3 non-empty columns: the name to be used for the
    sample in the first column, followed by the paths to either one or two
    BAM files, which must represent a single nuclear or a single
    mitochondrial alignment (2 columns), or both (3 columns).

    For each row, `config.samples[name]` is set to a dict with the keys
    "Root" (`config.destination` joined with the sample name) and "Files"
    (the list of paths found on that row).

    Returns True on success. If a row has the wrong number of columns or a
    sample name is repeated, an error is printed and None is returned.
    """
    print_info("Reading table of samples from %r" % (filename,))

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build an actual list instead of relying on filter() returning
            # one, since the result is passed to len() and sliced below.
            fields = [field
                      for field in line.rstrip('\r\n').split('\t')
                      if field]

            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i; "
                          "expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            if name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
def _build_table_rows(args): rows = [] for filename in args.makefile: for (target, sample, library, lane, path) in _parse_makefile(filename): if isinstance(path, dict): ui.print_err("WARNING: Found pre-processed data " "at %s:%s:%s:%s; cannot collect raw " "FASTQ data." % (target, sample, library, lane)) continue row = {"sample_alias": "*", "instrument_model": "*", "library_source": "GENOMIC", "library_selection": "RANDOM", "library_strategy": "WGS", "design_description": "", "library_construction_protocol": "", "insert_size": "0", "MAKEFILE_TARGET": target, "MAKEFILE_SAMPLE": sample, "MAKEFILE_LIBRARY": library, "MAKEFILE_LANE": lane, "MAKEFILE_PATH": path} rows.append(row) return rows
def main(argv):
    """Main function; takes a list of arguments but excluding sys.argv[0].

    The arguments are parsed with `parse_args` and `args.function(args)` is
    called; its return value is returned unchanged. If an ENAError is
    raised, the error is printed and 1 is returned.
    """
    args = parse_args(argv)

    try:
        return args.function(args)
    except ENAError as error:
        ui.print_err("FATAL ERROR:\n %s" % (error,))

    # Report the failure to the caller rather than falling through with
    # None, which is indistinguishable from a function returning nothing.
    return 1
def main(argv):
    """Main function; takes a list of arguments but excluding sys.argv[0].

    The arguments are parsed with `parse_args` and `args.function(args)` is
    called; its return value is returned unchanged. If an ENAError is
    raised, the error is printed and 1 is returned.
    """
    args = parse_args(argv)

    try:
        return args.function(args)
    except ENAError as error:
        ui.print_err("FATAL ERROR:\n %s" % (error,))

    # Report the failure to the caller rather than falling through with
    # None, which is indistinguishable from a function returning nothing.
    return 1
def main_wrapper(process_func, argv, ext):
    """Parse the arguments and load the optional BED regions.

    `args.regions` is set to None unless `args.regions_fpath` is set, in
    which case it is set to the result of `collect_bed_regions`. If that
    call raises ValueError, the error is printed and 1 is returned.
    """
    # NOTE(review): `process_func` is not used by the statements below;
    # confirm how it is used against the complete function.
    args = parse_arguments(argv, ext)
    args.regions = None
    if args.regions_fpath:
        try:
            args.regions = collect_bed_regions(args.regions_fpath)
        except ValueError as error:
            print_err("ERROR: Failed to parse BED file %r:\n%s"
                      % (args.regions_fpath, error))
            return 1
def main(argv):
    """Open the database and the BAM file named by the arguments.

    Returns 1, after printing the error, if the BAM file cannot be opened
    (IOError or ValueError raised by `pysam.Samfile`).
    """
    args = parse_args(argv)
    data = database.ZonkeyDB(args.database)
    sequences = data.mitochondria

    try:
        handle = pysam.Samfile(args.bam)
    except (IOError, ValueError) as error:
        ui.print_err("Error reading BAM file: %s" % (error,))
        return 1
def parse_run_config(config, args):
    """Validate the positional arguments and open the database.

    Exactly 2 to 4 arguments are accepted; otherwise the usage text is
    printed and None is returned. The first argument is stored as
    `config.tablefile` and opened as a `database.ZonkeyDB`, which is stored
    as `config.database`. If opening raises ZonkeyDBError, the error is
    printed and None is returned.
    """
    if not (2 <= len(args) <= 4):
        print_usage()
        return

    config.multisample = False
    config.tablefile = args[0]

    try:
        config.database = database.ZonkeyDB(config.tablefile)
    except database.ZonkeyDBError as error:
        print_err("ERROR reading database %r: %s"
                  % (config.tablefile, error))
        return
def __init__(self, config, prefix, samples, features, target):
    """Set up the BAMs and nodes for one prefix across the given samples.

    `prefix` is a mapping read for "Name", "Label", "RegionsOfInterest",
    "IndexFormat" and "Path"; `features` is a mapping read for "RawBAM"
    and "RealignedBAM". The BAMs of all samples are merged and passed to
    the data-duplication check and to the raw / realigned BAM builders.
    If neither kind of BAM is built, `self.bams` falls back to the merged
    BAMs of the samples.
    """
    self.name = prefix["Name"]
    self.label = prefix.get("Label") or self.name
    self.roi = prefix.get("RegionsOfInterest", {})
    self.samples = safe_coerce_to_tuple(samples)
    self.folder = config.destination
    self.target = target

    files_and_nodes = {}
    for sample in self.samples:
        files_and_nodes.update(sample.bams)

    self.datadup_check = self._build_dataduplication_node(
        prefix, files_and_nodes)

    build_raw_bam = features["RawBAM"]
    build_realigned_bam = features["RealignedBAM"]
    if build_realigned_bam and prefix['IndexFormat'] == '.csi':
        # Only warn once per reference path
        if prefix['Path'] not in _CSI_WARNINGS:
            ui.print_err("\nWARNING: Realigned BAMs enabled for reference "
                         "genome %r, but the file contains sequences too "
                         "large for GATK, which does not support .csi "
                         "index files. Raw BAMs will be built instead of "
                         "realigned BAMs, for this reference sequence."
                         % (prefix['Path'],))

            # TODO: Add reference to FAQ when written.

            _CSI_WARNINGS.add(prefix['Path'])

        build_realigned_bam = False
        build_raw_bam = True

    self.bams = {}
    if build_raw_bam:
        self.bams.update(
            self._build_raw_bam(config, prefix, files_and_nodes))
    if build_realigned_bam:
        self.bams.update(
            self._build_realigned_bam(config, prefix, files_and_nodes))

    if not self.bams:
        for sample in self.samples:
            self.bams.update(sample.bams)

    nodes = [self.datadup_check]
    for sample in self.samples:
        nodes.extend(sample.nodes)
    self.nodes = tuple(nodes)
def validate_bam(self, filename):
    """Validates a sample BAM file, checking that it is either a valid
    mitochondrial BAM (aligned against one of the reference mt sequences),
    or that it is a valid nuclear BAM (aligned against the reference).

    If the file cannot be opened (ValueError or IOError raised by
    `pysam.Samfile`), the error is printed and None is returned.

    NOTE(review): this docstring previously stated that one of
    INVALID_BAMFILE, NUC_BAMFILE, and MITO_BAMFILE is returned; confirm
    against the complete function and its callers.
    """
    print_info(" - Validating BAM file %r ... " % (filename,))

    try:
        handle = pysam.Samfile(filename)
    except (ValueError, IOError) as error:
        print_err("Error reading BAM: %s" % (error,))
        return
def validate_bam(self, filename):
    """Validates a sample BAM file, checking that it is either a valid
    mitochondrial BAM (aligned against one of the reference mt sequences),
    or that it is a valid nuclear BAM (aligned against the reference).

    If the file cannot be opened (ValueError or IOError raised by
    `pysam.Samfile`), the error is printed and None is returned.

    NOTE(review): this docstring previously stated that one of
    INVALID_BAMFILE, NUC_BAMFILE, and MITO_BAMFILE is returned; confirm
    against the complete function and its callers.
    """
    print_info(" - Validating BAM file %r ... " % (filename,))

    try:
        handle = pysam.Samfile(filename)
    except (ValueError, IOError) as error:
        print_err("Error reading BAM: %s" % (error,))
        return
def __init__(self, config, prefix, samples, features, target):
    """Set up the BAMs and nodes for one prefix across the given samples.

    `prefix` is a mapping read for "Name", "Label", "RegionsOfInterest",
    "IndexFormat" and "Path"; `features` is a mapping read for "RawBAM"
    and "RealignedBAM". The BAMs of all samples are merged and passed to
    the data-duplication check and to the raw / realigned BAM builders.
    If neither kind of BAM is built, `self.bams` falls back to the merged
    BAMs of the samples.
    """
    self.name = prefix["Name"]
    self.label = prefix.get("Label") or self.name
    self.roi = prefix.get("RegionsOfInterest", {})
    self.samples = safe_coerce_to_tuple(samples)
    self.folder = config.destination
    self.target = target

    files_and_nodes = {}
    for sample in self.samples:
        files_and_nodes.update(sample.bams)

    self.datadup_check = self._build_dataduplication_node(
        prefix, files_and_nodes)

    build_raw_bam = features["RawBAM"]
    build_realigned_bam = features["RealignedBAM"]
    if build_realigned_bam and prefix['IndexFormat'] == '.csi':
        # Only warn once per reference path
        if prefix['Path'] not in _CSI_WARNINGS:
            ui.print_err("\nWARNING: Realigned BAMs enabled for reference "
                         "genome %r, but the file contains sequences too "
                         "large for GATK, which does not support .csi "
                         "index files. Raw BAMs will be built instead of "
                         "realigned BAMs, for this reference sequence."
                         % (prefix['Path'],))

            # TODO: Add reference to FAQ when written.

            _CSI_WARNINGS.add(prefix['Path'])

        build_realigned_bam = False
        build_raw_bam = True

    self.bams = {}
    if build_raw_bam:
        self.bams.update(
            self._build_raw_bam(config, prefix, files_and_nodes))
    if build_realigned_bam:
        self.bams.update(
            self._build_realigned_bam(config, prefix, files_and_nodes))

    if not self.bams:
        for sample in self.samples:
            self.bams.update(sample.bams)

    nodes = [self.datadup_check]
    for sample in self.samples:
        nodes.extend(sample.nodes)
    self.nodes = tuple(nodes)
def _validate_mito_bam(data, handle, info): if data.mitochondria is None: # No mitochondrial data .. skip phylogeny return True references = handle.references min_length = min( (len(record.sequence)) for record in data.mitochondria.itervalues()) for bam_contig, bam_length in zip(references, handle.lengths): if bam_contig not in data.mitochondria: continue db_sequence = data.mitochondria[bam_contig].sequence db_length = len(db_sequence) - db_sequence.count("-") if bam_length != db_length: print_err("ERROR: Length of mitochondrial contig %r (%i bp) " "does not match the length of the corresponding " "sequence in the database (%i bp)" % (bam_contig, bam_length, db_length)) return False if not os.path.exists(handle.filename + '.bai') \ and not os.path.exists(swap_ext(handle.filename, '.bai')): print_info(' - Attempting to index BAM file %r!' % (handle.filename, )) pysam.index(handle.filename) # Workaround for pysam < 0.9 returning list, >= 0.9 returning str for line in "".join(pysam.idxstats(handle.filename)).split('\n'): line = line.strip() if not line: continue name, _, hits, _ = line.split('\t') if (name == bam_contig) and not int(hits): print_err("WARNING: Mitochondrial BAM (%r) does not contain " "any reads aligned to contig %r; inferring an " "phylogeny is not possible." % (handle.filename, name)) return True info.mt_contig = bam_contig info.mt_length = bam_length info.mt_padding = len(db_sequence) - min_length return True return True
def _validate_mito_bam(data, handle, info): if data.mitochondria is None: # No mitochondrial data .. skip phylogeny return True references = handle.references min_length = min((len(record.sequence)) for record in data.mitochondria.itervalues()) for bam_contig, bam_length in zip(references, handle.lengths): if bam_contig not in data.mitochondria: continue db_sequence = data.mitochondria[bam_contig].sequence db_length = len(db_sequence) - db_sequence.count("-") if bam_length != db_length: print_err("ERROR: Length of mitochondrial contig %r (%i bp) " "does not match the length of the corresponding " "sequence in the database (%i bp)" % (bam_contig, bam_length, db_length)) return False if not os.path.exists(handle.filename + '.bai') \ and not os.path.exists(swap_ext(handle.filename, '.bai')): print_info(' - Attempting to index BAM file %r!' % (handle.filename,)) pysam.index(handle.filename) # Workaround for pysam < 0.9 returning list, >= 0.9 returning str for line in "".join(pysam.idxstats(handle.filename)).split('\n'): line = line.strip() if not line: continue name, _, hits, _ = line.split('\t') if (name == bam_contig) and not int(hits): print_err("WARNING: Mitochondrial BAM (%r) does not contain " "any reads aligned to contig %r; inferring an " "phylogeny is not possible." % (handle.filename, name)) return True info.mt_contig = bam_contig info.mt_length = bam_length info.mt_padding = len(db_sequence) - min_length return True return True
def parse_config(argv):
    """Parse the command line and dispatch to `parse_run_config`.

    The first positional argument is looked up in `_CMD_ALIASES` and stored
    as `config.command`; "dryrun" is rewritten to "run" with
    `config.dry_run` set to True. The remaining positional arguments are
    handed to `parse_run_config`, whose result is returned.

    Returns None, after printing the usage text or an error, if no command
    was given or the command is unknown.
    """
    config, positional = _parse_arguments(argv)
    if not positional:
        print_usage()
        return None

    requested = positional[0]
    config.command = _CMD_ALIASES.get(requested)

    if config.command is None:
        print_err("ERROR: Unknown command %r" % (requested,))
        return None

    if config.command == "dryrun":
        config.command = "run"
        config.dry_run = True

    remaining = positional[1:]
    return parse_run_config(config, remaining)
def parse_config(argv):
    """Parse the command line and dispatch to `parse_run_config`.

    The first positional argument is looked up in `_CMD_ALIASES` and stored
    as `config.command`; "dryrun" is rewritten to "run" with
    `config.dry_run` set to True. The remaining positional arguments are
    handed to `parse_run_config`, whose result is returned.

    Returns None, after printing the usage text or an error, if no command
    was given or the command is unknown.
    """
    config, positional = _parse_arguments(argv)
    if not positional:
        print_usage()
        return None

    requested = positional[0]
    config.command = _CMD_ALIASES.get(requested)

    if config.command is None:
        print_err("ERROR: Unknown command %r" % (requested,))
        return None

    if config.command == "dryrun":
        config.command = "run"
        config.dry_run = True

    remaining = positional[1:]
    return parse_run_config(config, remaining)
def _validate_nuclear_bam(data, handle, info):
    """Check the BAM contigs against the nuclear contigs in the database.

    BAM contig names are converted with `contig_name_to_plink_name` before
    being compared with the names in `data.contigs`.

    Returns False if a contig present in both has a different length in
    the BAM than the "Size" recorded in the database, or if only some of
    the database contigs are present in the BAM. Otherwise returns True,
    setting `info.nuclear` to True when every database contig was found.
    A BAM containing none of the database contigs is accepted, with `info`
    left untouched.
    """
    # Check that chromosomes are of expected size; unused chroms are ignored.
    bam_contigs = dict(zip(map(contig_name_to_plink_name, handle.references),
                           handle.lengths))
    ref_contigs = data.contigs

    contigs_found = {}
    for name, stats in sorted(ref_contigs.items()):
        if name not in bam_contigs:
            contigs_found[name] = False
        elif bam_contigs[name] != stats["Size"]:
            # The database size is the expected value; the BAM header
            # length is what was actually found.
            print_err("\nERROR: Chrom %r in the BAM does not match the "
                      "length specified in data file:\n"
                      " - Expected: %i\n"
                      " - Found: %i"
                      % (name, stats["Size"], bam_contigs[name]))
            return False
        else:
            contigs_found[name] = True

    if any(contigs_found.values()):
        if not all(contigs_found.values()):
            print_err("\nERROR: Not all nuclear chromosomes found in BAM:")
            for name in sorted(ref_contigs):
                is_found = "Found" if contigs_found[name] else "Not found!"
                print_err(" - %s: %s" % (name, is_found))
            return False
        else:
            info.nuclear = True

    return True
def _validate_nuclear_bam(data, handle, info):
    """Check the BAM contigs against the nuclear contigs in the database.

    BAM contig names are converted with `contig_name_to_plink_name` before
    being compared with the names in `data.contigs`.

    Returns False if a contig present in both has a different length in
    the BAM than the "Size" recorded in the database, or if only some of
    the database contigs are present in the BAM. Otherwise returns True,
    setting `info.nuclear` to True when every database contig was found.
    A BAM containing none of the database contigs is accepted, with `info`
    left untouched.
    """
    # Check that chromosomes are of expected size; unused chroms are ignored.
    bam_contigs = dict(zip(map(contig_name_to_plink_name, handle.references),
                           handle.lengths))
    ref_contigs = data.contigs

    contigs_found = {}
    for name, stats in sorted(ref_contigs.items()):
        if name not in bam_contigs:
            contigs_found[name] = False
        elif bam_contigs[name] != stats["Size"]:
            # The database size is the expected value; the BAM header
            # length is what was actually found.
            print_err("\nERROR: Chrom %r in the BAM does not match the "
                      "length specified in data file:\n"
                      " - Expected: %i\n"
                      " - Found: %i"
                      % (name, stats["Size"], bam_contigs[name]))
            return False
        else:
            contigs_found[name] = True

    if any(contigs_found.values()):
        if not all(contigs_found.values()):
            print_err("\nERROR: Not all nuclear chromosomes found in BAM:")
            for name in sorted(ref_contigs):
                is_found = "Found" if contigs_found[name] else "Not found!"
                print_err(" - %s: %s" % (name, is_found))
            return False
        else:
            info.nuclear = True

    return True
def _read_sample_table(config, filename):
    """Read a tab-separated table of samples into `config.samples`.

    Blank lines and lines starting with '#' are skipped. Every other row
    must contain 2 or 3 non-empty columns: the name to be used for the
    sample in the first column, followed by the paths to either one or two
    BAM files, which must represent a single nuclear or a single
    mitochondrial alignment (2 columns), or both (3 columns). Sample names
    may only contain ASCII letters, digits, and the characters '.', '-'
    and '_', and must be unique.

    For each row, `config.samples[name]` is set to a dict with the keys
    "Root" (`config.destination` joined with the sample name) and "Files"
    (the list of paths found on that row).

    Returns True on success. If a row is malformed, an error is printed
    and None is returned.
    """
    print_info("Reading table of samples from %r" % (filename,))

    # ascii_letters rather than letters: the latter depends on the locale
    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build an actual list instead of relying on filter() returning
            # one, since the result is passed to len() and sliced below.
            stripped = (field.strip() for field in line.split('\t'))
            fields = [field for field in stripped if field]

            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sorted so that the message does not depend on set order
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r "
                          % (filename, linenum,
                             "".join(sorted(invalid_letters)), name))
                return
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {
                "Root": os.path.join(config.destination, name),
                "Files": fields[1:],
            }

    return True
def _read_sample_table(config, filename):
    """Read a tab-separated table of samples into `config.samples`.

    Blank lines and lines starting with '#' are skipped. Every other row
    must contain 2 or 3 non-empty columns: the name to be used for the
    sample in the first column, followed by the paths to either one or two
    BAM files, which must represent a single nuclear or a single
    mitochondrial alignment (2 columns), or both (3 columns). Sample names
    may only contain ASCII letters, digits, and the characters '.', '-'
    and '_', and must be unique.

    For each row, `config.samples[name]` is set to a dict with the keys
    "Root" (`config.destination` joined with the sample name) and "Files"
    (the list of paths found on that row).

    Returns True on success. If a row is malformed, an error is printed
    and None is returned.
    """
    print_info("Reading table of samples from %r" % (filename,))

    # ascii_letters rather than letters: the latter depends on the locale
    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build an actual list instead of relying on filter() returning
            # one, since the result is passed to len() and sliced below.
            stripped = (field.strip() for field in line.split('\t'))
            fields = [field for field in stripped if field]

            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sorted so that the message does not depend on set order
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r "
                          % (filename, linenum,
                             "".join(sorted(invalid_letters)), name))
                return
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {
                "Root": os.path.join(config.destination, name),
                "Files": fields[1:],
            }

    return True
args = parse_args(argv) data = database.ZonkeyDB(args.database) sequences = data.mitochondria try: handle = pysam.Samfile(args.bam) except (IOError, ValueError), error: ui.print_err("Error reading BAM file: %s" % (error,)) return 1 with handle: bam_info = data.validate_bam_handle(handle) if bam_info is None: return 1 elif not bam_info.is_mitochondrial: ui.print_err("ERROR: BAM does not contain any known mitochondrial " "sequence found in BAM ..") return 1 reference = sequences[bam_info.mt_contig] stats, majority = majority_sequence(handle, padding=bam_info.mt_padding, contig_name=bam_info.mt_contig, contig_length=bam_info.mt_length) sequences["Sample"] = FASTA(name="Sample", meta=None, sequence=align_majority(reference.sequence, majority)) # Truncate all sequences to match the (now) unpadded sample sequence sequences = truncate_sequences(sequences, "Sample")
def _process_samples(config): for name, info in sorted(config.samples.items()): files = {} if name == "-": print_info("Validating unnamed sample ...") else: print_info("Validating sample %r ..." % (name, )) for filename in info.pop("Files"): filetype = config.database.validate_bam(filename) if not filetype: print_err("ERROR: File is not a valid BAM file: %r" % (filename, )) return False if filetype.is_nuclear and filetype.is_mitochondrial: if "Nuc" in files: print_err("ERROR: Two nuclear BAMs specified!") return False elif "Mito" in files: print_err("WARNING: Nuclear + mitochondrial BAM, and " "mitochondrial BAM specified; the mitochondrial " "genome in the first BAM will not be used!") files["Nuc"] = filename files.setdefault("Mito", filename) elif filetype.is_nuclear: if "Nuc" in files: print_err("ERROR: Two nuclear BAMs specified!") return False files["Nuc"] = filename elif filetype.is_mitochondrial: if "Mito" in files: print_err("ERROR: Two nuclear BAMs specified!") return False files["Mito"] = filename else: print_err("ERROR: BAM does not contain usable nuclear " "or mitochondrial contigs: %r" % (filename, )) return False config.samples[name]["Files"] = files return True
config.multisample = False config.tablefile = args[0] try: config.database = database.ZonkeyDB(config.tablefile) except database.ZonkeyDBError, error: print_err("ERROR reading database %r: %s" % (config.tablefile, error)) return known_samples = set(config.database.samples) | set(("Sample", )) unknown_samples = set(config.treemix_outgroup) - known_samples if unknown_samples: print_err("ERROR: Argument --treemix-outgroup includes unknown " "sample(s): %s; known samples are %s. Note that " "names are case-sensitive." % (", ".join(map(repr, sorted(unknown_samples))), ", ".join( map(repr, sorted(known_samples))))) return if config.command in ("mito", "example"): if len(args) != 2: print_err("ERROR: Wrong number of arguments!") print_usage() return config.destination = args[1] config.samples = {} elif len(args) == 2: filename = args[1] config.destination = fileutils.swap_ext(filename, ".zonkey")
try: args.regions = collect_bed_regions(args.regions_fpath) except ValueError, error: print_err("ERROR: Failed to parse BED file %r:\n%s" % (args.regions_fpath, error)) return 1 print_msg("Opening %r" % (args.infile, )) with pysam.Samfile(args.infile) as handle: sort_order = handle.header.get('HD', {}).get('SO') if sort_order is None: print_warn("WARNING: BAM file %r is not marked as sorted!" % (args.infile, )) elif sort_order != 'coordinate': print_err("ERROR: BAM file %r is %s-sorted, but only " "coordinate-sorted BAMs are supported!" % (args.infile, sort_order)) return 1 sort_bed_by_bamfile(handle, args.regions) return process_func(handle, args) def _get_readgroup(record): try: return record.get_tag("RG") except KeyError: return None def _get_readgroup_ignored(_):
try: args.regions = collect_bed_regions(args.regions_fpath) except ValueError, error: print_err("ERROR: Failed to parse BED file %r:\n%s" % (args.regions_fpath, error)) return 1 print_msg("Opening %r" % (args.infile,)) with pysam.Samfile(args.infile) as handle: sort_order = handle.header.get('HD', {}).get('SO') if sort_order is None: print_warn("WARNING: BAM file %r is not marked as sorted!" % (args.infile,)) elif sort_order != 'coordinate': print_err("ERROR: BAM file %r is %s-sorted, but only " "coordinate-sorted BAMs are supported!" % (args.infile, sort_order)) return 1 sort_bed_by_bamfile(handle, args.regions) return process_func(handle, args) def _get_readgroup(record): try: return record.get_tag("RG") except KeyError: return None def _get_readgroup_ignored(_):
def _process_samples(config): for name, info in sorted(config.samples.items()): files = {} if name == "-": print_info("Validating unnamed sample ...") else: print_info("Validating sample %r ..." % (name,)) for filename in info.pop("Files"): filetype = config.database.validate_bam(filename) if not filetype: print_err("ERROR: File is not a valid BAM file: %r" % (filename,)) return False if filetype.is_nuclear and filetype.is_mitochondrial: if "Nuc" in files: print_err("ERROR: Two nuclear BAMs specified!") return False elif "Mito" in files: print_err("WARNING: Nuclear + mitochondrial BAM, and " "mitochondrial BAM specified; the mitochondrial " "genome in the first BAM will not be used!") files["Nuc"] = filename files.setdefault("Mito", filename) elif filetype.is_nuclear: if "Nuc" in files: print_err("ERROR: Two nuclear BAMs specified!") return False files["Nuc"] = filename elif filetype.is_mitochondrial: if "Mito" in files: print_err("ERROR: Two nuclear BAMs specified!") return False files["Mito"] = filename else: print_err("ERROR: BAM does not contain usable nuclear " "or mitochondrial contigs: %r" % (filename,)) return False config.samples[name]["Files"] = files return True
config.multisample = False config.tablefile = args[0] try: config.database = database.ZonkeyDB(config.tablefile) except database.ZonkeyDBError, error: print_err("ERROR reading database %r: %s" % (config.tablefile, error)) return known_samples = set(config.database.samples) | set(("Sample",)) unknown_samples = set(config.treemix_outgroup) - known_samples if unknown_samples: print_err("ERROR: Argument --treemix-outgroup includes unknown " "sample(s): %s; known samples are %s. Note that " "names are case-sensitive." % (", ".join(map(repr, sorted(unknown_samples))), ", ".join(map(repr, sorted(known_samples))))) return if config.command in ("mito", "example"): if len(args) != 2: print_err("ERROR: Wrong number of arguments!") print_usage() return config.destination = args[1] config.samples = {} elif len(args) == 2: filename = args[1] config.destination = fileutils.swap_ext(filename, ".zonkey")