def _read_sample_table(config, filename):
    """Parse a tab-separated table of samples into 'config.samples'.

    Each row that is neither blank nor a comment (starting with '#') must
    contain 2 or 3 columns: a unique sample name in the first column,
    followed by the paths to either one or two BAM files, which must
    represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns). Empty fields within a row are dropped
    before the columns are counted.

    'config.samples' is replaced with a dict mapping each sample name to a
    dict holding its "Root" (the name joined onto 'config.destination') and
    its "Files" (the remaining columns).

    Returns True on success. If a row has the wrong number of columns, or a
    sample name is repeated, an error is printed and None is returned; rows
    read before the error remain in 'config.samples'.
    """
    print_info("Reading table of samples from %r" % (filename,))

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build a real list; filter() returns a lazy iterator on Python 3
            # and would make the len() calls below raise a TypeError.
            fields = [
                field for field in line.rstrip('\r\n').split('\t') if field
            ]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i; "
                          "expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            if name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
def main(argv):
    """Convert a GTF file into one BED file per source and feature.

    The command-line arguments in 'argv' are parsed with 'parse_arguments'.
    Scaffold information is loaded only if 'args.scaffolds' is set. For each
    source returned by 'read_gtf', a table of features is built (the coding
    table for sources whose name starts with "protein", the non-coding table
    otherwise), and each feature is written to
    '<output_prefix>.<source>.<feature>.bed'.

    Always returns 0.
    """
    args = parse_arguments(argv)

    if args.scaffolds:
        print("Reading scaffolds information from %r" % (args.scaffolds, ))
        scaffolds = read_scaffolds(args.scaffolds)
    else:
        scaffolds = {}

    with open_ro(args.infile, "rb") as gtf_handle:
        print("Reading GTF from %r" % (args.infile, ))
        tables_by_source = read_gtf(gtf_handle, scaffolds, args.contig_prefix)

    for source, records in tables_by_source.items():
        print("Writing tables for '%s'" % source)

        # Only the builder differs between coding and non-coding sources
        build_table = (
            build_coding_seqs_table
            if source.startswith("protein")
            else build_noncoding_seqs_table
        )
        features = build_table(args, records)

        for feature in features:
            destination = "%s.%s.%s.bed" % (args.output_prefix, source, feature)
            print("\tWriting %ss to '%s'" % (feature, destination))
            write_bed(features[feature], destination)

    return 0
def main(argv):
    """Convert a GTF file into one BED file per source and feature.

    The command-line arguments in 'argv' are parsed with 'parse_arguments'.
    Scaffold information is loaded only if 'args.scaffolds' is set. For each
    source returned by 'read_gtf', a table of features is built (the coding
    table for sources whose name starts with "protein", the non-coding table
    otherwise), and each feature is written to
    '<output_prefix>.<source>.<feature>.bed'.

    Always returns 0.
    """
    args = parse_arguments(argv)

    scaffolds = {}
    if args.scaffolds:
        print("Reading scaffolds information from %r" % (args.scaffolds,))
        scaffolds = read_scaffolds(args.scaffolds)

    with open_ro(args.infile) as gtf_file:
        print("Reading GTF from %r" % (args.infile,))
        src_table = read_gtf(gtf_file, scaffolds, args.contig_prefix)

    # .items() exists on both Python 2 and 3, unlike the removed .iteritems()
    for (source, table) in src_table.items():
        print("Writing tables for '%s' ..." % source)

        if source.startswith("protein"):
            features = build_coding_seqs_table(args, table)
        else:
            features = build_noncoding_seqs_table(args, table)

        for feature in features:
            fpath = "%s.%s.%s.bed" % (args.output_prefix, source, feature)

            print("\tWriting %ss to '%s' ..." % (feature, fpath, ))
            write_bed(features[feature], fpath)

    return 0
def read_bed_file(filename, min_columns=3, contigs=None):
    """Parse a (gzip/bzip2 compressed) BED file, yielding one record per line.

    Comments (lines starting with '#') and empty lines are skipped.

    filename    -- path of the BED file; opened with 'open_ro'.
    min_columns -- minimum number of columns every record must contain;
                   must be at least 3.
    contigs     -- optional dictionary of {contig: length}. If supplied,
                   every record must name a known contig and must satisfy
                   0 <= start < end <= length. If omitted, only
                   0 <= start < end is required.

    Raises ValueError if 'min_columns' is less than 3, and BEDError if a
    line cannot be parsed, has fewer than 'min_columns' columns, names an
    unknown contig, or has invalid coordinates. As this is a generator,
    these errors are raised once iteration starts.
    """
    if min_columns < 3:
        raise ValueError("'min_columns' must be >= 3 in 'read_bed_file'")

    infinite = float("inf")
    with open_ro(filename) as handle:
        # Count from 1 so that every error message reports the same,
        # human-readable line number.
        for (line_num, line) in enumerate(handle, start=1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = BEDRecord(line)
            except ValueError as error:
                raise BEDError("Error parsing line %i in regions file:\n"
                               " Path = %r\n Line = %r\n\n%s"
                               % (line_num, filename, line, error))

            if len(bed) < min_columns:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise BEDError("Region at line #%i (%s) does not "
                               "contain the expected number of fields; "
                               "the first %i fields are required. C.f. "
                               "defination at\n %s\n\nPath = %r"
                               % (line_num, name, min_columns,
                                  url, filename))

            if contigs is None:
                contig_len = infinite
            else:
                contig_len = contigs.get(bed.contig)

            if contig_len is None:
                raise BEDError("Regions file contains contig not found "
                               "in reference:\n Path = %r\n Contig = "
                               "%r\n\nPlease ensure that all contig "
                               "names match the reference names!"
                               % (filename, bed.contig))
            elif not (0 <= bed.start < bed.end <= contig_len):
                # The upper bound is formatted with %s since it is infinity
                # when no contigs were supplied, which %i cannot format.
                raise BEDError(
                    "Regions file contains invalid region:\n"
                    " Path = %r\n Contig = %r\n"
                    " Start = %s\n End = %s\n\n"
                    "Expected 0 <= Start < End <= %s!"
                    % (filename, bed.contig, bed.start, bed.end, contig_len))

            yield bed
def test_open_ro__bz2():
    # Reading a handle opened on the bzip2 fixture must give the plain
    # FASTA content as bytes.
    expected = (
        b'>This_is_BZ_FASTA!\nCGTNA\n'
        b'>This_is_ALSO_BZ_FASTA!\nACGTN\n'
    )

    bz2_handle = open_ro(test_file('fasta_file.fasta.bz2'))
    try:
        observed = bz2_handle.read()
        assert_equal(observed, expected)
    finally:
        # Close the handle even if the assertion fails
        bz2_handle.close()
def test_open_ro__gz():
    # Reading a handle opened on the gzip fixture must give the plain
    # FASTA content as bytes.
    expected = (
        b'>This_is_GZipped_FASTA!\nACGTN\n'
        b'>This_is_ALSO_GZipped_FASTA!\nCGTNA\n'
    )

    gz_handle = open_ro(test_file('fasta_file.fasta.gz'))
    try:
        observed = gz_handle.read()
        assert_equal(observed, expected)
    finally:
        # Close the handle even if the assertion fails
        gz_handle.close()
def _parse_freq_table(cls, filename):
    """Yield (chrom, snp, clst, mac, nchroms) tuples from a frequency table.

    The first line of the file is treated as a header and discarded. Every
    following line must split on whitespace into exactly 8 columns; the 4th
    through 6th columns are ignored and the final two are converted to int.
    """
    with fileutils.open_ro(filename) as table:
        table.readline()  # Discard the header line

        for row in table:
            columns = row.split()
            # Unpacking enforces that the row has exactly 8 columns
            chrom, snp, clst, _, _, _, mac_text, nchroms_text = columns

            yield (chrom, snp, clst, int(mac_text), int(nchroms_text))
def test_open_ro__gz():
    # The gzip fixture is expected to read back as two plain FASTA records.
    first_record = b'>This_is_GZipped_FASTA!\nACGTN\n'
    second_record = b'>This_is_ALSO_GZipped_FASTA!\nCGTNA\n'

    stream = open_ro(test_file('fasta_file.fasta.gz'))
    try:
        content = stream.read()
        assert_equal(content, first_record + second_record)
    finally:
        # Release the handle regardless of the outcome of the comparison
        stream.close()
def from_file(cls, filename):
    """Read an MSA from the specified filename.

    The file may be uncompressed, gzipped or bzipped. See also
    'MSA.from_lines'.

    Returns the result of 'MSA.from_lines' for the opened file. If that
    call raises an MSAError, a new MSAError is raised with the filename
    appended to the original message.
    """
    fasta_file = open_ro(filename)
    try:
        return MSA.from_lines(fasta_file)
    except MSAError as error:
        raise MSAError("%s in file %r" % (error, filename))
    finally:
        # Close the handle on both success and failure, rather than
        # leaving it open until it is garbage collected.
        fasta_file.close()
def test_open_ro__bz2():
    # The bzip2 fixture is expected to read back as two plain FASTA records.
    first_record = b'>This_is_BZ_FASTA!\nCGTNA\n'
    second_record = b'>This_is_ALSO_BZ_FASTA!\nACGTN\n'

    stream = open_ro(test_file('fasta_file.fasta.bz2'))
    try:
        content = stream.read()
        assert_equal(content, first_record + second_record)
    finally:
        # Release the handle regardless of the outcome of the comparison
        stream.close()
def from_file(cls, filename):
    """Lazily read the entries of an unindexed FASTA file.

    The file, which may be GZIP/BZ2 compressed, is opened with 'open_ro'
    and every record produced by 'FASTA.from_lines' is yielded in turn.
    The file is closed once the generator is exhausted, closed, or fails.
    """
    handle = open_ro(filename)
    try:
        entries = FASTA.from_lines(handle)
        for entry in entries:
            yield entry
    finally:
        handle.close()
def _read_sample_table(config, filename):
    """Parse a tab-separated table of samples into 'config.samples'.

    Each row that is neither blank nor a comment (starting with '#') must
    contain 2 or 3 columns: a unique sample name in the first column,
    followed by the paths to either one or two BAM files, which must
    represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns). Fields are stripped of surrounding
    whitespace and empty fields are dropped before the columns are counted.

    Sample names may only contain ASCII letters, digits, '.', '-' and '_'.

    'config.samples' is replaced with a dict mapping each sample name to a
    dict holding its "Root" (the name joined onto 'config.destination') and
    its "Files" (the remaining columns).

    Returns True on success. On an invalid row an error is logged and None
    is returned; rows read before the error remain in 'config.samples'.
    """
    log = logging.getLogger(__name__)
    log.info("Reading table of samples from %r", filename)

    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            fields = [_f for _f in map(str.strip, line.split("\t")) if _f]
            if len(fields) not in (2, 3):
                log.error(
                    "Error reading sample table (%r) at line %i: Expected 2 or 3 "
                    "columns, found %i; please correct file before continuing.",
                    filename,
                    linenum,
                    len(fields),
                )
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sets have no defined order; sort the characters so that the
                # same input always produces the same message.
                log.error(
                    "Error reading sample table (%r) at line %i: Sample name contains "
                    "illegal character(s). Only letters, numbers, and '-', '_', and "
                    "'.' are allowed, but found %r in name %r ",
                    filename,
                    linenum,
                    "".join(sorted(invalid_letters)),
                    name,
                )
                return
            elif name in samples:
                log.error(
                    "Duplicate name %r in sample table; names must be unique!", name)
                return

            samples[name] = {
                "Root": os.path.join(config.destination, name),
                "Files": fields[1:],
            }

    return True
def _read_sample_table(config, filename):
    """Parse a tab-separated table of samples into 'config.samples'.

    Each row that is neither blank nor a comment (starting with '#') must
    contain 2 or 3 columns: a unique sample name in the first column,
    followed by the paths to either one or two BAM files, which must
    represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns). Fields are stripped of surrounding
    whitespace and empty fields are dropped before the columns are counted.

    Sample names may only contain ASCII letters, digits, '.', '-' and '_'.

    'config.samples' is replaced with a dict mapping each sample name to a
    dict holding its "Root" (the name joined onto 'config.destination') and
    its "Files" (the remaining columns).

    Returns True on success. On an invalid row an error is printed and None
    is returned; rows read before the error remain in 'config.samples'.
    """
    print_info("Reading table of samples from %r" % (filename, ))

    # string.ascii_letters exists on both Python 2 and 3 and, unlike
    # string.letters, does not vary with the current locale.
    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build a real list; filter() returns a lazy iterator on Python 3
            # and would make the len() calls below raise a TypeError.
            fields = [
                field for field in map(str.strip, line.split('\t')) if field
            ]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sets have no defined order; sort the characters so that the
                # same input always produces the same message.
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r "
                          % (filename, linenum,
                             "".join(sorted(invalid_letters)), name))
                return
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {
                "Root": os.path.join(config.destination, name),
                "Files": fields[1:]
            }

    return True
def _read_sample_table(config, filename):
    """Parse a tab-separated table of samples into 'config.samples'.

    Each row that is neither blank nor a comment (starting with '#') must
    contain 2 or 3 columns: a unique sample name in the first column,
    followed by the paths to either one or two BAM files, which must
    represent a single nuclear or a single mitochondrial alignment
    (2 columns), or both (3 columns). Fields are stripped of surrounding
    whitespace and empty fields are dropped before the columns are counted.

    Sample names may only contain ASCII letters, digits, '.', '-' and '_'.

    'config.samples' is replaced with a dict mapping each sample name to a
    dict holding its "Root" (the name joined onto 'config.destination') and
    its "Files" (the remaining columns).

    Returns True on success. On an invalid row an error is printed and None
    is returned; rows read before the error remain in 'config.samples'.
    """
    print_info("Reading table of samples from %r" % (filename,))

    # string.ascii_letters exists on both Python 2 and 3 and, unlike
    # string.letters, does not vary with the current locale.
    valid_characters = frozenset(string.ascii_letters + string.digits + ".-_")

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            if not line.strip() or line.lstrip().startswith("#"):
                continue

            # Build a real list; filter() returns a lazy iterator on Python 3
            # and would make the len() calls below raise a TypeError.
            fields = [
                field for field in map(str.strip, line.split('\t')) if field
            ]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i: "
                          "Expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            invalid_letters = frozenset(name) - valid_characters
            if invalid_letters:
                # Sets have no defined order; sort the characters so that the
                # same input always produces the same message.
                print_err("Error reading sample table (%r) at line %i: "
                          "Sample name contains illegal character(s). Only "
                          "letters, numbers, and '-', '_', and '.' are "
                          "allowed, but found %r in name %r "
                          % (filename, linenum,
                             "".join(sorted(invalid_letters)), name))
                return
            elif name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
def _read_files(args):
    """Yield the parsed VCF records of every file in 'args.filenames'.

    Files are read in the order given. Lines starting with '#' are printed
    to standard output only until the first record has been seen, so '#'
    lines following any record (including those of later files) are dropped.
    Immediately before the first such line that does not start with '##' is
    printed, one '##FILTER' line is printed for each item of
    'vcffilter.describe_filters(args)', in sorted order.

    If 'args.reset_filter' is set, the filter of each record is replaced
    with "." before the record is yielded.
    """
    header_open = True
    filters_written = False
    parse_vcf = pysam.asVCF()

    for path in args.filenames:
        with open_ro(path, "rb") as handle:
            for raw in handle:
                if raw.startswith(b"#"):
                    if header_open:
                        if not filters_written and not raw.startswith(b"##"):
                            filters_written = True
                            descriptions = vcffilter.describe_filters(args)
                            for entry in sorted(descriptions.items()):
                                print('##FILTER=<ID=%s,Description="%s">' % entry)

                        print(raw.decode("utf-8"), end="")

                    # '#' lines are never parsed as records
                    continue

                header_open = False
                record_text = raw.rstrip(b"\n\r")
                record = parse_vcf(record_text, len(record_text))
                if args.reset_filter:
                    record.filter = "."

                yield record
def from_file(cls, filename):
    """Lazily read the entries of an unindexed FASTQ file.

    The file, which may be GZIP/BZ2 compressed, is opened with 'open_ro'
    and every record produced by 'FASTQ.from_lines' is yielded in turn.
    The file is closed when the generator finishes or is closed.
    """
    with open_ro(filename) as fastq_file:
        records = FASTQ.from_lines(fastq_file)
        yield from records
def read_bed_file(filename, min_columns=3, contigs=None):
    """Parse a (gzip/bzip2 compressed) BED file, yielding one record per line.

    Comments (lines starting with '#') and empty lines are skipped.

    filename    -- path of the BED file; opened with 'fileutils.open_ro'.
    min_columns -- minimum number of columns every record must contain;
                   must be at least 3.
    contigs     -- optional dictionary of {contig: length}. If supplied,
                   every record must name a known contig and must satisfy
                   0 <= start < end <= length. If omitted, only
                   0 <= start < end is required.

    Raises ValueError if 'min_columns' is less than 3, and BEDError if a
    line cannot be parsed, has fewer than 'min_columns' columns, names an
    unknown contig, or has invalid coordinates. As this is a generator,
    these errors are raised once iteration starts.
    """
    if min_columns < 3:
        raise ValueError("'min_columns' must be >= 3 in 'read_bed_file'")

    infinite = float("inf")
    with fileutils.open_ro(filename) as handle:
        # Count from 1 so that every error message reports the same,
        # human-readable line number.
        for (line_num, line) in enumerate(handle, start=1):
            line = line.strip()
            if not line or line.startswith("#"):
                continue

            try:
                bed = BEDRecord(line)
            # 'as' is accepted by both Python 2.6+ and Python 3
            except ValueError as error:
                raise BEDError("Error parsing line %i in regions file:\n"
                               " Path = %r\n Line = %r\n\n%s"
                               % (line_num, filename, line, error))

            if len(bed) < min_columns:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                raise BEDError("Region at line #%i (%s) does not "
                               "contain the expected number of fields; "
                               "the first %i fields are required. C.f. "
                               "defination at\n %s\n\nPath = %r"
                               % (line_num, name, min_columns,
                                  url, filename))

            if contigs is None:
                contig_len = infinite
            else:
                contig_len = contigs.get(bed.contig)

            if contig_len is None:
                raise BEDError("Regions file contains contig not found "
                               "in reference:\n Path = %r\n Contig = "
                               "%r\n\nPlease ensure that all contig "
                               "names match the reference names!"
                               % (filename, bed.contig))
            elif not (0 <= bed.start < bed.end <= contig_len):
                # The upper bound is formatted with %s since it is infinity
                # when no contigs were supplied, which %i cannot format.
                raise BEDError("Regions file contains invalid region:\n"
                               " Path = %r\n Contig = %r\n"
                               " Start = %s\n End = %s\n\n"
                               "Expected 0 <= Start < End <= %s!"
                               % (filename, bed.contig, bed.start,
                                  bed.end, contig_len))

            yield bed