def test_append():
    """Writing the same payload twice in append mode must yield doubled content.

    Exercised for both plain text and gzip output; bzip2 is skipped because
    it has no append support.
    """
    for ext in ["", ".gz"]:  # BZ2 does NOT support append
        chunk = "AB"
        expected = chunk + chunk
        filename = 'truncated.fastq' + ext
        mode = 'a'
        if ext != "":
            mode = 'ab'
            chunk = chunk.encode()
            expected = chunk + chunk
            # On Py3, need to send BYTES, not unicode
            chunk = get_compressor(filename).compress(chunk)
        print("Trying ext=%s" % ext)
        with temporary_path(filename) as path:
            # Start from a clean slate; the file may not exist yet.
            try:
                os.unlink(path)
            except OSError:
                pass
            with open_output(path, mode) as out:
                out.write(chunk)
            print(path)
            with open_output(path, mode) as out:
                out.write(chunk)
            with xopen(path, 'r') as handle:
                # xopen yields text, so normalize the expected value too.
                try:
                    expected = expected.decode("utf-8")
                except AttributeError:
                    pass
                for line in handle:
                    assert line == expected
def error(options, parser):
    """Estimate the sequencing error rate from base qualities.

    Reads batches from the input, feeds them to the estimator selected by
    ``options.algorithm``, and writes a summary to ``options.output``.

    Returns:
        A ``(0, None, {})`` tuple (exit code, message, summary dict).
    """
    from atropos.error import (
        BaseQualityErrorEstimator, ShadowRegressionErrorEstimator,
        PairedErrorEstimator)
    batch_iterator, names, qualities, _ = create_reader(
        options, parser, counter_magnitude="K")
    try:
        if not qualities:
            parser.error("Cannot estimate error rate without base qualities")
        # Select the single-end estimator implementation.
        if options.algorithm == 'quality':
            estimator_class = BaseQualityErrorEstimator
        elif options.algorithm == 'shadow':
            estimator_class = ShadowRegressionErrorEstimator
        # For paired data, wrap the per-read estimator in a paired adapter.
        if options.paired:
            estimator = PairedErrorEstimator(
                max_read_len=options.max_bases,
                estimator_class=estimator_class)
        else:
            estimator = estimator_class(max_read_len=options.max_bases)
        estimator.consume_all_batches(batch_iterator)
    finally:
        batch_iterator.close()
    with open_output(options.output) as out:
        estimator.summarize(out, names)
    return (0, None, {})
def error(options, parser):
    """Estimate the sequencing error rate from base qualities.

    Consumes read batches with the estimator chosen by ``options.algorithm``
    and writes the resulting summary to ``options.output``.

    Returns:
        ``(0, None, {})`` — exit code, message, summary dict.
    """
    from atropos.error import (
        BaseQualityErrorEstimator, ShadowRegressionErrorEstimator,
        PairedErrorEstimator)
    # Only the first three values of the reader tuple are needed here.
    batch_iterator, names, qualities = create_reader(
        options, parser, counter_magnitude="K")[0:3]
    try:
        if not qualities:
            parser.error("Cannot estimate error rate without base qualities")
        # Choose the per-read estimator implementation.
        if options.algorithm == 'quality':
            est_class = BaseQualityErrorEstimator
        elif options.algorithm == 'shadow':
            est_class = ShadowRegressionErrorEstimator
        # Paired input gets a paired wrapper around the chosen estimator.
        if options.paired:
            est = PairedErrorEstimator(
                max_read_len=options.max_bases,
                estimator_class=est_class)
        else:
            est = est_class(max_read_len=options.max_bases)
        est.consume_all_batches(batch_iterator)
    finally:
        batch_iterator.close()
    with open_output(options.output) as outfile:
        est.summarize(outfile, names)
    return (0, None, {})
def close(self):
    """Close all cached writers.

    Before closing, touch (create empty) every path listed in
    ``self.force_create`` that was never written to, except stdout.
    """
    for path in self.force_create:
        if path in self.writers or path == STDOUT:
            continue
        # Requested-but-unwritten output: create it as an empty file.
        with open_output(path, "w"):
            pass
    for writer in self.writers.values():
        # Never close the process's stdout stream.
        if writer != sys.stdout:
            writer.close()
def detect(options, parser):
    """Detect adapters and other contaminant sequences in the input reads.

    Chooses a detector (known / heuristic / khmer) from ``options.detector``
    or, when unset, from the available known contaminants and read count,
    then writes the detection summary to ``options.output``.

    Returns:
        ``(0, None, {})`` — exit code, message, summary dict.
    """
    from atropos.detect import (
        summarize_contaminants, PairedDetector, KnownContaminantDetector,
        HeuristicDetector, KhmerDetector)
    from atropos.util import enumerate_range
    kmer_size = options.kmer_size or 12
    n_reads = options.max_reads
    overrep_cutoff = 100
    include = options.include_contaminants or "all"
    known_contaminants = (
        load_known_adapters(options) if include != 'unknown' else None)
    batch_iterator, names, _, _ = create_reader(
        options, parser, counter_magnitude="K")
    detector = options.detector
    if not detector:
        # Auto-select: known-only when possible, heuristic for small
        # inputs, kmer-based otherwise.
        if known_contaminants and include == 'known':
            detector = 'known'
        elif n_reads <= 50000:
            detector = 'heuristic'
        else:
            detector = 'khmer'
    if detector == 'known':
        logging.getLogger().debug("Detecting contaminants using the known-only algorithm")
        detector_class = KnownContaminantDetector
    elif detector == 'heuristic':
        logging.getLogger().debug("Detecting contaminants using the heuristic algorithm")
        detector_class = HeuristicDetector
    elif detector == 'khmer':
        logging.getLogger().debug("Detecting contaminants using the kmer-based algorithm")
        detector_class = KhmerDetector
    try:
        detector_args = dict(
            k=kmer_size, n_reads=n_reads, overrep_cutoff=overrep_cutoff,
            known_contaminants=known_contaminants)
        if options.paired:
            det = PairedDetector(detector_class, **detector_args)
        else:
            det = detector_class(**detector_args)
            # Single-end input: only the first name list applies.
            names = names[0]
        with open_output(options.output) as out:
            print("\nDetecting adapters and other potential contaminant sequences based on "
                  "{}-mers in {} reads".format(kmer_size, n_reads), file=out)
            det.consume_all_batches(batch_iterator)
            det.summarize(out, names, include=include)
    finally:
        batch_iterator.close()
    return (0, None, {})
def detect(options, parser):
    """Detect adapters and other contaminant sequences in the input reads.

    The detector (known / heuristic / khmer) comes from ``options.detector``
    or is auto-selected from the known-contaminant list and read count.
    The summary is written to ``options.output``.

    Returns:
        ``(0, None, {})`` — exit code, message, summary dict.
    """
    from atropos.detect import (
        summarize_contaminants, PairedDetector, KnownContaminantDetector,
        HeuristicDetector, KhmerDetector)
    from atropos.util import enumerate_range
    k = options.kmer_size or 12
    n_reads = options.max_reads
    overrep_cutoff = 100
    include = options.include_contaminants or "all"
    known = load_known_adapters(options) if include != 'unknown' else None
    # Only the iterator and name lists are needed from the reader tuple.
    batch_iterator, names = create_reader(
        options, parser, counter_magnitude="K")[0:2]
    algorithm = options.detector
    if not algorithm:
        # Auto-select the detection algorithm.
        if known and include == 'known':
            algorithm = 'known'
        elif n_reads <= 50000:
            algorithm = 'heuristic'
        else:
            algorithm = 'khmer'
    if algorithm == 'known':
        logging.getLogger().debug("Detecting contaminants using the known-only algorithm")
        detector_class = KnownContaminantDetector
    elif algorithm == 'heuristic':
        logging.getLogger().debug("Detecting contaminants using the heuristic algorithm")
        detector_class = HeuristicDetector
    elif algorithm == 'khmer':
        logging.getLogger().debug("Detecting contaminants using the kmer-based algorithm")
        detector_class = KhmerDetector
    try:
        detector_args = dict(
            k=k, n_reads=n_reads, overrep_cutoff=overrep_cutoff,
            known_contaminants=known)
        if options.paired:
            det = PairedDetector(detector_class, **detector_args)
        else:
            det = detector_class(**detector_args)
            # Single-end input: keep just the first name list.
            names = names[0]
        with open_output(options.output) as out:
            print("\nDetecting adapters and other potential contaminant sequences based on "
                  "{}-mers in {} reads".format(k, n_reads), file=out)
            det.consume_all_batches(batch_iterator)
            det.summarize(out, names, include=include)
    finally:
        batch_iterator.close()
    return (0, None, {})
def main():
    """CLI entry point: write per-read, per-side, per-position base counts.

    Builds histograms from the two FASTQ inputs and emits a tab-delimited
    table with columns (read, side, pos, base, count).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-1", "--fastq1")
    parser.add_argument("-2", "--fastq2")
    parser.add_argument("-o", "--output", default="-")
    args = parser.parse_args()
    with xopen(args.fastq1) as fq1, xopen(args.fastq2) as fq2:
        hists = make_hists(fq1, fq2)
    with open_output(args.output) as out:
        writer = csv.writer(out, delimiter="\t")
        writer.writerow(('read', 'side', 'pos', 'base', 'count'))
        # Rows are 1-indexed for both the read number and the position.
        for read_idx, hist in enumerate(hists, 1):
            for side in range(2):
                for base in nuc:
                    for pos, count in enumerate(hist[side][base], 1):
                        writer.writerow((read_idx, side, pos, base, count))
def write(self, file_desc, data, compressed=False):
    """Write ``data`` to the writer associated with ``file_desc``.

    Writers are created lazily on first use and cached in ``self.writers``.
    For compressed output, ``file_desc`` is a ``(path, mode)`` pair;
    otherwise it is the path itself. When ``self.suffix`` is set, it is
    appended to the actual file path (the cache key stays the bare path).
    """
    if compressed:
        path, mode = file_desc
    else:
        path = file_desc
    if path not in self.writers:
        real_path = (
            add_suffix_to_path(path, self.suffix) if self.suffix else path)
        # TODO: test whether O_NONBLOCK allows non-blocking write to NFS
        if compressed:
            writer = open_output(real_path, mode)
        else:
            writer = xopen(real_path, "w")
        self.writers[path] = writer
    self.writers[path].write(data)
def main():
    """CLI entry point: report adapter-contamination metrics for a read pair.

    Estimates metrics from the two FASTQ inputs and the given adapter
    sequences, then prints a human-readable summary to the output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--adapter1", default=ADAPTER1)
    parser.add_argument("-A", "--adapter2", default=ADAPTER2)
    parser.add_argument("-1", "--fastq1")
    parser.add_argument("-2", "--fastq2")
    parser.add_argument("-o", "--output", default="-")
    args = parser.parse_args()
    with xopen(args.fastq1) as fq1, xopen(args.fastq2) as fq2:
        metrics = estimate_metrics(fq1, fq2, args.adapter1, args.adapter2)
    with open_output(args.output) as out:
        print("Avg error prob: {}".format(metrics[0]), file=out)
        print("Read 1 with full-length adapters: {}".format(metrics[1]), file=out)
        print("Read 1 full-length adapter bases: {}".format(metrics[2]), file=out)
        print("Read 2 with full-length adapters: {}".format(metrics[3]), file=out)
        print("Read 2 full-length adapter bases: {}".format(metrics[4]), file=out)
def main():
    """CLI entry point: summarize trimming accuracy from aligned BAM files.

    Compares each trimmed BAM against the untrimmed BAM in ``--bam-dir``,
    optionally restricted to regions (amplicon .bed or per-sample mRNA
    annotations), and writes a summary table plus per-position histograms.
    """
    parser = argparse.ArgumentParser()
    parser.set_defaults(command=None)
    parser.add_argument("-d", "--bam-dir")
    parser.add_argument("-x", "--bam-extension", default=".sorted.bam")
    parser.add_argument("-p", "--bam-pattern", default=None)
    parser.add_argument("-u", "--untrimmed-name", default="untrimmed")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-H", "--hist", default="trimmed_hists.txt")
    parser.add_argument("-m", "--max-reads", type=int, default=None)
    parser.add_argument(
        "--no-edit-distance", action="store_false", default=True,
        dest="edit_distance",
        help="Don't try to match by editdistance.")
    parser.add_argument(
        "--no-progress", action="store_false", dest="progress", default=True)
    subparsers = parser.add_subparsers()
    amplicon = subparsers.add_parser('amplicon')
    amplicon.set_defaults(command='amplicon')
    amplicon.add_argument(
        "-b", "--bed", default=None,
        help="Sorted .bed file of regions where reads should map.")
    amplicon.add_argument(
        "--min-overlap", type=float, default=1.0,
        help="When a .bed file is specified, this is the minimum "
             "fraction of a mapped read that must overlap a selected "
             "interval for that mapping to be considered valid. (1.0)")
    amplicon.add_argument(
        "--slop", type=int, default=None,
        help="When a .bed file is specified, this is the number of bp each "
             "region is extended. This is often necessary with amplicon data "
             "because enrichment can capture sequences that only paritally "
             "overlap the probes.")
    mrna = subparsers.add_parser('mrna')
    mrna.set_defaults(command='mrna')
    mrna.add_argument(
        "-D", "--bed-dir", default=None,
        help="Directory where to find annotation bed files. Defaults to "
             "--bam-dir.")
    mrna.add_argument(
        "-B", "--bed-pattern", default='{name}.bed',
        help="String template for bed file names. "
             r"\{name\} is replaced with the "
             "BAM file name with extension (--bam-extension) removed.")
    args = parser.parse_args()
    if args.edit_distance:
        # Third-party dependency only needed when edit-distance matching is on.
        import editdistance
    trimmed = {}
    untrimmed = None
    pattern = (args.bam_pattern or "*{}").format(args.bam_extension)
    # Open every BAM; the one named --untrimmed-name is the baseline.
    for path in glob(os.path.join(args.bam_dir, pattern)):
        name = os.path.basename(path)[:-len(args.bam_extension)]
        if name == args.untrimmed_name:
            untrimmed = BAMReader(path)
        else:
            trimmed[name] = BAMReader(path)
    if untrimmed is None:
        untrimmed = BAMReader(os.path.join(
            args.bam_dir,
            "{}{}".format(args.untrimmed_name, args.bam_extension)))
    regions = None
    if args.command == 'amplicon':
        regions = Bed(args.bed, args.slop or 200)
    elif args.command == 'mrna':
        regions = Annotations(
            args.bed_dir or args.bam_dir, args.bed_pattern,
            args.untrimmed_name, trimmed.keys())
    try:
        with open_output(args.output) as out, open_output(args.hist) as hist:
            summary_writer = csv.writer(out, delimiter="\t")
            write_header(summary_writer)
            hist_writer = csv.writer(hist, delimiter="\t")
            hist_writer.writerow(('prog', 'read', 'side', 'pos', 'base', 'count'))
            summarize(
                untrimmed, trimmed, summary_writer, hist_writer,
                mode=args.command, regions=regions,
                max_reads=args.max_reads,
                use_edit_distance=args.edit_distance,
                progress=args.progress)
    finally:
        # Always release BAM and region handles, even on error.
        if untrimmed:
            untrimmed.close()
        for reader in trimmed.values():
            reader.close()
        if regions:
            regions.close()
def main():
    """CLI entry point: compare trimmed BAM files against the untrimmed baseline.

    Loads all BAMs matching ``--bam-pattern`` in ``--bam-dir``, optionally
    restricts evaluation to regions (amplicon .bed or mRNA annotations),
    and writes a summary table and per-position histograms.
    """
    parser = argparse.ArgumentParser()
    parser.set_defaults(command=None)
    parser.add_argument("-d", "--bam-dir")
    parser.add_argument("-x", "--bam-extension", default=".sorted.bam")
    parser.add_argument("-p", "--bam-pattern", default=None)
    parser.add_argument("-u", "--untrimmed-name", default="untrimmed")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-H", "--hist", default="trimmed_hists.txt")
    parser.add_argument("-m", "--max-reads", type=int, default=None)
    parser.add_argument(
        "--no-edit-distance", action="store_false", default=True,
        dest="edit_distance",
        help="Don't try to match by editdistance.")
    parser.add_argument(
        "--no-progress", action="store_false", dest="progress", default=True)
    commands = parser.add_subparsers()
    amplicon_cmd = commands.add_parser('amplicon')
    amplicon_cmd.set_defaults(command='amplicon')
    amplicon_cmd.add_argument(
        "-b", "--bed", default=None,
        help="Sorted .bed file of regions where reads should map.")
    amplicon_cmd.add_argument(
        "--min-overlap", type=float, default=1.0,
        help="When a .bed file is specified, this is the minimum "
             "fraction of a mapped read that must overlap a selected "
             "interval for that mapping to be considered valid. (1.0)")
    amplicon_cmd.add_argument(
        "--slop", type=int, default=None,
        help="When a .bed file is specified, this is the number of bp each "
             "region is extended. This is often necessary with amplicon data "
             "because enrichment can capture sequences that only paritally "
             "overlap the probes.")
    mrna_cmd = commands.add_parser('mrna')
    mrna_cmd.set_defaults(command='mrna')
    mrna_cmd.add_argument(
        "-D", "--bed-dir", default=None,
        help="Directory where to find annotation bed files. Defaults to "
             "--bam-dir.")
    mrna_cmd.add_argument(
        "-B", "--bed-pattern", default='{name}.bed',
        help="String template for bed file names. "
             r"\{name\} is replaced with the "
             "BAM file name with extension (--bam-extension) removed.")
    args = parser.parse_args()
    if args.edit_distance:
        # Only import the third-party matcher when it will actually be used.
        import editdistance
    trimmed = {}
    untrimmed = None
    pattern = (args.bam_pattern or "*{}").format(args.bam_extension)
    for path in glob(os.path.join(args.bam_dir, pattern)):
        name = os.path.basename(path)[:-len(args.bam_extension)]
        # The BAM whose name matches --untrimmed-name is the baseline.
        if name == args.untrimmed_name:
            untrimmed = BAMReader(path)
        else:
            trimmed[name] = BAMReader(path)
    if untrimmed is None:
        untrimmed = BAMReader(
            os.path.join(
                args.bam_dir,
                "{}{}".format(args.untrimmed_name, args.bam_extension)))
    regions = None
    if args.command == 'amplicon':
        regions = Bed(args.bed, args.slop or 200)
    elif args.command == 'mrna':
        regions = Annotations(
            args.bed_dir or args.bam_dir, args.bed_pattern,
            args.untrimmed_name, trimmed.keys())
    try:
        with open_output(args.output) as out, open_output(args.hist) as hist:
            table_writer = csv.writer(out, delimiter="\t")
            write_header(table_writer)
            hist_writer = csv.writer(hist, delimiter="\t")
            hist_writer.writerow(('prog', 'read', 'side', 'pos', 'base', 'count'))
            summarize(
                untrimmed, trimmed, table_writer, hist_writer,
                mode=args.command, regions=regions,
                max_reads=args.max_reads,
                use_edit_distance=args.edit_distance,
                progress=args.progress)
    finally:
        # Close every reader and the region source regardless of errors.
        if untrimmed:
            untrimmed.close()
        for reader in trimmed.values():
            reader.close()
        if regions:
            regions.close()