Beispiel #1
0
def test_append():
    """Verify that opening an output in append mode twice yields the
    concatenation of both writes (gzip supports append; BZ2 does not)."""
    for ext in ["", ".gz"]:  # BZ2 does NOT support append
        text = "AB"
        reference = text + text
        filename = 'truncated.fastq' + ext
        if ext == "":
            mode = 'a'
        else:
            mode = 'ab'
            text = text.encode()
            reference = text + text
            text = get_compressor(filename).compress(
                text)  # On Py3, need to send BYTES, not unicode
        print("Trying ext=%s" % ext)
        with temporary_path(filename) as path:
            # Remove any leftover file so both writes start from scratch.
            try:
                os.unlink(path)
            except OSError:
                pass
            with open_output(path, mode) as out:
                out.write(text)
            print(path)
            with open_output(path, mode) as out:
                out.write(text)
            with xopen(path, 'r') as handle:
                # xopen yields text, so compare against a str reference.
                expected = (reference.decode("utf-8")
                            if isinstance(reference, bytes) else reference)
                for appended in handle:
                    assert appended == expected
Beispiel #2
0
def error(options, parser):
    """Estimate the sequencing error rate from read base qualities.

    Streams read batches from the input, feeds them to the estimator
    selected by ``options.algorithm`` ('quality' or 'shadow'), and writes
    a summary to ``options.output``.

    Args:
        options: Parsed command-line options.
        parser: The argument parser, used to report usage errors.

    Returns:
        Tuple of (return_code, error_message, summary_dict).
    """
    from atropos.error import (
        BaseQualityErrorEstimator, ShadowRegressionErrorEstimator,
        PairedErrorEstimator)
    
    batch_iterator, names, qualities, _ = create_reader(
        options, parser, counter_magnitude="K")
    try:
        if not qualities:
            parser.error("Cannot estimate error rate without base qualities")
        
        if options.algorithm == 'quality':
            estimator_class = BaseQualityErrorEstimator
        elif options.algorithm == 'shadow':
            estimator_class = ShadowRegressionErrorEstimator
        else:
            # Fail fast: previously an unrecognized algorithm fell through
            # and raised UnboundLocalError on estimator_class below.
            parser.error("Unknown algorithm: {}".format(options.algorithm))
        
        if options.paired:
            # The paired estimator wraps the per-read estimator class.
            estimator = PairedErrorEstimator(
                max_read_len=options.max_bases,
                estimator_class=estimator_class)
        else:
            estimator = estimator_class(max_read_len=options.max_bases)
        
        estimator.consume_all_batches(batch_iterator)
    
    finally:
        # Always release the reader, even if estimation fails.
        batch_iterator.close()
    
    with open_output(options.output) as o:
        estimator.summarize(o, names)
    
    return (0, None, {})
Beispiel #3
0
def error(options, parser):
    """Estimate the sequencing error rate from read base qualities.

    Streams read batches from the input, feeds them to the estimator
    selected by ``options.algorithm`` ('quality' or 'shadow'), and writes
    a summary to ``options.output``.

    Args:
        options: Parsed command-line options.
        parser: The argument parser, used to report usage errors.

    Returns:
        Tuple of (return_code, error_message, summary_dict).
    """
    from atropos.error import (
        BaseQualityErrorEstimator, ShadowRegressionErrorEstimator,
        PairedErrorEstimator)
    
    # Only the first three values from create_reader are needed here.
    batch_iterator, names, qualities = create_reader(
        options, parser, counter_magnitude="K")[0:3]
    try:
        if not qualities:
            parser.error("Cannot estimate error rate without base qualities")
        
        if options.algorithm == 'quality':
            estimator_class = BaseQualityErrorEstimator
        elif options.algorithm == 'shadow':
            estimator_class = ShadowRegressionErrorEstimator
        else:
            # Fail fast: previously an unrecognized algorithm fell through
            # and raised UnboundLocalError on estimator_class below.
            parser.error("Unknown algorithm: {}".format(options.algorithm))
        
        if options.paired:
            # The paired estimator wraps the per-read estimator class.
            estimator = PairedErrorEstimator(
                max_read_len=options.max_bases,
                estimator_class=estimator_class)
        else:
            estimator = estimator_class(max_read_len=options.max_bases)
        
        estimator.consume_all_batches(batch_iterator)
    
    finally:
        # Always release the reader, even if estimation fails.
        batch_iterator.close()
    
    with open_output(options.output) as o:
        estimator.summarize(o, names)
    
    return (0, None, {})
Beispiel #4
0
 def close(self):
     """Close all writers, first creating empty files for any
     force-create paths that never received data."""
     untouched = [
         path for path in self.force_create
         if path not in self.writers and path != STDOUT]
     for path in untouched:
         with open_output(path, "w"):
             pass
     for writer in self.writers.values():
         if writer != sys.stdout:
             writer.close()
Beispiel #5
0
 def close(self):
     """Ensure forced-output files exist, then close every open writer."""
     for path in self.force_create:
         if path in self.writers or path == STDOUT:
             continue
         # Touch outputs that were requested but never written to.
         with open_output(path, "w"):
             pass
     for writer in self.writers.values():
         if writer == sys.stdout:
             continue
         writer.close()
Beispiel #6
0
def detect(options, parser):
    """Detect adapter and other contaminant sequences in the input reads.

    Chooses a detection algorithm ('known', 'heuristic', or 'khmer') from
    ``options.detector`` or, when unset, from the available known
    contaminants and the number of reads, then writes a summary report to
    ``options.output``.

    Args:
        options: Parsed command-line options.
        parser: The argument parser, used to report usage errors.

    Returns:
        Tuple of (return_code, error_message, summary_dict).
    """
    # Only the detector classes are used here; the previously imported
    # summarize_contaminants and enumerate_range were unused.
    from atropos.detect import (
        PairedDetector, KnownContaminantDetector, HeuristicDetector,
        KhmerDetector)
    
    k = options.kmer_size or 12
    n_reads = options.max_reads
    overrep_cutoff = 100
    include = options.include_contaminants or "all"
    known_contaminants = load_known_adapters(options) if include != 'unknown' else None
    batch_iterator, names, _, _ = create_reader(options, parser, counter_magnitude="K")
    
    # Default detector: known-only when known contaminants are requested,
    # heuristic for small inputs, kmer-based otherwise.
    detector = options.detector
    if not detector:
        if known_contaminants and include == 'known':
            detector = 'known'
        elif n_reads <= 50000:
            detector = 'heuristic'
        else:
            detector = 'khmer'
    
    if detector == 'known':
        logging.getLogger().debug("Detecting contaminants using the known-only algorithm")
        detector_class = KnownContaminantDetector
    elif detector == 'heuristic':
        logging.getLogger().debug("Detecting contaminants using the heuristic algorithm")
        detector_class = HeuristicDetector
    elif detector == 'khmer':
        logging.getLogger().debug("Detecting contaminants using the kmer-based algorithm")
        detector_class = KhmerDetector
    else:
        # Fail fast: previously an unrecognized detector name fell through
        # and raised UnboundLocalError on detector_class below.
        parser.error("Unknown detector: {}".format(detector))
    
    try:
        detector_args = dict(
            k=k, n_reads=n_reads, overrep_cutoff=overrep_cutoff,
            known_contaminants=known_contaminants)
        if options.paired:
            d = PairedDetector(detector_class, **detector_args)
        else:
            d = detector_class(**detector_args)
            # Single-end input: only the first set of read names applies.
            names = names[0]
        
        with open_output(options.output) as o:
            print("\nDetecting adapters and other potential contaminant sequences based on "
                  "{}-mers in {} reads".format(k, n_reads), file=o)
            d.consume_all_batches(batch_iterator)
            d.summarize(o, names, include=include)
    finally:
        # Always release the reader, even if detection fails.
        batch_iterator.close()
    
    return (0, None, {})
Beispiel #7
0
def detect(options, parser):
    """Detect adapter and other contaminant sequences in the input reads.

    Chooses a detection algorithm ('known', 'heuristic', or 'khmer') from
    ``options.detector`` or, when unset, from the available known
    contaminants and the number of reads, then writes a summary report to
    ``options.output``.

    Args:
        options: Parsed command-line options.
        parser: The argument parser, used to report usage errors.

    Returns:
        Tuple of (return_code, error_message, summary_dict).
    """
    # Only the detector classes are used here; the previously imported
    # summarize_contaminants and enumerate_range were unused.
    from atropos.detect import (
        PairedDetector, KnownContaminantDetector, HeuristicDetector,
        KhmerDetector)
    
    k = options.kmer_size or 12
    n_reads = options.max_reads
    overrep_cutoff = 100
    include = options.include_contaminants or "all"
    known_contaminants = load_known_adapters(options) if include != 'unknown' else None
    # Only the first two values from create_reader are needed here.
    batch_iterator, names = create_reader(options, parser, counter_magnitude="K")[0:2]
    
    # Default detector: known-only when known contaminants are requested,
    # heuristic for small inputs, kmer-based otherwise.
    detector = options.detector
    if not detector:
        if known_contaminants and include == 'known':
            detector = 'known'
        elif n_reads <= 50000:
            detector = 'heuristic'
        else:
            detector = 'khmer'
    
    if detector == 'known':
        logging.getLogger().debug("Detecting contaminants using the known-only algorithm")
        detector_class = KnownContaminantDetector
    elif detector == 'heuristic':
        logging.getLogger().debug("Detecting contaminants using the heuristic algorithm")
        detector_class = HeuristicDetector
    elif detector == 'khmer':
        logging.getLogger().debug("Detecting contaminants using the kmer-based algorithm")
        detector_class = KhmerDetector
    else:
        # Fail fast: previously an unrecognized detector name fell through
        # and raised UnboundLocalError on detector_class below.
        parser.error("Unknown detector: {}".format(detector))
    
    try:
        detector_args = dict(
            k=k, n_reads=n_reads, overrep_cutoff=overrep_cutoff,
            known_contaminants=known_contaminants)
        if options.paired:
            d = PairedDetector(detector_class, **detector_args)
        else:
            d = detector_class(**detector_args)
            # Single-end input: only the first set of read names applies.
            names = names[0]
        
        with open_output(options.output) as o:
            print("\nDetecting adapters and other potential contaminant sequences based on "
                  "{}-mers in {} reads".format(k, n_reads), file=o)
            d.consume_all_batches(batch_iterator)
            d.summarize(o, names, include=include)
    finally:
        # Always release the reader, even if detection fails.
        batch_iterator.close()
    
    return (0, None, {})
Beispiel #8
0
def main():
    """Build per-position base histograms from two FASTQ files and write
    them as a tab-separated table."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-1", "--fastq1")
    parser.add_argument("-2", "--fastq2")
    parser.add_argument("-o", "--output", default="-")
    args = parser.parse_args()

    with xopen(args.fastq1) as fq1, xopen(args.fastq2) as fq2:
        hists = make_hists(fq1, fq2)

    with open_output(args.output) as out:
        writer = csv.writer(out, delimiter="\t")
        writer.writerow(('read', 'side', 'pos', 'base', 'count'))
        # One row per (read, side, position, base) count, 1-indexed.
        rows = (
            (read_idx, side, pos, base, count)
            for read_idx, hist in enumerate(hists, 1)
            for side in range(2)
            for base in nuc
            for pos, count in enumerate(hist[side][base], 1))
        writer.writerows(rows)
Beispiel #9
0
def main():
    """Parse arguments, compute base histograms from paired FASTQ input,
    and dump the counts as TSV."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-1", "--fastq1")
    arg_parser.add_argument("-2", "--fastq2")
    arg_parser.add_argument("-o", "--output", default="-")
    opts = arg_parser.parse_args()

    with xopen(opts.fastq1) as reads1, xopen(opts.fastq2) as reads2:
        hists = make_hists(reads1, reads2)

    with open_output(opts.output) as out:
        tsv = csv.writer(out, delimiter="\t")
        tsv.writerow(('read', 'side', 'pos', 'base', 'count'))
        read_num = 0
        for hist in hists:
            read_num += 1
            for side in (0, 1):
                for base in nuc:
                    # Positions are reported 1-indexed.
                    pos = 0
                    for count in hist[side][base]:
                        pos += 1
                        tsv.writerow((read_num, side, pos, base, count))
Beispiel #10
0
 def write(self, file_desc, data, compressed=False):
     """
     Write data to file, opening and caching a writer on first use.
     """
     if compressed:
         # A compressed destination is described by a (path, mode) pair.
         path, mode = file_desc
     else:
         path = file_desc
     writer = self.writers.get(path)
     if writer is None:
         real_path = (
             add_suffix_to_path(path, self.suffix) if self.suffix else path)
         # TODO: test whether O_NONBLOCK allows non-blocking write to NFS
         writer = (open_output(real_path, mode) if compressed
                   else xopen(real_path, "w"))
         self.writers[path] = writer
     writer.write(data)
Beispiel #11
0
 def write(self, file_desc, data, compressed=False):
     """
     Write data to file. The first write to a given path opens a writer
     that is cached for subsequent writes.
     """
     if compressed:
         # Compressed destinations arrive as a (path, mode) pair.
         path, mode = file_desc
     else:
         path = file_desc
     if path not in self.writers:
         real_path = path
         if self.suffix:
             real_path = add_suffix_to_path(path, self.suffix)
         # TODO: test whether O_NONBLOCK allows non-blocking write to NFS
         if compressed:
             new_writer = open_output(real_path, mode)
         else:
             new_writer = xopen(real_path, "w")
         self.writers[path] = new_writer
     self.writers[path].write(data)
Beispiel #12
0
def main():
    """Estimate adapter-related metrics from paired FASTQ files and print
    a labeled report."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--adapter1", default=ADAPTER1)
    parser.add_argument("-A", "--adapter2", default=ADAPTER2)
    parser.add_argument("-1", "--fastq1")
    parser.add_argument("-2", "--fastq2")
    parser.add_argument("-o", "--output", default="-")
    args = parser.parse_args()

    with xopen(args.fastq1) as fq1, xopen(args.fastq2) as fq2:
        metrics = estimate_metrics(fq1, fq2, args.adapter1, args.adapter2)

    # One label per metric, in the order estimate_metrics returns them.
    templates = (
        "Avg error prob: {}",
        "Read 1 with full-length adapters: {}",
        "Read 1 full-length adapter bases: {}",
        "Read 2 with full-length adapters: {}",
        "Read 2 full-length adapter bases: {}",
    )
    with open_output(args.output) as o:
        for template, value in zip(templates, metrics):
            print(template.format(value), file=o)
Beispiel #13
0
def main():
    """Command-line entry point.

    Compares trimmed BAM files against an untrimmed reference BAM from the
    same directory and writes a summary table (--output) plus per-position
    trimming histograms (--hist). Two subcommands choose how mappings are
    validated: 'amplicon' (against a sorted .bed of target regions) and
    'mrna' (against per-sample annotation .bed files).
    """
    parser = argparse.ArgumentParser()
    parser.set_defaults(command=None)
    parser.add_argument("-d", "--bam-dir")
    parser.add_argument("-x", "--bam-extension", default=".sorted.bam")
    parser.add_argument("-p", "--bam-pattern", default=None)
    parser.add_argument("-u", "--untrimmed-name", default="untrimmed")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-H", "--hist", default="trimmed_hists.txt")
    parser.add_argument("-m", "--max-reads", type=int, default=None)
    # store_false flags: edit_distance/progress default to True and the
    # --no-* options turn them off.
    parser.add_argument("--no-edit-distance", action="store_false",
                        default=True, dest="edit_distance",
                        help="Don't try to match by editdistance.")
    parser.add_argument("--no-progress", action="store_false", dest="progress",
                        default=True)
    sp = parser.add_subparsers()
    
    amplicon = sp.add_parser('amplicon')
    amplicon.set_defaults(command='amplicon')
    amplicon.add_argument(
        "-b", "--bed", default=None,
        help="Sorted .bed file of regions where reads should map.")
    amplicon.add_argument(
        "--min-overlap", type=float, default=1.0,
        help="When a .bed file is specified, this is the minimum "
            "fraction of a mapped read that must overlap a selected "
            "interval for that mapping to be considered valid. (1.0)")
    amplicon.add_argument(
        "--slop", type=int, default=None,
        help="When a .bed file is specified, this is the number of bp each "
            "region is extended. This is often necessary with amplicon data "
            "because enrichment can capture sequences that only paritally "
            "overlap the probes.")
    
    mrna = sp.add_parser('mrna')
    mrna.set_defaults(command='mrna')
    mrna.add_argument(
        "-D", "--bed-dir", default=None,
        help="Directory where to find annotation bed files. Defaults to "
            "--bam-dir.")
    # NOTE(review): "\{name\}" in the help string below contains invalid
    # escape sequences (SyntaxWarning on newer Python); it renders with
    # literal backslashes and was likely meant to be "{name}" — confirm
    # before changing, since this is user-visible help text.
    mrna.add_argument(
        "-B", "--bed-pattern", default='{name}.bed',
        help="String template for bed file names. \{name\} is replaced with the "
            "BAM file name with extension (--bam-extension) removed.")
    
    args = parser.parse_args()
    
    # NOTE(review): the imported name is not used in this function;
    # presumably this is an early availability check for the optional
    # editdistance package — confirm.
    if args.edit_distance:
        import editdistance
    
    # Discover BAM files: the one matching --untrimmed-name is the
    # reference; all others are treated as trimmed variants keyed by name.
    trimmed = {}
    untrimmed = None
    pattern = (args.bam_pattern or "*{}").format(args.bam_extension)
    for path in glob(os.path.join(args.bam_dir, pattern)):
        name = os.path.basename(path)[:-len(args.bam_extension)]
        if name == args.untrimmed_name:
            untrimmed = BAMReader(path)
        else:
            trimmed[name] = BAMReader(path)
    if untrimmed is None:
        # Fall back to the conventional untrimmed file path.
        untrimmed = BAMReader(os.path.join(
            args.bam_dir,
            "{}{}".format(args.untrimmed_name, args.bam_extension)))
    
    # Region source depends on the subcommand; None means no region check.
    regions = None
    if args.command == 'amplicon':
        regions = Bed(args.bed, args.slop or 200)
    elif args.command == 'mrna':
        regions = Annotations(
            args.bed_dir or args.bam_dir,
            args.bed_pattern,
            args.untrimmed_name,
            trimmed.keys())
    
    try:
        with open_output(args.output) as o, open_output(args.hist) as h:
            ow = csv.writer(o, delimiter="\t")
            write_header(ow)
            hw = csv.writer(h, delimiter="\t")
            hw.writerow(('prog','read', 'side', 'pos', 'base', 'count'))
            summarize(untrimmed, trimmed, ow, hw,
                mode=args.command, regions=regions, max_reads=args.max_reads,
                use_edit_distance=args.edit_distance, progress=args.progress)
    finally:
        # Close all readers and region sources even if summarize fails.
        if untrimmed:
            untrimmed.close()
        for t in trimmed.values():
            t.close()
        if regions:
            regions.close()
Beispiel #14
0
def main():
    """Command-line entry point.

    Compares trimmed BAM files against an untrimmed reference BAM from the
    same directory and writes a summary table (--output) plus per-position
    trimming histograms (--hist). Two subcommands choose how mappings are
    validated: 'amplicon' (against a sorted .bed of target regions) and
    'mrna' (against per-sample annotation .bed files).
    """
    parser = argparse.ArgumentParser()
    parser.set_defaults(command=None)
    parser.add_argument("-d", "--bam-dir")
    parser.add_argument("-x", "--bam-extension", default=".sorted.bam")
    parser.add_argument("-p", "--bam-pattern", default=None)
    parser.add_argument("-u", "--untrimmed-name", default="untrimmed")
    parser.add_argument("-o", "--output", default="-")
    parser.add_argument("-H", "--hist", default="trimmed_hists.txt")
    parser.add_argument("-m", "--max-reads", type=int, default=None)
    # store_false flags: edit_distance/progress default to True and the
    # --no-* options turn them off.
    parser.add_argument("--no-edit-distance",
                        action="store_false",
                        default=True,
                        dest="edit_distance",
                        help="Don't try to match by editdistance.")
    parser.add_argument("--no-progress",
                        action="store_false",
                        dest="progress",
                        default=True)
    sp = parser.add_subparsers()

    amplicon = sp.add_parser('amplicon')
    amplicon.set_defaults(command='amplicon')
    amplicon.add_argument(
        "-b",
        "--bed",
        default=None,
        help="Sorted .bed file of regions where reads should map.")
    amplicon.add_argument(
        "--min-overlap",
        type=float,
        default=1.0,
        help="When a .bed file is specified, this is the minimum "
        "fraction of a mapped read that must overlap a selected "
        "interval for that mapping to be considered valid. (1.0)")
    amplicon.add_argument(
        "--slop",
        type=int,
        default=None,
        help="When a .bed file is specified, this is the number of bp each "
        "region is extended. This is often necessary with amplicon data "
        "because enrichment can capture sequences that only paritally "
        "overlap the probes.")

    mrna = sp.add_parser('mrna')
    mrna.set_defaults(command='mrna')
    mrna.add_argument(
        "-D",
        "--bed-dir",
        default=None,
        help="Directory where to find annotation bed files. Defaults to "
        "--bam-dir.")
    # NOTE(review): "\{name\}" in the help string below contains invalid
    # escape sequences (SyntaxWarning on newer Python); it renders with
    # literal backslashes and was likely meant to be "{name}" — confirm
    # before changing, since this is user-visible help text.
    mrna.add_argument(
        "-B",
        "--bed-pattern",
        default='{name}.bed',
        help="String template for bed file names. \{name\} is replaced with the "
        "BAM file name with extension (--bam-extension) removed.")

    args = parser.parse_args()

    # NOTE(review): the imported name is not used in this function;
    # presumably this is an early availability check for the optional
    # editdistance package — confirm.
    if args.edit_distance:
        import editdistance

    # Discover BAM files: the one matching --untrimmed-name is the
    # reference; all others are treated as trimmed variants keyed by name.
    trimmed = {}
    untrimmed = None
    pattern = (args.bam_pattern or "*{}").format(args.bam_extension)
    for path in glob(os.path.join(args.bam_dir, pattern)):
        name = os.path.basename(path)[:-len(args.bam_extension)]
        if name == args.untrimmed_name:
            untrimmed = BAMReader(path)
        else:
            trimmed[name] = BAMReader(path)
    if untrimmed is None:
        # Fall back to the conventional untrimmed file path.
        untrimmed = BAMReader(
            os.path.join(
                args.bam_dir, "{}{}".format(args.untrimmed_name,
                                            args.bam_extension)))

    # Region source depends on the subcommand; None means no region check.
    regions = None
    if args.command == 'amplicon':
        regions = Bed(args.bed, args.slop or 200)
    elif args.command == 'mrna':
        regions = Annotations(args.bed_dir or args.bam_dir, args.bed_pattern,
                              args.untrimmed_name, trimmed.keys())

    try:
        with open_output(args.output) as o, open_output(args.hist) as h:
            ow = csv.writer(o, delimiter="\t")
            write_header(ow)
            hw = csv.writer(h, delimiter="\t")
            hw.writerow(('prog', 'read', 'side', 'pos', 'base', 'count'))
            summarize(untrimmed,
                      trimmed,
                      ow,
                      hw,
                      mode=args.command,
                      regions=regions,
                      max_reads=args.max_reads,
                      use_edit_distance=args.edit_distance,
                      progress=args.progress)
    finally:
        # Close all readers and region sources even if summarize fails.
        if untrimmed:
            untrimmed.close()
        for t in trimmed.values():
            t.close()
        if regions:
            regions.close()