Example #1
import argparse
import logging
import os

import pandas as pd
import pysam

# parse_regions, Region and coverage_of_region are helpers from the
# surrounding module; a sketch of them follows this example.


def main():
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)
    parser = argparse.ArgumentParser(
        description='Calculate read coverage depth from a bam.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('bam', help='Input bam file.')
    parser.add_argument('-r',
                        '--regions',
                        nargs='+',
                        help='Only process given regions.')
    parser.add_argument('-p',
                        '--prefix',
                        help='Prefix for output, defaults to basename of bam.')
    parser.add_argument('-s',
                        '--stride',
                        type=int,
                        default=1000,
                        help='Stride in genomic coordinates.')

    args = parser.parse_args()

    bam = pysam.AlignmentFile(args.bam)
    ref_lengths = dict(zip(bam.references, bam.lengths))

    if args.regions is not None:
        regions = parse_regions(args.regions, ref_lengths=ref_lengths)
    else:
        regions = [
            Region(ref_name=r, start=0, end=ref_lengths[r])
            for r in bam.references
        ]

    # Derive the output prefix once; the per-region files and the final
    # summary both use it (and regions could in principle be empty).
    prefix = args.prefix
    if prefix is None:
        prefix = os.path.splitext(os.path.basename(args.bam))[0]

    summary = {}
    for region in regions:
        region_str = '{}_{}_{}'.format(region.ref_name, region.start,
                                       region.end)
        depth_fp = '{}_{}.depth.txt'.format(prefix, region_str)

        df = coverage_of_region(region, args.bam, args.stride)
        df.to_csv(depth_fp, sep='\t', index=False)
        summary[region_str] = df['depth'].describe()

    summary_fp = '{}_depth_summary.txt'.format(prefix)
    summary_df = pd.DataFrame(summary).T.reset_index().rename(
        columns={'index': 'region'})
    summary_df.to_csv(summary_fp, index=False, sep='\t')
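The helpers Region, parse_regions and coverage_of_region are defined elsewhere in the module these examples come from. A minimal sketch consistent with how they are called above, assuming a namedtuple for regions and a binned fetch loop for depth (inferred from usage, not the module's actual implementation):

import collections

import numpy as np
import pandas as pd
import pysam

# Half-open genomic interval, matching region.ref_name/start/end usage above.
Region = collections.namedtuple('Region', ['ref_name', 'start', 'end'])


def parse_regions(region_strs, ref_lengths=None):
    """Parse 'name' or 'name:start-end' strings into Region tuples."""
    regions = []
    for s in region_strs:
        if ':' in s:
            ref_name, coords = s.split(':')
            start, end = (int(c) for c in coords.split('-'))
        else:
            ref_name, start, end = s, 0, ref_lengths[s]
        regions.append(Region(ref_name=ref_name, start=start, end=end))
    return regions


def coverage_of_region(region, bam_fp, stride):
    """Binned depth over a region as a DataFrame with 'pos' and 'depth'."""
    bins = np.arange(region.start, region.end, stride)
    depth = np.zeros(len(bins))
    with pysam.AlignmentFile(bam_fp) as bam:
        for read in bam.fetch(region.ref_name, region.start, region.end):
            start_i = max((read.reference_start - bins[0]) // stride, 0)
            end_i = min((read.reference_end - bins[0]) // stride, len(bins))
            depth[start_i:end_i] += 1
    return pd.DataFrame({'pos': bins, 'depth': depth})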
Example #2
import argparse
import functools
import logging
import multiprocessing
from concurrent.futures import ProcessPoolExecutor

import pysam

# parse_regions, Region, subsample_region_proportionally and
# subsample_region_uniformly are helpers from the surrounding module.


def main():
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)
    parser = argparse.ArgumentParser(
        description='Subsample bam to uniform or proportional depth.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('bam', help='input bam file.')
    parser.add_argument('depth', nargs='+', type=int, help='Target depth(s).')
    parser.add_argument('-o',
                        '--output_prefix',
                        default='sub_sampled',
                        help='Output prefix')
    parser.add_argument('-r',
                        '--regions',
                        nargs='+',
                        help='Only process given regions.')
    parser.add_argument(
        '-p',
        '--profile',
        type=int,
        default=1000,
        help='Stride in genomic coordinates for depth profile.')
    parser.add_argument('-O',
                        '--orientation',
                        choices=['fwd', 'rev'],
                        help='Sample only forward or reverse reads.')
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        default=-1,
                        help='Number of threads to use (-1 uses all CPUs).')
    parser.add_argument('-q',
                        '--quality',
                        type=float,
                        help='Filter reads by mean qscore.')
    parser.add_argument('-a',
                        '--accuracy',
                        type=float,
                        help='Filter reads by accuracy.')
    parser.add_argument(
        '-c',
        '--coverage',
        type=float,
        help='Filter reads by coverage (what fraction of the read aligns).')

    eparser = parser.add_mutually_exclusive_group()
    eparser.add_argument(
        '--any_fail',
        action='store_true',
        help='Exit with an error if any region has insufficient coverage.')
    eparser.add_argument(
        '--all_fail',
        action='store_true',
        help='Exit with an error if all regions have insufficient coverage.')

    uparser = parser.add_argument_group('Uniform sampling options')
    uparser.add_argument(
        '-x',
        '--patience',
        default=5,
        type=int,
        help='Maximum iterations with no change in median coverage '
             'before aborting.')
    uparser.add_argument(
        '-s',
        '--stride',
        type=int,
        default=1000,
        help='Stride in genomic coordinates when searching for new reads. '
             'Smaller can lead to more compact pileup.')

    pparser = parser.add_argument_group('Proportional sampling options')
    pparser.add_argument(
        '-P',
        '--proportional',
        default=False,
        action='store_true',
        help='Activate proportional sampling, thus keeping depth variations '
             'of the pileup.')
    pparser.add_argument(
        '-S',
        '--seed',
        default=None,
        type=int,
        help='Random seed for proportional downsampling of reads.')

    args = parser.parse_args()
    if args.threads == -1:
        args.threads = multiprocessing.cpu_count()

    with pysam.AlignmentFile(args.bam) as bam:
        ref_lengths = dict(zip(bam.references, bam.lengths))

        if args.regions is not None:
            regions = parse_regions(args.regions, ref_lengths=ref_lengths)
        else:
            regions = [
                Region(ref_name=r, start=0, end=ref_lengths[r])
                for r in bam.references
            ]

    if args.proportional:
        worker = functools.partial(subsample_region_proportionally, args=args)
    else:
        worker = functools.partial(subsample_region_uniformly, args=args)

    with ProcessPoolExecutor(max_workers=args.threads) as executor:
        enough_depth = list(executor.map(worker, regions))

    if args.any_fail and not all(enough_depth):
        raise RuntimeError(
            'Insufficient read coverage for one or more requested regions.')
    if args.all_fail and not any(enough_depth):
        raise RuntimeError(
            'Insufficient read coverage for all requested regions.')
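subsample_region_uniformly and subsample_region_proportionally are module-level workers taking (region, args). Binding args with functools.partial leaves a one-argument callable that executor.map can apply to each region, and keeping the workers at module level makes them picklable for the process pool. A stub with the contract the code above implies (the real implementations, which write the subsampled bam, live in the module; the proportional variant has the same signature):

def subsample_region_uniformly(region, args=None):
    # Stub only: subsample reads over `region` toward a uniform target
    # depth and write them out; return True if the target depth could be
    # met, which is how the `enough_depth` flags are interpreted above.
    raise NotImplementedError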
Example #3
import argparse
import logging

import numpy as np
import pysam
from Bio import SeqIO

# parse_regions, Region and checkpoint are helpers from the surrounding
# module; a sketch of checkpoint follows this example.


def main():
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)
    parser = argparse.ArgumentParser(
        description='Subsample bam and fastx to create fastx with '
                    'uniform depth.')
    parser.add_argument('bam', help='input bam file.')
    parser.add_argument('fastx', help='input .fasta/fastq file of reads.')
    parser.add_argument('depth', type=int, help='Target depth.')
    parser.add_argument('-i',
                        '--ifmt',
                        default='fasta',
                        choices=['fasta', 'fastq'],
                        help='Input format.')
    parser.add_argument('-o',
                        '--output_prefix',
                        default='sub_sampled',
                        help='Output prefix')
    parser.add_argument(
        '-d',
        '--damping',
        default=0.5,
        type=float,
        help='Fraction of reads required to achieve target depth to add at '
             'each iteration. Setting 1 will achieve target depth in one '
             'iteration, but will likely result in excess depth in places, '
             'and thus less uniform depth.')
    parser.add_argument('-c',
                        '--chkpnt',
                        default=100,
                        type=int,
                        help='Frequency at which to write depths and reads.')
    parser.add_argument('-r',
                        '--regions',
                        nargs='+',
                        help='Only process given regions.')
    parser.add_argument('-s',
                        '--stride',
                        type=int,
                        default=1,
                        help='Stride in genomic coordinates.')
    parser.add_argument('-D',
                        '--direction',
                        choices=['fwd', 'rev'],
                        help='Sample only forward or reverse reads.')

    args = parser.parse_args()

    bam = pysam.AlignmentFile(args.bam)
    ref_lengths = dict(zip(bam.references, bam.lengths))

    if args.regions is not None:
        regions = parse_regions(args.regions, ref_lengths=ref_lengths)

    else:
        regions = [
            Region(ref_name=r, start=0, end=ref_lengths[r])
            for r in bam.references
        ]

    # All three entries must be callables; the default (no --direction)
    # keeps every read.
    _read_filter_ = {
        'fwd': lambda r: not r.is_reverse,
        'rev': lambda r: r.is_reverse,
        None: lambda r: True,
    }

    # Index the input reads once; the index is reused for every region.
    fastx_ndx = SeqIO.index(args.fastx, args.ifmt)

    for region in regions:
        reads_kept = set()
        bins = np.arange(region.start, region.end, args.stride)
        msg = 'Processing region {}:{}-{}'
        logging.info(msg.format(region.ref_name, bins[0], bins[-1]))
        coverage = np.zeros(len(bins))
        count = 0
        low_cov_sites = {}
        prefix = '{}_{}X'.format(args.output_prefix, args.depth)
        if args.direction is not None:
            prefix = '{}_{}'.format(prefix, args.direction)
        while True:
            bin_i = np.argmin(coverage)
            if coverage[bin_i] >= args.depth:
                break
            count += 1
            if count % args.chkpnt == 0:
                logging.info('Min depth {} at {} (Target depth {})'.format(
                    coverage[bin_i], bins[bin_i], args.depth))
                checkpoint(region.ref_name, bins, coverage, reads_kept,
                           low_cov_sites, prefix)
            pos = bins[bin_i]
            reads = [
                r for r in bam.fetch(
                    contig=region.ref_name, start=pos, end=pos + 1)
                if _read_filter_[args.direction](r)
            ]
            reads_set = set((r.query_name for r in reads))
            reads_in_common = reads_kept.intersection(reads_set)
            reads_not_used = reads_set.difference(reads_in_common)
            if len(reads_not_used) == 0:
                low_cov_sites[bins[bin_i]] = coverage[bin_i]
                coverage[bin_i] = np.inf
                logging.info('Insufficient depth ({}X) at {}:{}'.format(
                    low_cov_sites[bins[bin_i]], region.ref_name, pos))
                continue
            n_reads_to_add = max(
                int(round(args.damping * (args.depth - coverage[bin_i]), 0)),
                1)
            logging.info('Adding {} reads at {}:{} (depth {})'.format(
                n_reads_to_add, region.ref_name, pos, coverage[bin_i]))
            n_reads_to_add = min(n_reads_to_add, len(reads_not_used))
            reads_to_add = np.random.choice(list(reads_not_used),
                                            n_reads_to_add,
                                            replace=False)
            reads_kept.update(reads_to_add)
            # update coverage
            r_objs = [r for r in reads if r.query_name in reads_to_add]
            for r_obj in r_objs:
                start_i = max((r_obj.reference_start - bins[0]) // args.stride,
                              0)
                end_i = min((r_obj.reference_end - bins[0]) // args.stride,
                            len(bins))
                coverage[start_i:end_i] += 1

        # write final depth and reads
        checkpoint(region.ref_name, bins, coverage, reads_kept, low_cov_sites,
                   prefix)
        output = '{}_{}.{}'.format(prefix, region.ref_name, args.ifmt)
        logging.info('Writing {} sequences to {}'.format(
            len(reads_kept), output))
        with open(output, 'w') as fh:
            seqs = (fastx_ndx[k] for k in reads_kept)
            SeqIO.write(seqs, fh, args.ifmt)
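checkpoint is another module helper; from its call sites it receives the reference name, the bin coordinates, the current coverage array, the kept read names, the low-coverage sites, and the output prefix. A plausible sketch, assuming simple tab-separated outputs (file names are illustrative, not the module's actual ones):

import pandas as pd


def checkpoint(ref_name, bins, coverage, reads_kept, low_cov_sites, prefix):
    """Write the current depth profile and kept read names to disk."""
    pd.DataFrame({'pos': bins, 'depth': coverage}).to_csv(
        '{}_{}.depth.txt'.format(prefix, ref_name), sep='\t', index=False)
    with open('{}_{}.reads.txt'.format(prefix, ref_name), 'w') as fh:
        fh.write('\n'.join(sorted(reads_kept)) + '\n')
    with open('{}_{}.low_cov.txt'.format(prefix, ref_name), 'w') as fh:
        for pos, depth in sorted(low_cov_sites.items()):
            fh.write('{}\t{}\n'.format(pos, depth))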
Example #4
import argparse
import logging
import os

import numpy as np
import pandas as pd
import pysam

# parse_regions and Region are helpers from the surrounding module
# (sketched after example 1).


def main():
    logging.basicConfig(format='[%(asctime)s - %(name)s] %(message)s',
                        datefmt='%H:%M:%S',
                        level=logging.INFO)
    parser = argparse.ArgumentParser(
        description='Calculate read coverage depth from a bam.')
    parser.add_argument('bam', help='Input bam file.')
    parser.add_argument('-r',
                        '--regions',
                        nargs='+',
                        help='Only process given regions.')
    parser.add_argument('-p',
                        '--prefix',
                        help='Prefix for output, defaults to basename of bam.')
    parser.add_argument('-s',
                        '--stride',
                        type=int,
                        default=1,
                        help='Stride in genomic coordinates.')

    args = parser.parse_args()

    bam = pysam.AlignmentFile(args.bam)
    ref_lengths = dict(zip(bam.references, bam.lengths))

    if args.regions is not None:
        regions = parse_regions(args.regions, ref_lengths=ref_lengths)

    else:
        regions = [
            Region(ref_name=r, start=0, end=ref_lengths[r])
            for r in bam.references
        ]

    for region in regions:
        bins = np.arange(region.start, region.end, args.stride)
        msg = 'Processing reference {}:{}-{}'
        logging.info(msg.format(region.ref_name, bins[0], bins[-1]))
        coverage_by_is_rev = {
            True: np.zeros(len(bins)),
            False: np.zeros(len(bins))
        }
        for r_obj in bam.fetch(contig=region.ref_name,
                               start=region.start,
                               end=region.end):
            start_i = max((r_obj.reference_start - bins[0]) // args.stride, 0)
            end_i = min((r_obj.reference_end - bins[0]) // args.stride,
                        len(bins))
            coverage_by_is_rev[r_obj.is_reverse][start_i:end_i] += 1

        # write final depth
        prefix = args.prefix
        if prefix is None:
            prefix = os.path.splitext(os.path.basename(args.bam))[0]

        depth_fp = '{}_{}_{}_{}.depth.txt'.format(prefix, region.ref_name,
                                                  region.start, region.end)
        pd.DataFrame({
            'pos': bins,
            'depth': coverage_by_is_rev[True] + coverage_by_is_rev[False],
            'depth_fwd': coverage_by_is_rev[False],
            'depth_rev': coverage_by_is_rev[True],
        }).to_csv(depth_fp, sep='\t', index=False)
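The bin arithmetic in the fetch loop maps a read's reference span to bin indices by subtracting the first bin coordinate and integer-dividing by the stride; note that the floor on reference_end drops the last partially covered bin. A toy check of that mapping, with made-up coordinates:

import numpy as np

stride = 1000
bins = np.arange(0, 5000, stride)   # bin starts: 0, 1000, 2000, 3000, 4000
coverage = np.zeros(len(bins))

# A read aligned to reference coordinates [1500, 3500).
ref_start, ref_end = 1500, 3500
start_i = max((ref_start - bins[0]) // stride, 0)      # -> 1
end_i = min((ref_end - bins[0]) // stride, len(bins))  # -> 3
coverage[start_i:end_i] += 1   # increments bins 1 and 2 only
print(coverage)                # [0. 1. 1. 0. 0.]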