Ejemplo n.º 1
0
def test_bedcov_split_lines():
    """Check that list-style ``pysam.bedcov`` output has 4 or 5 columns.

    Runs bedcov with ``split_lines=True`` (pysam 0.8.X style output, which
    returns a list of lines) on the ex1 BED/BAM pair under ``./pysam_data``
    and verifies every line splits into 4 or 5 tab-separated fields.
    """
    bed_path = "./pysam_data/ex1.bed"
    bam_path = "./pysam_data/ex1.bam"
    failure_template = "bedcov should give tab delimited output with 4 or 5 fields.  Split line (%s) gives %d fields."
    for record in pysam.bedcov(bed_path, bam_path, split_lines=True):
        columns = record.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, failure_template % (columns, n_columns)
Ejemplo n.º 2
0
def bedcov(bed_fname, bam_fname):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region. Reads with MAPQ 0 are excluded
    by passing ``-Q 1`` to bedcov.

    This is a generator, so the errors below surface when iteration starts,
    not when the function is called.

    Args:
        bed_fname: Path of the BED file listing the regions.
        bam_fname: Path of the BAM file to measure.

    Yields:
        ``(chrom, start, end, name, count, mean_depth)`` for each output
        line; ``count`` and ``mean_depth`` are 0 when ``end - start`` is not
        positive.

    Raises:
        ValueError: If pysam reports a samtools error, or bedcov returns
            nothing.
        RuntimeError: If an output line does not split into exactly five
            tab-separated fields.
    """
    # Count bases in each region; exclude 0-MAPQ reads
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, '-Q', '1')
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s"
                         % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r"
                         % (bed_fname, bam_fname))
    # pysam may hand back one big string instead of a list of lines;
    # iterating that string directly would walk it character by character.
    if isinstance(lines, str):
        lines = lines.splitlines()
    # Return an iterable...
    for line in lines:
        try:
            chrom, start_s, end_s, name, basecount_s = line.split('\t')
        except ValueError:
            # Only a wrong field count is expected here; anything else
            # (e.g. KeyboardInterrupt) must not be masked.
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = map(int, (start_s, end_s, basecount_s.strip()))
        span = end - start
        if span > 0:
            # Algebra from above
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        yield chrom, start, end, name, count, mean_depth
Ejemplo n.º 3
0
def bedcov(bed_fname, bam_fname, min_mapq):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.

    This is a generator, so the errors below surface when iteration starts,
    not when the function is called.

    Args:
        bed_fname: Path of the BED file listing the regions.
        bam_fname: Path of the BAM file to measure.
        min_mapq: Minimum mapping quality; when greater than 0 it is passed
            to bedcov as ``-Q <min_mapq>``, otherwise no filter is requested.

    Yields:
        ``(chrom, start, end, name, count, mean_depth)`` for each output
        line; ``count`` and ``mean_depth`` are 0 when ``end - start`` is not
        positive.

    Raises:
        ValueError: If pysam reports a samtools error, or bedcov returns
            nothing.
        RuntimeError: If an output line does not split into exactly five
            tab-separated fields.
    """
    # Count bases in each region; exclude low-MAPQ reads
    if min_mapq > 0:
        bedcov_args = ['-Q', str(min_mapq)]
    else:
        bedcov_args = []
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, *bedcov_args)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s"
                         % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r"
                         % (bed_fname, bam_fname))
    # pysam may hand back one big string instead of a list of lines;
    # iterating that string directly would walk it character by character.
    if isinstance(lines, str):
        lines = lines.splitlines()
    # Return an iterable...
    for line in lines:
        try:
            chrom, start_s, end_s, name, basecount_s = line.split('\t')
        except ValueError:
            # Only a wrong field count is expected here; anything else
            # (e.g. KeyboardInterrupt) must not be masked.
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = map(int, (start_s, end_s, basecount_s.strip()))
        span = end - start
        if span > 0:
            # Algebra from above
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        yield chrom, start, end, name, count, mean_depth
Ejemplo n.º 4
0
def parallelCov(cov_args):
    """Compute the mean coverage of every BED interval for one BAM file.

    Args:
        cov_args: ``(bam, bed)`` pair of file paths, packed into a single
            argument.

    Returns:
        A dict with two entries: ``"<sample>_mean_cvg"`` (where ``<sample>``
        is the BAM basename up to its first underscore) mapping to the list
        of per-interval mean coverages as strings, and ``"Target"`` mapping
        to the matching list of ``"chrom:start-end"`` labels.
    """
    bam, bed = cov_args
    mq = 0  # can param min mapping quality if desired
    sample = os.path.basename(bam).split('_')[0]
    print("Generating coverage metrics for: " + sample)
    sys.stdout.flush()

    # str(), not bytes(): on Python 3 bytes(0) is b'' and bytes(5) is five
    # NUL bytes, neither of which is the decimal text samtools expects.
    raw = pysam.bedcov(bed, bam, '-Q', str(mq), split_lines=False)

    targets = []
    sample_cov = []
    for line in raw.splitlines():
        fields = line.split('\t')
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        # bedcov appends the base count after the BED columns, so take the
        # last field rather than assuming the BED has exactly four columns.
        coverage = int(fields[-1])
        span = end - start
        if span > 0:
            meancov = str(coverage / float(span))
        else:
            # Zero-length (or inverted) interval: report no coverage
            # instead of dividing by zero.
            meancov = str(0.0)
        targets.append(chrom + ":" + str(start) + "-" + str(end))
        sample_cov.append(meancov)

    return {sample + "_mean_cvg": sample_cov, 'Target': targets}
Ejemplo n.º 5
0
def test_bedcov():
    """Check that string-style ``pysam.bedcov`` output has 4 or 5 columns.

    Runs bedcov with ``split_lines=False`` (pysam 0.9.X style output, which
    returns a string that needs to be split by newlines) on the ex1 BED/BAM
    pair under ``./pysam_data`` and verifies every line splits into 4 or 5
    tab-separated fields.
    """
    bed_path = "./pysam_data/ex1.bed"
    bam_path = "./pysam_data/ex1.bam"
    failure_template = "bedcov should give tab delimited output with 4 or 5 fields.  Split line (%s) gives %d fields."
    raw_output = pysam.bedcov(bed_path, bam_path, split_lines=False)
    for record in raw_output.splitlines():
        columns = record.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, failure_template % (columns, n_columns)
Ejemplo n.º 6
0
def compute_coverage_with_samtools(args, input_bam, mean_coverage):
    """Compute chromosome- and GC-normalized coverage for a BAM file.

    Builds a BED file of windows with ``BedTool.window_maker``, counts the
    bases covering each window with ``pysam.bedcov``, annotates the windows
    with ``nucleotide_content``, and summarizes the coverage relative to
    ``mean_coverage`` per chromosome and per GC fraction.

    Args:
        args: Options object; this function reads ``tempdir``, ``genome``,
            ``gc_window_size`` and ``reference`` from it.
        input_bam: BAM file handed to ``pysam.bedcov``.
        mean_coverage: Divisor for the per-window coverage; also forwarded
            to ``samtools_norm_coverage_group``.

    Returns:
        A ``(norm_coverage_by_chrom, norm_coverage_by_gc)`` tuple, each the
        ``to_dict()`` result of the corresponding group-by.
    """
    # Generate GC and chromosome normalized coverage for the entire BAM file
    # Based on https://www.biostars.org/p/92744/
    logging.info(
        "Computing chromosome and GC normalized coverage for %s with samtools and bedtools",
        input_bam,
    )

    # The window BED is a named temporary file, so everything that reads it
    # by name has to run inside this ``with`` block.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".bed",
                                     dir=args.tempdir) as window_bed_file:
        # windows_bed is not used again in this function; the call is made
        # for the file it writes to window_bed_file.name.
        # pylint: disable=unexpected-keyword-arg
        windows_bed = bed.BedTool().window_maker(g=args.genome,
                                                 w=args.gc_window_size,
                                                 output=window_bed_file.name)

        # bedcov text -> BedTool -> nucleotide content -> DataFrame.
        # NOTE(review): the usecols positions presumably pick the bedcov
        # columns plus the base-composition columns appended by
        # nucleotide_content -- confirm against the bedtools version in use.
        windows_table = (
            # pylint: disable=no-member,unexpected-keyword-arg
            bed.BedTool(pysam.bedcov(window_bed_file.name, input_bam,
                                     "--reference", args.reference),
                        from_string=True).nucleotide_content(
                            fi=args.reference).to_dataframe(
                                index_col=False,
                                header=0,
                                usecols=[0, 1, 2, 3, 7, 8, 10, 11, 12],
                                names=[
                                    "chrom",
                                    "start",
                                    "end",
                                    "bases",
                                    "num_C",
                                    "num_G",
                                    "num_N",
                                    "num_oth",
                                    "seq_len",
                                ],
                                dtype={"chrom": str},
                            ))

    # Remove windows with no alignable data
    # (alignable length = window length minus N and "other" bases)
    windows_table["align_len"] = (windows_table.seq_len - windows_table.num_N -
                                  windows_table.num_oth)
    windows_table = windows_table[windows_table.align_len != 0]

    # Compute normalized coverage by chromosome
    norm_coverage_by_chrom = (windows_table.groupby("chrom").apply(
        samtools_norm_coverage_group, mean_coverage).to_dict())

    # Compute normalized coverage by GC bin
    # GC fraction is rounded to 2 decimals so windows fall into discrete bins
    gc_fraction = np.round(
        (windows_table.num_G + windows_table.num_C) / windows_table.align_len,
        2)
    norm_coverage = (windows_table.bases /
                     windows_table.align_len) / mean_coverage
    # Per GC bin: number of windows and their mean normalized coverage
    norm_coverage_by_gc = (norm_coverage.groupby(gc_fraction).agg(
        ["count", "mean"]).to_dict())

    return norm_coverage_by_chrom, norm_coverage_by_gc
Ejemplo n.º 7
0
def test_bedcov_split_lines():
    """Check that list-style ``pysam.bedcov`` output has 4 or 5 columns.

    Runs bedcov with ``split_lines=True`` (pysam 0.8.X style output, which
    returns a list of lines) on ``ex1.bed`` / ``ex1.bam`` from
    ``BAM_DATADIR`` and verifies every line splits into 4 or 5 tab-separated
    fields.
    """
    bed_path = os.path.join(BAM_DATADIR, "ex1.bed")
    bam_path = os.path.join(BAM_DATADIR, "ex1.bam")
    failure_template = ("bedcov should give tab delimited output with 4 or 5 fields. "
                        "Split line (%s) gives %d fields.")
    for record in pysam.bedcov(bed_path, bam_path, split_lines=True):
        columns = record.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, failure_template % (columns, n_columns)
Ejemplo n.º 8
0
def test_bedcov_split_lines():
    """Check that list-style ``pysam.bedcov`` output has 4 or 5 columns.

    Runs bedcov with ``split_lines=True`` (pysam 0.8.X style output, which
    returns a list of lines) on ``ex1.bed`` / ``ex1.bam`` from
    ``BAM_DATADIR`` and verifies every line splits into 4 or 5 tab-separated
    fields.
    """
    bed_path = os.path.join(BAM_DATADIR, "ex1.bed")
    bam_path = os.path.join(BAM_DATADIR, "ex1.bam")
    failure_template = ("bedcov should give tab delimited output with 4 or 5 fields. "
                        "Split line (%s) gives %d fields.")
    for record in pysam.bedcov(bed_path, bam_path, split_lines=True):
        columns = record.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, failure_template % (columns, n_columns)
Ejemplo n.º 9
0
def test_bedcov():
    """Check that string-style ``pysam.bedcov`` output has 4 or 5 columns.

    Runs bedcov with ``split_lines=False`` (pysam 0.9.X style output, which
    returns a string that needs to be split by newlines) on ``ex1.bed`` /
    ``ex1.bam`` from ``BAM_DATADIR`` and verifies every line splits into 4
    or 5 tab-separated fields.
    """
    bed_path = os.path.join(BAM_DATADIR, "ex1.bed")
    bam_path = os.path.join(BAM_DATADIR, "ex1.bam")
    failure_template = ("bedcov should give tab delimited output with 4 or 5 fields. "
                        "Split line (%s) gives %d fields.")
    raw_output = pysam.bedcov(bed_path, bam_path, split_lines=False)
    for record in raw_output.splitlines():
        columns = record.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, failure_template % (columns, n_columns)
Ejemplo n.º 10
0
def test_bedcov():
    """Check that string-style ``pysam.bedcov`` output has 4 or 5 columns.

    Runs bedcov with ``split_lines=False`` (pysam 0.9.X style output, which
    returns a string that needs to be split by newlines) on ``ex1.bed`` /
    ``ex1.bam`` from ``BAM_DATADIR`` and verifies every line splits into 4
    or 5 tab-separated fields.
    """
    bed_path = os.path.join(BAM_DATADIR, "ex1.bed")
    bam_path = os.path.join(BAM_DATADIR, "ex1.bam")
    failure_template = ("bedcov should give tab delimited output with 4 or 5 fields. "
                        "Split line (%s) gives %d fields.")
    raw_output = pysam.bedcov(bed_path, bam_path, split_lines=False)
    for record in raw_output.splitlines():
        columns = record.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, failure_template % (columns, n_columns)
Ejemplo n.º 11
0
def test_bedcov_split_lines():
    """Check that list-style ``pysam.bedcov`` output has 4 or 5 columns.

    Runs bedcov with ``split_lines=True`` (pysam 0.8.X style output, which
    returns a list of lines) on the ex1 BED/BAM pair under ``./pysam_data``
    and verifies every line splits into 4 or 5 tab-separated fields.
    """
    bed_path = "./pysam_data/ex1.bed"
    bam_path = "./pysam_data/ex1.bam"
    failure_template = "bedcov should give tab delimited output with 4 or 5 fields.  Split line (%s) gives %d fields."
    for record in pysam.bedcov(bed_path, bam_path, split_lines=True):
        columns = record.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, failure_template % (columns, n_columns)
Ejemplo n.º 12
0
def test_bedcov():
    """Check that string-style ``pysam.bedcov`` output has 4 or 5 columns.

    Runs bedcov with ``split_lines=False`` (pysam 0.9.X style output, which
    returns a string that needs to be split by newlines) on the ex1 BED/BAM
    pair under ``./pysam_data`` and verifies every line splits into 4 or 5
    tab-separated fields.
    """
    bed_path = "./pysam_data/ex1.bed"
    bam_path = "./pysam_data/ex1.bam"
    failure_template = "bedcov should give tab delimited output with 4 or 5 fields.  Split line (%s) gives %d fields."
    raw_output = pysam.bedcov(bed_path, bam_path, split_lines=False)
    for record in raw_output.splitlines():
        columns = record.split('\t')
        n_columns = len(columns)
        assert 4 <= n_columns <= 5, failure_template % (columns, n_columns)
Ejemplo n.º 13
0
def pysam_depth(bam, bed):
    """Get the total number of covered bases in each BED region.

    Args:
        bam: Path of the BAM file; an index named ``<bam>.bai`` must exist
            next to it.
        bed: Path of the BED file listing the regions.

    Returns:
        A list with one int per bedcov output line: the value of that
        line's last tab-separated column. Empty when bedcov prints nothing.

    Raises:
        Exception: If ``<bam>.bai`` is not an existing file.
        ValueError: If pysam reports a samtools error.
    """
    if not os.path.isfile(bam + '.bai'):
        raise Exception('index for BAM file %s isn\'t found' %(bam))

    try:
        raw = pysam.bedcov(bed, bam, split_lines=False)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. "
                         "PySAM error: %s" % (bam, bed, exc))

    # Build a real list so the result is the same on Python 2 and 3 (a bare
    # map() is a one-shot iterator on Python 3), and skip blank lines so
    # empty output yields [] instead of failing on int('').
    return [int(line.split('\t')[-1]) for line in raw.splitlines() if line]
Ejemplo n.º 14
0
def bedcov(bed_fname, bam_fname, min_mapq):
    """Measure per-region depth of a BAM file with samtools (pysam) bedcov.

    For each region of the BED file, yields ``(count, row)`` where ``row``
    is ``(chrom, start, end, gene, log2_depth, mean_depth)``. ``gene`` is
    "-" when the bedcov line carries no name column, and ``log2_depth``
    falls back to ``NULL_LOG2_COVERAGE`` when the mean depth is zero.

    Reads below ``min_mapq`` are excluded when ``min_mapq`` is positive.
    Raises ValueError if samtools fails or produces no output, and
    RuntimeError for a line with neither 4 nor 5 fields.
    """
    # Only ask for MAPQ filtering when a positive threshold was given
    mapq_flags = ['-Q', str(min_mapq)] if min_mapq > 0 else []
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, *mapq_flags)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s"
                         % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r"
                         % (bed_fname, bam_fname))
    # Accept either one big string or an already-split list of lines
    records = lines.splitlines() if isinstance(lines, basestring) else lines
    for record in records:
        parts = record.split('\t', 5)
        n_parts = len(parts)
        if n_parts == 4:
            chrom, start_s, end_s, basecount_s = parts
            gene = "-"
        elif n_parts == 5:
            chrom, start_s, end_s, gene, basecount_s = parts
        else:
            raise RuntimeError("Bad line from bedcov:\n" + record)
        start = int(start_s)
        end = int(end_s)
        basecount = int(basecount_s.strip())
        span = end - start
        if span <= 0:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        else:
            count = basecount / READ_LEN
            mean_depth = basecount / span
        if mean_depth:
            log2_depth = math.log(mean_depth, 2)
        else:
            log2_depth = NULL_LOG2_COVERAGE
        yield count, (chrom, start, end, gene, log2_depth, mean_depth)
Ejemplo n.º 15
0
def bedcov(bed_fname, bam_fname, min_mapq):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.

    Args:
        bed_fname: Path of the BED file listing the regions.
        bam_fname: Path of the BAM file to measure.
        min_mapq: Minimum mapping quality; when truthy and greater than 0 it
            is passed to bedcov as ``-Q <min_mapq>``.

    Returns:
        A pandas table parsed from the bedcov output, restricted to the
        columns chosen by ``detect_bedcov_columns``.

    Raises:
        ValueError: If pysam reports a samtools error, or bedcov returns
            nothing.
    """
    # Count bases in each region; exclude low-MAPQ reads
    cmd = [bed_fname, bam_fname]
    if min_mapq and min_mapq > 0:
        # str(), not bytes(): on Python 3 bytes(n) builds n NUL bytes
        # instead of the decimal text samtools expects.
        cmd.extend(['-Q', str(min_mapq)])
    try:
        raw = pysam.bedcov(*cmd, split_lines=False)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. "
                         "PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not raw:
        raise ValueError("BED file %r chromosome names don't match any in "
                         "BAM file %r" % (bed_fname, bam_fname))
    columns = detect_bedcov_columns(raw)
    table = pd.read_table(StringIO(raw), names=columns, usecols=columns)
    return table
Ejemplo n.º 16
0
def bedcov(bed_fname, bam_fname, min_mapq):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.

    Args:
        bed_fname: Path of the BED file listing the regions.
        bam_fname: Path of the BAM file to measure.
        min_mapq: Minimum mapping quality; when truthy and greater than 0 it
            is passed to bedcov as ``-Q <min_mapq>``.

    Returns:
        A pandas table parsed from the bedcov output, restricted to the
        columns chosen by ``detect_bedcov_columns``.

    Raises:
        ValueError: If pysam reports a samtools error, or bedcov returns
            nothing.
    """
    # Count bases in each region; exclude low-MAPQ reads
    cmd = [bed_fname, bam_fname]
    if min_mapq and min_mapq > 0:
        # str(), not bytes(): on Python 3 bytes(n) builds n NUL bytes
        # instead of the decimal text samtools expects.
        cmd.extend(['-Q', str(min_mapq)])
    try:
        raw = pysam.bedcov(*cmd, split_lines=False)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. "
                         "PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not raw:
        raise ValueError("BED file %r chromosome names don't match any in "
                         "BAM file %r" % (bed_fname, bam_fname))
    columns = detect_bedcov_columns(raw)
    table = pd.read_table(StringIO(raw), names=columns, usecols=columns)
    return table
Ejemplo n.º 17
0
def bedcov(bed_fname, bam_fname, min_mapq):
    """Measure per-region depth of a BAM file with samtools (pysam) bedcov.

    For each region of the BED file, yields
    ``(chrom, start, end, name, count, mean_depth)``. ``name`` is "-" when
    the bedcov line carries no name column.

    Reads below ``min_mapq`` are excluded when ``min_mapq`` is positive.
    Raises ValueError if samtools fails or produces no output, and
    RuntimeError for a line with neither 4 nor 5 fields.
    """
    # Only ask for MAPQ filtering when a positive threshold was given
    mapq_flags = ['-Q', str(min_mapq)] if min_mapq > 0 else []
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, *mapq_flags)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. PySAM error: %s"
                         % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in BAM file %r"
                         % (bed_fname, bam_fname))
    # Accept either one big string or an already-split list of lines
    records = lines.splitlines() if isinstance(lines, basestring) else lines
    for record in records:
        parts = record.split('\t')
        n_parts = len(parts)
        if n_parts == 4:
            chrom, start_s, end_s, basecount_s = parts
            name = "-"
        elif n_parts == 5:
            chrom, start_s, end_s, name, basecount_s = parts
        else:
            raise RuntimeError("Bad line from bedcov:\n" + record)
        start = int(start_s)
        end = int(end_s)
        basecount = int(basecount_s.strip())
        span = end - start
        if span <= 0:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        else:
            count = basecount / READ_LEN
            mean_depth = basecount / span
        yield chrom, start, end, name, count, mean_depth
Ejemplo n.º 18
0
				outfile = os.fdopen(fd, "w")
	outfile.close()
	if k % chunk_size:
		outfile.close()
		yield name
		
def bedcov(bam_fname, bed_fname, min_mapq=0):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.

    Raw bedcov output looks like ``'chr1\\t200\\t300\\t2050\\n'``.

    Args:
        bam_fname: Path of the BAM file to measure.
        bed_fname: Path of the BED file listing the regions.
        min_mapq: Minimum mapping quality; when truthy and greater than 0 it
            is passed to bedcov as ``-Q <min_mapq>``. Defaults to 0 (no
            filter requested).

    Returns:
        A pandas table parsed from the bedcov output, restricted to the
        columns chosen by ``detect_bedcov_columns``.

    Raises:
        ValueError: If pysam reports a samtools error, or bedcov returns
            nothing.
    """
    # Count bases in each region; exclude low-MAPQ reads
    cmd = [bed_fname, bam_fname]
    if min_mapq and min_mapq > 0:
        # str(), not bytes(): on Python 3 bytes(n) builds n NUL bytes
        # instead of the decimal text samtools expects.
        cmd.extend(['-Q', str(min_mapq)])
    try:
        raw = pysam.bedcov(*cmd, split_lines=False)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. "
                         "PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not raw:
        raise ValueError("BED file %r chromosome names don't match any in "
                         "BAM file %r" % (bed_fname, bam_fname))
    columns = detect_bedcov_columns(raw)
    table = pd.read_csv(StringIO(raw), sep='\t', names=columns, usecols=columns)
    return table

def detect_bedcov_columns(text):
    """Determine which 'bedcov' output columns to keep.

    Format is the input BED plus a final appended column with the count of
    basepairs mapped within each row's region. The input BED might have 3