Esempio n. 1
0
def main():
    """Print the average, minimum and maximum score of every interval.

    Positional arguments (parsed by ``doc_optparse`` from the module
    docstring) are the score file, the interval file and an optional output
    file name; without the latter the report goes to ``sys.stdout``.  The
    ``binned`` option selects the score loader and the ``mask`` option names
    an optional file of positions to ignore.

    For each interval line one tab-separated record
    ``chrom start stop avg min max`` is written.  The three statistics are
    the string ``nan`` when no position of the interval contributed a score.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)
    try:
        score_fname = args[0]
        interval_fname = args[1]
        if len(args) > 2:
            out_file = open(args[2], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()

    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        scores_by_chrom = load_scores_wiggle(score_fname)

    if mask_fname:
        with open(mask_fname) as mask_file:
            masks = binned_bitsets_from_file(mask_file)
    else:
        masks = None

    try:
        with open(interval_fname) as interval_file:
            for line in interval_file:
                fields = line.split()
                chrom, start, stop = fields[0], int(fields[1]), int(fields[2])
                total = 0
                count = 0
                min_score = 100000000
                max_score = -100000000
                for i in range(start, stop):
                    # NOTE(review): this truthiness test also drops scores
                    # that are exactly 0 - confirm that is intended.
                    if chrom in scores_by_chrom and scores_by_chrom[chrom][i]:
                        # Skip if base is masked
                        if masks and chrom in masks:
                            if masks[chrom][i]:
                                continue
                        # Get the score, only count if not 'nan'
                        score = scores_by_chrom[chrom][i]
                        if not isNaN(score):
                            total += score
                            count += 1
                            max_score = max(score, max_score)
                            min_score = min(score, min_score)
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"

                print("\t".join(
                    map(str, [chrom, start, stop, avg, min_score, max_score])),
                      file=out_file)
    finally:
        # Only close a file this function opened itself; closing sys.stdout
        # would break every later write by the calling process.
        if out_file is not sys.stdout:
            out_file.close()
Esempio n. 2
0
def main():
    """Call ``do_windows`` over every region, leaving out excluded stretches.

    ``sys.argv[1]`` is the region file, ``sys.argv[2]`` the file of excluded
    intervals and ``sys.argv[3]`` the integer window size.  Each region line
    supplies a chromosome name and an end coordinate; the start is always 0.
    A chromosome without exclusions is handed to ``do_windows`` in one piece,
    otherwise each run of clear bits (clipped to the region end) is handed
    over separately.
    """
    region_fname, exclude_fname, window_size = sys.argv[1], sys.argv[2], int(sys.argv[3])
    with open(exclude_fname) as exclude_file:
        exclude_bitsets = binned_bitsets_from_file(exclude_file)
    with open(region_fname) as region_file:
        for line in region_file:
            fields = line.split()
            chrom, start, end = fields[0], 0, int(fields[1])
            if chrom not in exclude_bitsets:
                do_windows(chrom, start, end, window_size)
                continue
            bits = exclude_bitsets[chrom]
            assert end < bits.size
            e = 0
            while True:
                # s..e is the next stretch that is not excluded
                s = bits.next_clear(e)
                if s > end:
                    break
                e = bits.next_set(s)
                do_windows(chrom, s, min(e, end), window_size)
def main():
    """Run ``do_windows`` on the non-excluded parts of every region.

    Command line: region file, exclude file, window size (an integer), read
    from ``sys.argv[1:4]``.  A region line holds a chromosome name followed
    by an end coordinate; regions always start at 0.  When the chromosome has
    an exclusion bitset, only the runs of clear bits up to the region end are
    passed to ``do_windows``.
    """
    region_fname, exclude_fname, window_size = sys.argv[1], sys.argv[2], int(sys.argv[3])
    # Both inputs are closed as soon as they have been consumed
    with open(exclude_fname) as exclude_file:
        exclude_bitsets = binned_bitsets_from_file(exclude_file)
    with open(region_fname) as region_file:
        for line in region_file:
            fields = line.split()
            chrom, start, end = fields[0], 0, int(fields[1])
            if chrom not in exclude_bitsets:
                do_windows(chrom, start, end, window_size)
                continue
            bits = exclude_bitsets[chrom]
            assert end < bits.size
            e = 0
            while True:
                s = bits.next_clear(e)
                if s > end:
                    break
                e = bits.next_set(s)
                # Clip the clear run so it never extends past the region
                do_windows(chrom, s, min(e, end), window_size)
def print_bits_as_bed(bits):
    """Print every run of set bits in *bits* as one BED line on stdout.

    Each line is ``chrom<TAB>start<TAB>end`` where ``start`` is the first set
    position of a run and ``end`` the first clear position after it.  The
    chromosome name is taken from the module-level variable ``chrom``, which
    the caller must have set before calling this function.
    """
    end = 0
    while True:
        start = bits.next_set(end)
        if start == bits.size:
            break
        end = bits.next_clear(start)
        # Single-argument call form: valid under both Python 2 and Python 3
        print("%s\t%d\t%d" % (chrom, start, end))

options, args = doc_optparse.parse(__doc__)
try:
    in_fname, in2_fname = args
except ValueError:
    # Wrong number of positional arguments
    doc_optparse.exit()

# Read both BED files into per-chromosome bitsets
with open(in_fname) as in_file:
    bitsets1 = binned_bitsets_from_file(in_file)
with open(in2_fname) as in2_file:
    bitsets2 = binned_bitsets_from_file(in2_file)

# For every chromosome of the first file, clear the bits that are also set in
# the second file (AND with its inverse) and print what is left.
for chrom in bitsets1:
    bits1 = bitsets1[chrom]
    if chrom in bitsets2:
        bits2 = bitsets2[chrom]
        bits2.invert()
        bits1.iand(bits2)
    print_bits_as_bed(bits1)
    
def main():
    """Append average, minimum and maximum score columns to interval lines.

    Positional arguments (parsed by ``doc_optparse`` from the module
    docstring): score file, interval file, the 1-based chrom / start / stop
    column numbers and an optional output file name (default ``sys.stdout``).
    The ``binned``, ``mask`` and ``chrom_buffer`` options select the score
    loader, an optional mask file and the buffer size given to the wiggle
    loader (3 when missing or not an integer).

    Lines starting with ``#`` are copied to the output unchanged and empty
    lines are ignored.  A line whose coordinates cannot be parsed is skipped;
    the number of skipped lines and the first offender are reported on stdout
    once the whole file has been processed.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)

    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len(args) > 5:
            out_file = open(args[5], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()

    if score_fname == 'None':
        stop_err('This tool works with data from genome builds hg16, hg17 or hg18.  Click the pencil icon in your history item to set the genome build if appropriate.')

    try:
        # Column numbers are given 1-based; convert to list indexes
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except Exception:
        stop_err('Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.')

    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err('Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.')

    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        try:
            chrom_buffer = int(options.chrom_buffer)
        except Exception:
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle(score_fname, chrom_buffer)

    if mask_fname:
        with open(mask_fname) as mask_file:
            masks = binned_bitsets_from_file(mask_file)
    else:
        masks = None

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    data_lines = 0  # non-empty, non-comment lines seen

    try:
        with open(interval_fname) as interval_file:
            for i, line in enumerate(interval_file):
                line = line.rstrip('\r\n')
                if not line:
                    continue
                if line.startswith('#'):
                    # We'll save the original comments
                    print(line, file=out_file)
                    continue

                data_lines += 1
                fields = line.split()
                try:
                    chrom = fields[chrom_col]
                    start = int(fields[start_col])
                    stop = int(fields[stop_col])
                except Exception:
                    # Count every unusable line exactly once and remember
                    # the first one for the final report.
                    skipped_lines += 1
                    if not invalid_line:
                        first_invalid_line = i + 1
                        invalid_line = line
                    continue

                total = 0
                count = 0
                min_score = 100000000
                max_score = -100000000
                if chrom in scores_by_chrom:
                    for j in range(start, stop):
                        try:
                            # Skip if base is masked
                            if masks and chrom in masks:
                                if masks[chrom][j]:
                                    continue
                            # Get the score, only count if not 'nan'
                            score = scores_by_chrom[chrom][j]
                            if not isnan(score):
                                total += score
                                count += 1
                                max_score = max(score, max_score)
                                min_score = min(score, min_score)
                        except Exception:
                            # Best effort: a position the score or mask data
                            # cannot serve simply does not contribute.
                            continue
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"

                # Build the resulting line of data
                out_line = list(fields)
                out_line.append(avg)
                out_line.append(min_score)
                out_line.append(max_score)

                print("\t".join(map(str, out_line)), file=out_file)
    finally:
        # Never close sys.stdout: the summary below is still printed to it
        if out_file is not sys.stdout:
            out_file.close()

    if skipped_lines > 0:
        print('Data issue: skipped %d invalid lines starting at line #%d which is "%s"' % (skipped_lines, first_invalid_line, invalid_line))
        if skipped_lines == data_lines:
            # Not a single data line could be used
            print('Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.')
Esempio n. 6
0
covered in both of the inputs will be output.

usage: %prog bed_file_1 bed_file_2
"""
from __future__ import print_function

from bx.bitset_builders import binned_bitsets_from_file
from bx.cookbook import doc_optparse

options, args = doc_optparse.parse(__doc__)
try:
    in_fname, in2_fname = args
except ValueError:
    doc_optparse.exit()

# Load each BED file and close it as soon as it has been read
with open(in_fname) as in_file:
    bits1 = binned_bitsets_from_file(in_file)
with open(in2_fname) as in2_file:
    bits2 = binned_bitsets_from_file(in2_file)

# Intersect in place, keeping only chromosomes present in both inputs
bitsets = dict()

for key in bits1:
    if key in bits2:
        bits1[key].iand(bits2[key])
        bitsets[key] = bits1[key]
for chrom in bitsets:
    bits = bitsets[chrom]
    end = 0
    while True:
        start = bits.next_set(end)
        if start == bits.size:
def main():
    """Append average, minimum and maximum score columns to interval lines.

    Positional arguments (parsed by ``doc_optparse`` from the module
    docstring): score file, interval file, the 1-based chrom / start / stop
    column numbers and an optional output file name (default ``sys.stdout``).
    The ``binned``, ``mask`` and ``chrom_buffer`` options select the score
    loader, an optional mask file and the buffer size given to the wiggle
    loader (3 when missing or not an integer).

    Lines starting with ``#`` are copied to the output unchanged and empty
    lines are ignored.  A line whose coordinates cannot be parsed is skipped;
    the number of skipped lines and the first offender are reported on stdout
    once the whole file has been processed.
    """
    # Parse command line
    options, args = doc_optparse.parse(__doc__)

    try:
        score_fname = args[0]
        interval_fname = args[1]
        chrom_col = args[2]
        start_col = args[3]
        stop_col = args[4]
        if len(args) > 5:
            out_file = open(args[5], 'w')
        else:
            out_file = sys.stdout
        binned = bool(options.binned)
        mask_fname = options.mask
    except Exception:
        doc_optparse.exit()

    if score_fname == 'None':
        stop_err(
            'This tool works with data from genome builds hg16, hg17 or hg18.  Click the pencil icon in your history item to set the genome build if appropriate.'
        )

    try:
        # Column numbers are given 1-based; convert to list indexes
        chrom_col = int(chrom_col) - 1
        start_col = int(start_col) - 1
        stop_col = int(stop_col) - 1
    except Exception:
        stop_err(
            'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.'
        )

    if chrom_col < 0 or start_col < 0 or stop_col < 0:
        stop_err(
            'Chrom, start & end column not properly set, click the pencil icon in your history item to set these values.'
        )

    if binned:
        scores_by_chrom = load_scores_ba_dir(score_fname)
    else:
        try:
            chrom_buffer = int(options.chrom_buffer)
        except Exception:
            chrom_buffer = 3
        scores_by_chrom = load_scores_wiggle(score_fname, chrom_buffer)

    if mask_fname:
        with open(mask_fname) as mask_file:
            masks = binned_bitsets_from_file(mask_file)
    else:
        masks = None

    skipped_lines = 0
    first_invalid_line = 0
    invalid_line = ''
    data_lines = 0  # non-empty, non-comment lines seen

    try:
        with open(interval_fname) as interval_file:
            for i, line in enumerate(interval_file):
                line = line.rstrip('\r\n')
                if not line:
                    continue
                if line.startswith('#'):
                    # We'll save the original comments
                    print(line, file=out_file)
                    continue

                data_lines += 1
                fields = line.split()
                try:
                    chrom = fields[chrom_col]
                    start = int(fields[start_col])
                    stop = int(fields[stop_col])
                except Exception:
                    # Count every unusable line exactly once and remember
                    # the first one for the final report.
                    skipped_lines += 1
                    if not invalid_line:
                        first_invalid_line = i + 1
                        invalid_line = line
                    continue

                total = 0
                count = 0
                min_score = 100000000
                max_score = -100000000
                if chrom in scores_by_chrom:
                    for j in range(start, stop):
                        try:
                            # Skip if base is masked
                            if masks and chrom in masks:
                                if masks[chrom][j]:
                                    continue
                            # Get the score, only count if not 'nan'
                            score = scores_by_chrom[chrom][j]
                            if not isnan(score):
                                total += score
                                count += 1
                                max_score = max(score, max_score)
                                min_score = min(score, min_score)
                        except Exception:
                            # Best effort: a position the score or mask data
                            # cannot serve simply does not contribute.
                            continue
                if count > 0:
                    avg = total / count
                else:
                    avg = "nan"
                    min_score = "nan"
                    max_score = "nan"

                # Build the resulting line of data
                out_line = list(fields)
                out_line.append(avg)
                out_line.append(min_score)
                out_line.append(max_score)

                print("\t".join(map(str, out_line)), file=out_file)
    finally:
        # Never close sys.stdout: the summary below is still printed to it
        if out_file is not sys.stdout:
            out_file.close()

    if skipped_lines > 0:
        print(
            'Data issue: skipped %d invalid lines starting at line #%d which is "%s"'
            % (skipped_lines, first_invalid_line, invalid_line))
        if skipped_lines == data_lines:
            # Not a single data line could be used
            print(
                'Consider changing the metadata for the input dataset by clicking on the pencil icon in the history item.'
            )
Esempio n. 8
0
def main():
    """Compute a per-window p-value from SNP and divergence counts.

    Command line: ``[options] maf_file snp_file neutral_file window_size
    step_size``.  Options: ``-o/--outfile`` (write there instead of printing),
    ``-s/--species`` (default ``panTro2``) and ``-b/--build`` (default
    ``hg18``).

    Steps, in order (the bitsets are modified in place, so order matters):

    1. Load the SNP and neutral interval files into per-chromosome bitsets.
    2. Walk every MAF block that has a component for both the build and the
       species, and set a "diverged" bit wherever the two aligned characters
       differ and the position is not a SNP.
    3. Copy the SNP and divergence bitsets, restrict the originals to the
       neutral intervals and the copies to the inverted neutral intervals.
    4. Slide a window over each chromosome, count set bits in the four
       bitsets and emit one tab-separated line per window with the counts and
       the p-value.
    """
    parser = OptionParser(
        usage=
        "usage: %prog [options] maf_file snp_file neutral_file window_size step_size"
    )
    parser.add_option("-o", "--outfile", help="Specify file for output")
    parser.add_option("-s", "--species", type="string", default="panTro2")
    parser.add_option("-b", "--build", type="string", default="hg18")
    (options, args) = parser.parse_args()

    if len(args) != 5:
        parser.error("Incorrect number of arguments")
    else:
        maf_filename = args[0]
        snp_filename = args[1]
        neutral_filename = args[2]
        window_size = int(args[3])
        step_size = int(args[4])

    # out_file only exists when -o was given; every later use is guarded by
    # the same test.
    if options.outfile is not None:
        out_file = open(options.outfile, 'w')

    # Generate snp and neutral bitsets
    AR_snp_bitsets = binned_bitsets_from_file(open(snp_filename))
    neutral_bitsets = binned_bitsets_from_file(open(neutral_filename))

    # Generate divergence bitset from maf file
    AR_div_bitsets = dict()
    chr_lens = dict()
    reader = bx.align.maf.Reader(open(maf_filename))

    for block in reader:
        comp1 = block.get_component_by_src_start(options.build)
        comp2 = block.get_component_by_src_start(options.species)

        # Blocks lacking either component are ignored
        if comp1 is None or comp2 is None:
            continue

        # Chromosome, start, and stop of reference species alignment
        chr = comp1.src.split('.')[1]
        start = comp1.start

        # Get or create bitset for this chromosome
        if chr in AR_div_bitsets:
            bitset = AR_div_bitsets[chr]
        else:
            bitset = AR_div_bitsets[chr] = bx.bitset.BinnedBitSet()
            # Length recorded once, from the first block seen for this chromosome
            chr_lens[chr] = comp1.get_src_size()

        # Iterate over text and set diverged bit
        pos = start
        for ch1, ch2 in zip(comp1.text.upper(), comp2.text.upper()):
            # A gap in the reference consumes no reference position
            if ch1 == '-':
                continue
            # A gap in the other species advances the position without marking it
            if ch2 == '-':
                pos += 1
                continue

            # NOTE(review): assumes chr is present in the SNP bitsets; with a
            # plain dict a missing chromosome raises KeyError - confirm.
            if ch1 != ch2 and not AR_snp_bitsets[chr][pos]:
                bitset.set(pos)
            pos += 1

    # Debugging Code (disabled)
    #
    #     for chr in AR_div_bitsets:
    #         for pos in range(0, AR_div_bitsets[chr].size):
    #             if AR_div_bitsets[pos]:
    #                 print >> sys.stderr, chr, pos, pos+1

    # Copy div and snp bitsets
    nonAR_snp_bitsets = dict()
    for chr in AR_snp_bitsets:
        nonAR_snp_bitsets[chr] = bx.bitset.BinnedBitSet()
        nonAR_snp_bitsets[chr].ior(AR_snp_bitsets[chr])

    nonAR_div_bitsets = dict()
    for chr in AR_div_bitsets:
        nonAR_div_bitsets[chr] = bx.bitset.BinnedBitSet()
        nonAR_div_bitsets[chr].ior(AR_div_bitsets[chr])

    # Generates AR snps by intersecting with neutral intervals
    # NOTE(review): neutral_bitsets[chr] is indexed without a membership
    # test here and below - presumably every chromosome is expected to be in
    # the neutral file; confirm.
    for chr in AR_snp_bitsets:
        AR_snp_bitsets[chr].iand(neutral_bitsets[chr])

    # Generates AR divs by intersecting with neutral intervals
    for chr in AR_div_bitsets:
        AR_div_bitsets[chr].iand(neutral_bitsets[chr])

    # Inverts the neutral intervals so now represents nonAR
    for chr in neutral_bitsets:
        neutral_bitsets[chr].invert()

    # Generates nonAR snps by intersecting with masked neutral intervals
    for chr in nonAR_snp_bitsets:
        nonAR_snp_bitsets[chr].iand(neutral_bitsets[chr])
    # Generates nonAR divs by intersecting with masked neutral intervals
    for chr in nonAR_div_bitsets:
        nonAR_div_bitsets[chr].iand(neutral_bitsets[chr])

    # Only chromosomes that appeared in the MAF file get windows
    for chr in AR_div_bitsets:
        for window in range(0, chr_lens[chr] - window_size, step_size):
            # neutral_size = neutral_bitsets[chr].count_range(window, window_size)
            # if neutral_size < 9200: continue
            AR_snp = AR_snp_bitsets[chr].count_range(window, window_size)
            AR_div = AR_div_bitsets[chr].count_range(window, window_size)
            nonAR_snp = nonAR_snp_bitsets[chr].count_range(window, window_size)
            nonAR_div = nonAR_div_bitsets[chr].count_range(window, window_size)

            # MK_chi_pvalue is used only when all four counts are at least 6;
            # otherwise MK_fisher_pvalue is used (both defined outside this
            # block).
            if nonAR_snp >= 6 and nonAR_div >= 6 and AR_snp >= 6 and AR_div >= 6:
                MK_pval = MK_chi_pvalue(nonAR_snp, nonAR_div, AR_snp, AR_div)
            else:
                MK_pval = MK_fisher_pvalue(nonAR_snp, nonAR_div, AR_snp,
                                           AR_div)

            # Same record either way: chrom, window start, window end, the
            # four counts, then the p-value with 15 decimals.
            if options.outfile is not None:
                out_file.write("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%1.15f\n" %
                               (chr, window, window + window_size, nonAR_snp,
                                nonAR_div, AR_snp, AR_div, MK_pval))
            else:
                print("%s\t%d\t%d\t%d\t%d\t%d\t%d\t%1.15f" %
                      (chr, window, window + window_size, nonAR_snp, nonAR_div,
                       AR_snp, AR_div, MK_pval))

    if options.outfile is not None:
        out_file.close()
Esempio n. 9
0
def main():
    """Count SNP and divergence sites inside features and inside AR intervals.

    Positional arguments (parsed by ``doc_optparse`` from the module
    docstring): feature interval file, AR interval file, SNP file and a
    directory holding one ``<chrom><suffix>.bed`` divergence file per
    chromosome.  Options: ``lens`` (file of ``chrom length`` pairs),
    ``suffix`` (appended to the chromosome name in divergence file names) and
    ``mask`` (intervals that both features and AR are restricted to).

    Progress is written to stderr.  Per-interval lines
    ``chrom start end div_count snp_count`` and the four totals are printed
    on stdout.
    """
    options, args = doc_optparse.parse(__doc__)
    try:
        lens = {}
        if options.lens:
            with open(options.lens) as lens_file:
                for line in lens_file:
                    chrom, length = line.split()
                    lens[chrom] = int(length)

        if options.suffix:
            suffix = options.suffix
        else:
            suffix = ""

        print("\nReading feature", end=' ', file=sys.stderr)
        with open(args[0]) as interval_file:
            feature = binned_bitsets_from_file(interval_file, lens=lens)
        # Read the same file again to keep the individual intervals
        intervals = {}
        with open(args[0]) as interval_file:
            for line in interval_file:
                fields = line.split()
                chrom, start, end = fields[0], int(fields[1]), int(fields[2])
                if chrom not in intervals:
                    intervals[chrom] = []
                intervals[chrom].append([start, end])

        print("\nReading ar", end=' ', file=sys.stderr)
        with open(args[1]) as ar_file:
            ar = binned_bitsets_from_file(ar_file, lens=lens)

        print("\nReading snps", end=' ', file=sys.stderr)
        with open(args[2]) as snp_file:
            snp = binned_bitsets_from_file(snp_file, lens=lens)
        snp_mask = clone_inverted(snp)
        snp_copy = clone(snp)

        print("\nMasking AR", end=' ', file=sys.stderr)
        ar_mask = clone_inverted(ar)
        print(file=sys.stderr)

        dirname = args[3]

        if options.mask:
            with open(options.mask) as mask_file:
                mask = binned_bitsets_from_file(mask_file, lens=lens)
        else:
            mask = None
    except Exception:
        # Any failure while reading the inputs ends in the usage exit
        doc_optparse.exit()

    if mask:
        for chrom in mask:
            if chrom in feature:
                feature[chrom].iand(mask[chrom])
            if chrom in ar:
                ar[chrom].iand(mask[chrom])

    # divergence and snp counts for all features
    feature_div_count = 0
    feature_snp_count = 0
    ar_div_count = 0
    ar_snp_count = 0

    # collect snp and div
    for chrom in feature:

        if chrom not in snp:
            continue
        if chrom not in ar:
            continue

        print("reading %s ..." % chrom, end=' ', file=sys.stderr)
        div_name = chrom + suffix
        try:
            with open(dirname + "/%s.bed" % div_name) as div_file:
                div = binned_bitsets_from_file(div_file, lens=lens)
        except Exception:
            # Report the file name that was actually tried (suffix included)
            print("%s.bed not found" % div_name, file=sys.stderr)
            continue

        div[chrom].iand(snp_mask[chrom])  # div/snp sites count snp-only
        div_copy = clone(div)

        print("AR:", chrom, end=' ', file=sys.stderr)
        snp[chrom].iand(ar[chrom])
        div[chrom].iand(ar[chrom])
        snp_count = snp[chrom].count_range(0, snp[chrom].size)
        ar_snp_count += snp_count
        print(snp_count, end=' ', file=sys.stderr)
        try:
            div_count = div[chrom].count_range(0, div[chrom].size)
            ar_div_count += div_count
            print(div_count, file=sys.stderr)
        except Exception:
            print(chrom, "failed", file=sys.stderr)

        # Go back to the bitsets as they were before the AR intersection
        div = div_copy
        snp[chrom] = snp_copy[chrom]
        print("feature:", chrom, end=' ', file=sys.stderr)
        feature[chrom].iand(ar_mask[chrom])  # clip to non-AR only
        snp[chrom].iand(feature[chrom])
        div[chrom].iand(feature[chrom])
        chrom_snp_count = snp[chrom].count_range(0, snp[chrom].size)
        chrom_div_count = div[chrom].count_range(0, div[chrom].size)
        feature_snp_count += chrom_snp_count
        feature_div_count += chrom_div_count
        print(chrom_snp_count, chrom_div_count, file=sys.stderr)

        # Note: can loop over feature intervals here for individual counts
        if chrom in intervals:
            for start, end in intervals[chrom]:
                ind_div_count = div[chrom].count_range(start, end-start)
                ind_snp_count = snp[chrom].count_range(start, end-start)
                print(chrom, start, end, ind_div_count, ind_snp_count)

    print("feature snp\t%d" % feature_snp_count)
    print("feature div\t%d" % feature_div_count)
    print("ar snp\t%d" % ar_snp_count)
    print("ar div\t%d" % ar_div_count)
Esempio n. 10
0
#!/usr/bin/env python
"""
Print number of bases covered by all intervals in a bed file (bases covered by
more than one interval are counted only once). Multiple bed files can be
provided on the command line or to stdin.

usage: %prog bed files ...
"""
from __future__ import print_function

import sys
from itertools import chain

from bx.bitset_builders import binned_bitsets_from_file

bed_filenames = sys.argv[1:]
if bed_filenames:
    # Open the files one after another as the lines are consumed, instead of
    # opening all of them before the first line is read.
    bed_lines = chain.from_iterable(open(name) for name in bed_filenames)
else:
    bed_lines = sys.stdin

bitsets = binned_bitsets_from_file(bed_lines)

# Each covered base is one set bit, so overlapping intervals count once
total = 0
for chrom in bitsets:
    total += bitsets[chrom].count_range(0, bitsets[chrom].size)

print(total)
Esempio n. 11
0
def read_len(f):
    """Read a 'LEN' file and return a mapping from chromosome to length.

    *f* is any iterable of text lines.  The first whitespace-separated field
    of a line is the chromosome name and the second its length, converted
    with ``int``; further fields are ignored.  Blank lines are skipped.  When
    a name occurs more than once the last length wins.

    Raises ``IndexError`` for a line with a single field and ``ValueError``
    when the length is not an integer.
    """
    mapping = dict()
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines, e.g. a trailing empty line
            continue
        mapping[fields[0]] = int(fields[1])
    return mapping


options, args = doc_optparse.parse(__doc__)
try:
    in_fname, len_fname = args
except ValueError:
    # Wrong number of positional arguments
    doc_optparse.exit()

with open(in_fname) as in_file:
    bitsets = binned_bitsets_from_file(in_file)

with open(len_fname) as len_file:
    lens = read_len(len_file)

for chrom in lens:
    if chrom in bitsets:
        bits = bitsets[chrom]
        bits.invert()
        len = lens[chrom]
        end = 0
        while True:
            start = bits.next_set(end)
            if start == bits.size:
                break
            end = bits.next_clear(start)
            if end > len:
Esempio n. 12
0
"""
For each interval in `bed1` print the fraction of bases covered by `bed2`.

usage: %prog bed1 bed2 [mask]
"""

from __future__ import division, print_function

import sys

from bx.bitset import BinnedBitSet
from bx.bitset_builders import binned_bitsets_from_file

bed1_fname, bed2_fname = sys.argv[1:3]

# Load the covering intervals and close the file once it has been read
with open(bed2_fname) as bed2_file:
    bitsets = binned_bitsets_from_file(bed2_file)


def clone(bits):
    """Return a new ``BinnedBitSet`` of the same size with *bits* OR-ed in.

    The result is an independent object, so later in-place operations on it
    leave *bits* untouched.
    """
    duplicate = BinnedBitSet(bits.size)
    # OR the source into the new set to carry its set bits over
    duplicate.ior(bits)
    return duplicate


if len(sys.argv) > 3:
    mask_fname = sys.argv[3]
    mask = binned_bitsets_from_file(open(mask_fname))
    new_bitsets = dict()
    for key in bitsets:
        if key in mask:
            b = clone(mask[key])