Example #1
0
def test_chromsizes():
    """
    Exercise chromsizes retrieval and `chromsizes_to_file`.

    Checks that `get_chromsizes_from_ucsc` raises OSError for a bad `mysql`
    path and ValueError for `timeout=0`, that `chromsizes('dm3')` agrees with
    the UCSC result, and that the first line written for hg17 (to a named
    file and to a tempfile) is the expected chr1 entry.

    Any OSError raised inside the main body is reported on stdout instead of
    failing the test.
    """
    assert_raises(OSError, pybedtools.get_chromsizes_from_ucsc, 'dm3', mysql='wrong path')
    assert_raises(ValueError, pybedtools.get_chromsizes_from_ucsc, 'dm3', timeout=0)
    try:

        # print() with a single argument behaves the same on Python 2 and 3
        print(pybedtools.chromsizes('dm3'))
        print(pybedtools.get_chromsizes_from_ucsc('dm3'))
        assert pybedtools.chromsizes('dm3') == pybedtools.get_chromsizes_from_ucsc('dm3')

        hg17 = pybedtools.chromsizes('hg17')

        assert hg17['chr1'] == (0, 245522847)

        fn = pybedtools.chromsizes_to_file(hg17, fn='hg17.genome')
        expected = 'chr1\t245522847\n'
        # context manager so the handle is closed even if the assert fails
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        expected = 'chr1\t245522847\n'
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        assert_raises(OSError,
                      pybedtools.get_chromsizes_from_ucsc,
                      **dict(genome='hg17', mysql='nonexistent'))

        os.unlink('hg17.genome')
    except OSError:
        sys.stdout.write("mysql error -- test for chromsizes from UCSC didn't run")
Example #2
0
def test_issue_145():
    """
    Regression test: `genome_coverage` histogram output is not BED.

    Printing the resulting BedTool must raise MalformedBedLineError, while
    reading the underlying file line by line must work.
    """
    x = pybedtools.BedTool(
        """
    chr1    1   100 feature1    0   +
    chr1    1   100 feature1    0   +
    """,
        from_string=True,
    ).saveas("foo.bed")

    g = pybedtools.chromsizes_to_file({"chr1": (0, 200)}, "genome.txt")
    y = x.genome_coverage(g=g, **{"5": True})

    # trying to print causes pybedtools to interpret as a BED file, but it's
    # a histogram so line 2 raises error
    with pytest.raises(pybedtools.MalformedBedLineError):
        print(y)

    # solution is to iterate over lines of file; make sure this works
    # (context manager so the handle is closed once the loop finishes)
    with open(y.fn) as handle:
        for line in handle:
            print(line)

    # if streaming, iterate over y.fn directly:
    # NOTE(review): no stream=True is passed here, so y.fn is presumably a
    # filename and this loop walks its characters rather than lines --
    # confirm whether a streaming call was intended.
    y = x.genome_coverage(g=g, **{"5": True})
    for line in y.fn:
        print(line)
Example #3
0
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Create a bigWig coverage track at `output` from the BAM file `bam`.

    Coverage is computed with ``genome_coverage(bg=True, split=True)`` against
    the chromsizes of assembly `genome` and converted with UCSC's
    `bedGraphToBigWig`, which must be installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    If `scale` is true, values are scaled to reads per million mapped reads
    (using `mapped_read_count`); with the default ``scale=False`` values
    indicate number of reads.

    Raises ValueError if `bedGraphToBigWig` exits with a non-zero status.
    """
    # Local import so the function does not depend on a module-level import.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        kwargs['scale'] = 1 / (readcount / 1e6)
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = ['bedGraphToBigWig', x.fn, genome_file, output]
    # Pass the argument list directly (no shell) so filenames containing
    # spaces or shell metacharacters are handled correctly.
    p = subprocess.Popen(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
Example #4
0
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Create a bigWig coverage track at `output` from the BAM file `bam`.

    Coverage is computed with ``genome_coverage(bg=True, split=True)`` against
    the chromsizes of assembly `genome` and converted with UCSC's
    `bedGraphToBigWig`, which must be installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    If `scale` is true, values are scaled to reads per million mapped reads
    (using `mapped_read_count`); with the default ``scale=False`` values
    indicate number of reads.

    Raises ValueError if `bedGraphToBigWig` exits with a non-zero status.
    """
    # Local import so the function does not depend on a module-level import.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        kwargs['scale'] = 1 / (readcount / 1e6)
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = ['bedGraphToBigWig', x.fn, genome_file, output]
    # Pass the argument list directly (no shell) so filenames containing
    # spaces or shell metacharacters are handled correctly.
    p = subprocess.Popen(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
Example #5
0
def bam2bigwig(bam, bigwig, genome, scale=1e6, verbose=False):
    """
    Uses BEDTools to go from BAM to bedgraph, then bedGraphToBigWig to get the
    final bigwig.

    :param bam: Input BAM filename
    :param bigwig: Output filename of the bigWig to create
    :param genome: String assembly name used to look up chromsizes
    :param scale: If not None, coverage is multiplied by
        ``scale / mapped_reads`` (so the default 1e6 gives reads per million
        mapped reads); mapped reads are counted with
        ``samtools view -F 0x4 -c``.  If None, coverage is left unscaled.
    :param verbose: Print progress messages to stderr

    The intermediate bedGraph is moved to ``bedgraph.bedgraph`` in the current
    directory and its filename is printed.

    Raises ValueError if samtools or bedGraphToBigWig exits with a non-zero
    status, or if scaling is requested and the BAM has no mapped reads.
    """
    coverage_kwargs = dict(bg=True)
    if scale is not None:
        cmds = ['samtools', 'view', '-F', '0x4', '-c', bam]
        p = subprocess.Popen(cmds, stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if p.returncode:
            raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                             % (' '.join(cmds), stderr, stdout))
        total_reads = float(stdout)
        if verbose:
            sys.stderr.write('%s total reads\n' % total_reads)
            sys.stderr.flush()
        if not total_reads:
            raise ValueError(
                '%s has no mapped reads; cannot scale coverage' % bam)
        # The factor handed to genome_coverage is a multiplier, so normalise
        # by the read count instead of passing `scale` through unchanged.
        coverage_kwargs['scale'] = scale / total_reads

    chromsizes = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    coverage_kwargs['g'] = chromsizes

    t0 = time.time()
    bedgraph = pybedtools.BedTool(bam)\
        .genome_coverage(**coverage_kwargs)\
        .moveto('bedgraph.bedgraph')
    print(bedgraph.fn)
    if verbose:
        sys.stderr.write('Completed bedGraph in %.1fs\n' % (time.time() - t0))
        sys.stderr.flush()

    cmds = ['bedGraphToBigWig', bedgraph.fn, chromsizes, bigwig]
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))

    if verbose:
        sys.stderr.write('Completed bigWig %s\n' % bigwig)
        sys.stderr.flush()
Example #6
0
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Create a bigWig coverage track at `output` from the BAM file `bam`.

    Coverage is computed with ``genome_coverage(bg=True, split=True)`` against
    the chromsizes of assembly `genome` and converted with UCSC's
    `bedGraphToBigWig`; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    If `scale` is true, values are scaled to reads per million mapped reads
    (using `mapped_read_count`); with the default ``scale=False`` values
    indicate number of reads.

    If the first conversion fails and its stderr mentions ``bedSort``, the
    bedGraph is sorted and the conversion is retried once.

    Raises FileNotFoundError if `bedGraphToBigWig` is not on the path, and
    ValueError if the final conversion exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs["scale"] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = ["bedGraphToBigWig", x.fn, genome_file, output]

    def _run(cmd):
        # Run `cmd` and return (returncode, stdout, stderr) as text.  Both
        # the first attempt and the retry launch bedGraphToBigWig, so that is
        # the executable to report when it cannot be found.
        try:
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
            )
        except FileNotFoundError:
            raise FileNotFoundError(
                "bedGraphToBigWig was not found on the path. This is an "
                "external tool from UCSC which can be downloaded from "
                "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, "
                "use `conda install ucsc-bedgraphtobigwig`")
        out, err = proc.communicate()
        return proc.returncode, out, err

    returncode, stdout, stderr = _run(cmds)

    if returncode and "bedSort" in stderr:
        print("BAM header was not sorted; sorting bedGraph")
        y = x.sort()
        cmds[1] = y.fn
        returncode, stdout, stderr = _run(cmds)

    if returncode:
        raise ValueError("cmds: %s\nstderr: %s\nstdout: %s" %
                         (" ".join(cmds), stderr, stdout))
Example #7
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` (an object whose `.fn` is the wig filename) to a bigWig.

    Chromsizes for assembly `genome` are written to a file and passed, with
    the input and `output` filenames, to the external `wigToBigWig` tool.

    Returns `output`.  Raises ValueError if `wigToBigWig` exits with a
    non-zero status.
    """
    # Local import so the function does not depend on a module-level import.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['wigToBigWig', wig.fn, genome_file, output]
    # Pass the argument list directly (no shell) so filenames containing
    # spaces or shell metacharacters are handled correctly.
    p = subprocess.Popen(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
Example #8
0
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Convert `bedgraph` (an object whose `.fn` is the bedGraph filename) to a
    bigWig.

    Chromsizes for assembly `genome` are written to a file and passed, with
    the input and `output` filenames, to the external `bedGraphToBigWig` tool.

    Returns `output`.  Raises ValueError if `bedGraphToBigWig` exits with a
    non-zero status.
    """
    # Local import so the function does not depend on a module-level import.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['bedGraphToBigWig', bedgraph.fn, genome_file, output]
    # Pass the argument list directly (no shell) so filenames containing
    # spaces or shell metacharacters are handled correctly.
    p = subprocess.Popen(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
Example #9
0
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Run `bedGraphToBigWig` on `bedgraph.fn` using the chromsizes of `genome`.

    Returns `output` on success; raises ValueError carrying the command line,
    stderr and stdout when the tool exits with a non-zero status.
    """
    sizes_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    command = ['bedGraphToBigWig', bedgraph.fn, sizes_fn, output]
    proc = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, err = proc.communicate()
    if proc.returncode == 0:
        return output
    report = "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(command), err, out)
    raise ValueError(report)
Example #10
0
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Convert `bedgraph` (an object whose `.fn` is the bedGraph filename) to a
    bigWig.

    Chromsizes for assembly `genome` are written to a file and passed, with
    the input and `output` filenames, to the external `bedGraphToBigWig` tool.

    Returns `output`.  Raises ValueError if `bedGraphToBigWig` exits with a
    non-zero status.
    """
    # Local import so the function does not depend on a module-level import.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['bedGraphToBigWig', bedgraph.fn, genome_file, output]
    # Pass the argument list directly (no shell) so filenames containing
    # spaces or shell metacharacters are handled correctly.
    p = subprocess.Popen(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
Example #11
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` (an object whose `.fn` is the wig filename) to a bigWig.

    Chromsizes for assembly `genome` are written to a file and passed, with
    the input and `output` filenames, to the external `wigToBigWig` tool.

    Returns `output`.  Raises ValueError if `wigToBigWig` exits with a
    non-zero status.
    """
    # Local import so the function does not depend on a module-level import.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['wigToBigWig', wig.fn, genome_file, output]
    # Pass the argument list directly (no shell) so filenames containing
    # spaces or shell metacharacters are handled correctly.
    p = subprocess.Popen(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
Example #12
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` (an object whose `.fn` is the wig filename) to a bigWig.

    Chromsizes for assembly `genome` are written to a file and passed, with
    the input and `output` filenames, to the external `wigToBigWig` tool.

    Returns `output`.  Raises ValueError carrying the command line, stderr
    and stdout if `wigToBigWig` exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['wigToBigWig', wig.fn, genome_file, output]
    # Keep a handle on the process: it is needed to collect its output and
    # exit status below.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s' %
                         (' '.join(cmds), stderr, stdout))
    return output
Example #13
0
def bigbed(
    x,
    genome,
    output,
    blockSize=256,
    itemsPerSlot=512,
    bedtype=None,
    _as=None,
    unc=False,
    tab=False,
):
    """
    Write `x` out as a bigBed file named `output` and return that filename.

    `x` is a BedTool object, or a string from which one is constructed.  A
    BedTool whose `fn` is not a string is first saved to a file.

    `genome` is an assembly string used to look up chromsizes.

    The remaining arguments are forwarded to bedToBigBed: `blockSize`,
    `itemsPerSlot` and `bedtype` become "-blockSize=", "-itemsPerSlot=" and
    "-type="; `unc` and `tab` add "-unc" and "-tab"; `_as` becomes "-as=".
    When `bedtype` is left as None it is derived from the field count of `x`.

    Assumes that a recent version of bedToBigBed from UCSC is on the path.
    Raises ValueError if bedToBigBed exits with a non-zero status.
    """
    if isinstance(x, six.string_types):
        x = pybedtools.BedTool(x)
    if not isinstance(x.fn, six.string_types):
        x = x.saveas()

    sizes_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))

    if bedtype is None:
        bedtype = "bed%s" % x.field_count()

    command = ["bedToBigBed", x.fn, sizes_fn, output]
    command.append("-blockSize=%s" % blockSize)
    command.append("-itemsPerSlot=%s" % itemsPerSlot)
    command.append("-type=%s" % bedtype)

    # Simple on/off switches
    for enabled, flag in ((unc, "-unc"), (tab, "-tab")):
        if enabled:
            command.append(flag)
    if _as:
        command.append("-as=%s" % _as)

    proc = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if proc.returncode == 0:
        return output

    raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" %
                     (" ".join(command), err, out))
Example #14
0
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Create a bigWig coverage track at `output` from the BAM file `bam`.

    Coverage is computed with ``genome_coverage(bg=True, split=True)`` against
    the chromsizes of assembly `genome` and converted with UCSC's
    `bedGraphToBigWig`; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    If `scale` is true, values are scaled to reads per million mapped reads
    (using `mapped_read_count`); with the default ``scale=False`` values
    indicate number of reads.

    If the first conversion fails and its stderr mentions ``bedSort``, the
    bedGraph is sorted and the conversion is retried once.

    Raises FileNotFoundError if `bedGraphToBigWig` is not on the path, and
    ValueError if the final conversion exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    kwargs = dict(bg=True, split=True, g=genome_file)
    if scale:
        readcount = mapped_read_count(bam)
        _scale = 1 / (readcount / 1e6)
        kwargs['scale'] = _scale
    x = pybedtools.BedTool(bam).genome_coverage(**kwargs)
    cmds = [
        'bedGraphToBigWig',
        x.fn,
        genome_file,
        output]

    def _run(cmd):
        # Run `cmd` and return (returncode, stdout, stderr) as text.  Both
        # the first attempt and the retry launch bedGraphToBigWig, so that is
        # the executable to report when it cannot be found.
        try:
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True)
        except FileNotFoundError:
            raise FileNotFoundError(
                "bedGraphToBigWig was not found on the path. This is an "
                "external tool from UCSC which can be downloaded from "
                "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, "
                "use `conda install ucsc-bedgraphtobigwig`"
            )
        out, err = proc.communicate()
        return proc.returncode, out, err

    returncode, stdout, stderr = _run(cmds)

    if returncode and 'bedSort' in stderr:
        print('BAM header was not sorted; sorting bedGraph')
        y = x.sort()
        cmds[1] = y.fn
        returncode, stdout, stderr = _run(cmds)

    if returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
Example #15
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` (an object whose `.fn` is the wig filename) to a bigWig.

    Chromsizes for assembly `genome` are written to a file and passed, with
    the input and `output` filenames, to the external `wigToBigWig` tool.

    Returns `output`.  Raises ValueError carrying the command line, stderr
    and stdout if `wigToBigWig` exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'wigToBigWig',
        wig.fn,
        genome_file,
        output]
    # Keep a handle on the process: it is needed to collect its output and
    # exit status below.
    p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
Example #16
0
def test_chromsizes():
    """
    Exercise chromsizes retrieval and `chromsizes_to_file`.

    Checks that `get_chromsizes_from_ucsc` raises OSError for bad `mysql` /
    `fetchchromsizes` paths and ValueError for `timeout=0`, that
    `chromsizes("dm3")` agrees with the UCSC result, and that the first line
    written for hg17 (to a named file and to a tempfile) is the expected chr1
    entry.

    Any OSError raised inside the main body is reported on stdout instead of
    failing the test.
    """
    with pytest.raises(OSError):
        pybedtools.get_chromsizes_from_ucsc("dm3",
                                            mysql="wrong path",
                                            fetchchromsizes="wrongtoo")
    with pytest.raises(ValueError):
        pybedtools.get_chromsizes_from_ucsc("dm3", timeout=0)
    try:

        print(pybedtools.chromsizes("dm3"))
        print(pybedtools.get_chromsizes_from_ucsc("dm3"))
        assert pybedtools.chromsizes(
            "dm3") == pybedtools.get_chromsizes_from_ucsc("dm3")

        hg17 = pybedtools.chromsizes("hg17")

        assert hg17["chr1"] == (0, 245522847)

        fn = pybedtools.chromsizes_to_file(hg17, fn="hg17.genome")
        expected = "chr1\t245522847\n"
        # context manager so the handle is closed even if the assert fails
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        # make sure the tempfile version works, too
        fn = pybedtools.chromsizes_to_file(hg17, fn=None)
        expected = "chr1\t245522847\n"
        with open(fn) as handle:
            results = handle.readline()
        print(results)
        assert expected == results

        with pytest.raises(OSError):
            pybedtools.get_chromsizes_from_ucsc(**dict(
                genome="hg17", mysql="nonexistent", fetchchromsizes="missing"))

        os.unlink("hg17.genome")
    except OSError:
        sys.stdout.write(
            "mysql error -- test for chromsizes from UCSC didn't run")
Example #17
0
def bedgraph_to_bigwig(bedgraph, genome, output):
    """
    Run `bedGraphToBigWig` on `bedgraph.fn` using the chromsizes of `genome`.

    Returns `output` on success; raises ValueError carrying the command line,
    stderr and stdout when the tool exits with a non-zero status.
    """
    sizes_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    command = ['bedGraphToBigWig', bedgraph.fn, sizes_fn, output]
    proc = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    out, err = proc.communicate()
    if proc.returncode == 0:
        return output
    report = "cmds: %s\nstderr:%s\nstdout:%s" % (" ".join(command), err, out)
    raise ValueError(report)
Example #18
0
def bedgraph2bigwig(bedgraph, bigwig, genome, verbose=False):
    """
    Create a bigWig from `bedgraph`.

    :param bedgraph: Input filename of bedgraph
    :param bigwig: Output filename of bigWig to create
    :param genome: String assembly name of genome
    :param verbose: Print messages to stderr
    :raises ValueError: if `bedGraphToBigWig` exits with a non-zero status
    """
    chromsizes = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ['bedGraphToBigWig', bedgraph, chromsizes, bigwig]
    p = subprocess.Popen(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    stdout, stderr = p.communicate()
    # Fail loudly rather than reporting completion for a bigWig that was
    # never written.
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    if verbose:
        sys.stderr.write('Completed bigWig %s\n' % bigwig)
        sys.stderr.flush()
Example #19
0
def bigbed(x, genome, output, blockSize=256, itemsPerSlot=512, bedtype=None, _as=None, unc=False, tab=False):
    """
    Write `x` out as a bigBed file named `output` and return that filename.

    `x` is a BedTool object, or a string from which one is constructed.  A
    BedTool whose `fn` is not a string is first saved to a file.

    `genome` is an assembly string used to look up chromsizes.

    The remaining arguments are forwarded to bedToBigBed: `blockSize`,
    `itemsPerSlot` and `bedtype` become "-blockSize=", "-itemsPerSlot=" and
    "-type="; `unc` and `tab` add "-unc" and "-tab"; `_as` becomes "-as=".
    When `bedtype` is left as None it is derived from the field count of `x`.

    Assumes that a recent version of bedToBigBed from UCSC is on the path.
    Raises ValueError if bedToBigBed exits with a non-zero status.
    """
    if isinstance(x, str):
        x = pybedtools.BedTool(x)
    if not isinstance(x.fn, str):
        x = x.saveas()

    sizes_fn = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))

    if bedtype is None:
        bedtype = 'bed%s' % x.field_count()

    command = ['bedToBigBed', x.fn, sizes_fn, output]
    command.append('-blockSize=%s' % blockSize)
    command.append('-itemsPerSlot=%s' % itemsPerSlot)
    command.append('-type=%s' % bedtype)

    # Simple on/off switches
    for enabled, flag in ((unc, '-unc'), (tab, '-tab')):
        if enabled:
            command.append(flag)
    if _as:
        command.append('-as=%s' % _as)

    proc = subprocess.Popen(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate()
    if proc.returncode == 0:
        return output

    raise ValueError("cmds: %s\nstderr:%s\nstdout:%s"
                     % (" ".join(command), err, out))
Example #20
0
def run_jaccard(Enrichment_Par):
    """
    Compare the observed Jaccard statistic of A vs. B with a shuffled null.

    Reads `A`, `B`, `genome` and `n` from `Enrichment_Par`, keeps only the
    first three columns of A and B, and runs `random_jaccard` with `n`
    iterations, shuffling within chromosomes.

    Returns a dict with keys "jaccardp_value", "jaccard_obs" and
    "jaccard_exp" (the mean of the shuffled distribution).  The p-value is 1
    when observed equals expected; otherwise it is the smaller of the
    fraction of shuffled values above the observed value and its complement.
    """
    params = Enrichment_Par
    # cuts out chrom,chromStart, and chromEnd
    coord_columns = [0, 1, 2]
    a_coords = params.A.cut(coord_columns)
    b_coords = params.B.cut(coord_columns)
    genome_fn = pybedtools.chromsizes_to_file(params.genome)
    outcome = a_coords.random_jaccard(
        b_coords,
        genome_fn=genome_fn,
        iterations=params.n,
        shuffle_kwargs={'chrom': True})
    jaccard_obs = outcome[0]
    null_dist = outcome[1]
    jaccard_exp = numpy.mean(null_dist)

    if jaccard_exp != jaccard_obs:
        n_above = sum(1 for value in null_dist if value > jaccard_obs)
        upper_tail = n_above / float(len(null_dist))
        jaccardp_value = min(upper_tail, 1 - upper_tail)
    else:
        jaccardp_value = 1

    return {
        "jaccardp_value": jaccardp_value,
        "jaccard_obs": jaccard_obs,
        "jaccard_exp": jaccard_exp,
    }
Example #21
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` (an object whose `.fn` is the wig filename) to a bigWig.

    Chromsizes for assembly `genome` are written to a file and passed, with
    the input and `output` filenames, to the external `wigToBigWig` tool.

    Returns `output`.  Raises FileNotFoundError if `wigToBigWig` is not on
    the path, and ValueError if it exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = ["wigToBigWig", wig.fn, genome_file, output]

    try:
        p = subprocess.Popen(cmds,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
    except FileNotFoundError:
        # Name the executable that was actually launched (cmds[0]) and the
        # package that provides it.
        raise FileNotFoundError(
            "wigToBigWig was not found on the path. This is an external "
            "tool from UCSC which can be downloaded from "
            "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, use "
            "`conda install ucsc-wigtobigwig`")
    if p.returncode:
        raise ValueError("cmds: %s\nstderr:%s\nstdout:%s" %
                         (" ".join(cmds), stderr, stdout))
    return output
Example #22
0
def bam_to_bigwig(bam, genome, output):
    """
    Given a BAM file `bam` and assembly `genome`, create a bigWig file scaled
    such that the values represent scaled reads -- that is, reads per million
    mapped reads.

    Assumes that `bedGraphToBigWig` from UCSC tools is installed; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    Raises ValueError if `bedGraphToBigWig` exits with a non-zero status.
    """
    # Local import so the function does not depend on a module-level import.
    import subprocess

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    readcount = mapped_read_count(bam)
    scale = 1 / (readcount / 1e6)
    x = pybedtools.BedTool(bam)\
        .genome_coverage(bg=True, scale=scale, split=True, g=genome_file)
    cmds = ['bedGraphToBigWig', x.fn, genome_file, output]
    # Pass the argument list directly (no shell) so filenames containing
    # spaces or shell metacharacters are handled correctly.
    p = subprocess.Popen(
        cmds,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True)
    stdout, stderr = p.communicate()
    if p.returncode:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s'
                         % (' '.join(cmds), stderr, stdout))
Example #23
0
def bam_to_bigwig(bam, genome, output, scale=False):
    """
    Build a bigWig coverage track at `output` from the BAM file `bam`.

    Coverage is computed against the chromsizes of assembly `genome` and
    converted with `bedGraphToBigWig` from UCSC tools; see
    http://genome.ucsc.edu/goldenPath/help/bigWig.html for more details on the
    format.

    When `scale` is true the values are reads per million mapped reads;
    with scale=False (the default) values indicate number of reads.

    If the conversion fails with a message mentioning bedSort, the bedGraph
    is sorted and the conversion is tried once more.  Raises ValueError if
    the last attempt exits with a non-zero status.
    """
    def _launch(command):
        # Run one conversion attempt; returns (returncode, stdout, stderr).
        proc = subprocess.Popen(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        captured_out, captured_err = proc.communicate()
        return proc.returncode, captured_out, captured_err

    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    coverage_opts = {'bg': True, 'split': True, 'g': genome_file}
    if scale:
        coverage_opts['scale'] = 1 / (mapped_read_count(bam) / 1e6)
    coverage = pybedtools.BedTool(bam).genome_coverage(**coverage_opts)

    command = ['bedGraphToBigWig', coverage.fn, genome_file, output]
    status, out, err = _launch(command)

    needs_sorting = bool(status) and 'bedSort' in err
    if needs_sorting:
        print('BAM header was not sorted; sorting bedGraph')
        sorted_coverage = coverage.sort()
        command[1] = sorted_coverage.fn
        status, out, err = _launch(command)

    if status:
        raise ValueError('cmds: %s\nstderr: %s\nstdout: %s' %
                         (' '.join(command), err, out))
Example #24
0
def wig_to_bigwig(wig, genome, output):
    """
    Convert `wig` (an object whose `.fn` is the wig filename) to a bigWig.

    Chromsizes for assembly `genome` are written to a file and passed, with
    the input and `output` filenames, to the external `wigToBigWig` tool.

    Returns `output`.  Raises FileNotFoundError if `wigToBigWig` is not on
    the path, and ValueError if it exits with a non-zero status.
    """
    genome_file = pybedtools.chromsizes_to_file(pybedtools.chromsizes(genome))
    cmds = [
        'wigToBigWig',
        wig.fn,
        genome_file,
        output]

    try:
        p = subprocess.Popen(cmds, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
    except FileNotFoundError:
        # Name the executable that was actually launched (cmds[0]) and the
        # package that provides it.
        raise FileNotFoundError(
            "wigToBigWig was not found on the path. This is an external "
            "tool from UCSC which can be downloaded from "
            "http://hgdownload.soe.ucsc.edu/admin/exe/. Alternatively, use "
            "`conda install ucsc-wigtobigwig`"
        )
    if p.returncode:
        raise ValueError('cmds: %s\nstderr:%s\nstdout:%s'
                         % (' '.join(cmds), stderr, stdout))
    return output
Example #25
0
def test_issue_145():
    """
    Regression test: `genome_coverage` histogram output is not BED.

    Printing the resulting BedTool must raise MalformedBedLineError, while
    reading the underlying file line by line must work.
    """
    x = pybedtools.BedTool("""
    chr1    1   100 feature1    0   +
    chr1    1   100 feature1    0   +
    """, from_string=True).saveas('foo.bed')

    g = pybedtools.chromsizes_to_file({'chr1': (0, 200)}, 'genome.txt')
    y = x.genome_coverage(g=g, **{'5': True})

    # trying to print causes pybedtools to interpret as a BED file, but it's
    # a histogram so line 2 raises error
    with pytest.raises(pybedtools.MalformedBedLineError):
        print(y)

    # solution is to iterate over lines of file; make sure this works
    # (context manager so the handle is closed once the loop finishes)
    with open(y.fn) as handle:
        for line in handle:
            print(line)

    # if streaming, iterate over y.fn directly:
    # NOTE(review): no stream=True is passed here, so y.fn is presumably a
    # filename and this loop walks its characters rather than lines --
    # confirm whether a streaming call was intended.
    y = x.genome_coverage(g=g, **{'5': True})
    for line in y.fn:
        print(line)
import os
import subprocess
import pybedtools
from trackhub import helpers

# Directory (as reported by trackhub.helpers) under which the generated
# example bigBed files are written below.
data_dir = helpers.data_dir()

# Write the dm3 euchromatic chromsizes from pybedtools' genome registry to a
# file; `g` is the filename handed to the BedTool methods and bedToBigBed.
chromsizes = pybedtools.genome_registry.dm3.euchromatic
g = pybedtools.chromsizes_to_file(chromsizes)

# Make some randomized bigBed files
for i in range(3):
    # Start from the interval chr2L:0-10000000, cut it into windows of
    # 1000 * (i + 1) bp, shuffle them (seeded with `i`), then sort.
    x = (
        pybedtools.BedTool("chr2L 0 10000000", from_string=True)
        .window_maker(g=g, w=1000 * (i + 1))
        .shuffle(g=g, seed=i)
        .sort()
    )

    out = os.path.join(data_dir, "random-dm3-%s.bigBed" % i)

    # Convert with the external bedToBigBed tool.  Output is not captured
    # and the exit status is not checked.
    cmds = ["bedToBigBed", x.fn, g, out]
    p = subprocess.Popen(cmds)
    p.communicate()


# make some sine waves for bigWigs
import numpy as np


def sine(factor):
#!/usr/bin/env python
import os
import subprocess
import logging
import hashlib
import urllib
import pybedtools
import gffutils
import metaseq

# Log at DEBUG level, prefixing each message with the logger name and time.
logging.basicConfig(level=logging.DEBUG,
                    format='[%(name)s] [%(asctime)s]: %(message)s')
logger = logging.getLogger('metaseq data download')

# Chromsizes for hg19; used below to build COORD.
hg19 = pybedtools.chromsizes('hg19')
# NOTE(review): the assembly name 'hg19' is passed as the chromsizes argument
# and 'hg19' again as the output filename -- confirm chromsizes_to_file
# accepts an assembly string here.
genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19')

usage = """
Downloads data from UCSC, GEO, and Ensembl.
"""

# Command-line interface: a single --data-dir option that defaults to
# metaseq.data_dir().  Arguments are parsed at import/run time.
import argparse
ap = argparse.ArgumentParser(usage=usage)
ap.add_argument('--data-dir',
                default=metaseq.data_dir(),
                help='Location to store downloaded and prepped data.  '
                'Default is %(default)s')
args = ap.parse_args()

# Region string "CHROM:0-<last element of hg19[CHROM]>" (presumably the
# chromosome end, i.e. the whole chromosome).
CHROM = 'chr17'
COORD = "%s:%s-%s" % (CHROM, 0, hg19[CHROM][-1])
#!/usr/bin/env python
import os
import subprocess
import logging
import hashlib
import urllib
import pybedtools
import gffutils
import metaseq

# Log at DEBUG level, prefixing each message with the logger name and time.
logging.basicConfig(
    level=logging.DEBUG, format='[%(name)s] [%(asctime)s]: %(message)s')
logger = logging.getLogger('metaseq data download')

# Chromsizes for the hg19 assembly.
hg19 = pybedtools.chromsizes('hg19')
# NOTE(review): the assembly name 'hg19' is passed as the chromsizes argument
# and 'hg19' again as the output filename -- confirm chromsizes_to_file
# accepts an assembly string here.
genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19')

usage = """
Downloads data from UCSC, GEO, and Ensembl.
"""

# Command-line interface: a single --data-dir option that defaults to
# metaseq.data_dir().  Arguments are parsed at import/run time.
import argparse
ap = argparse.ArgumentParser(usage=usage)
ap.add_argument(
    '--data-dir',
    default=metaseq.data_dir(),
    help='Location to store downloaded and prepped data.  '
    'Default is %(default)s')
args = ap.parse_args()

# Chromosome name used by the rest of the script.
CHROM = 'chr17'
Example #29
0
def parallel_apply(orig_bedtool, method, genome=None, genome_fn=None,
                   method_args=None, method_kwargs=None, shuffle_kwargs=None,
                   shuffle=True, reduce_func=None, processes=1, sort=False,
                   _orig_pool=None, iterations=1000, debug=False,
                   report_iterations=False):
    """
    Call an arbitrary BedTool method many times in parallel.

    This is a generator: results are yielded one per iteration, in order.

    An example use-case is to generate a null distribution of intersections,
    and then compare this to the actual intersections.

    **Important:** due to a known file handle leak in BedTool.__len__, it's
    best to simply check the number of lines in the file, as in the below
    function. This works because BEDTools programs strip any non-interval lines
    in the results.

    >>> # set up example BedTools
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = pybedtools.example_bedtool('b.bed')

    >>> # Method of `a` to call:
    >>> method = 'intersect'

    >>> # Kwargs provided to `a.intersect` each iteration
    >>> method_kwargs = dict(b=b, u=True)

    >>> # Function that will be called on the results of
    >>> # `a.intersect(**method_kwargs)`.
    >>> def reduce_func(x):
    ...     return sum(1 for _ in open(x.fn))

    >>> # Create a small artificial genome for this test (generally you'd
    >>> # use an assembly name, like "hg19"):
    >>> genome = dict(chr1=(0, 1000))

    >>> # Do 10 iterations using 1 process for this test (generally you'd
    >>> # use 1000+ iterations, and as many processes as you have CPUs)
    >>> results = pybedtools.parallel.parallel_apply(a, method, genome=genome,
    ... method_kwargs=method_kwargs, iterations=10, processes=1,
    ... reduce_func=reduce_func, debug=True, report_iterations=True)

    >>> # get results
    >>> print(list(results))
    [2, 2, 3, 0, 3, 3, 0, 0, 2, 4]

    >>> # We can compare this to the actual intersection:
    >>> reduce_func(a.intersect(**method_kwargs))
    3

    Alternatively, we could use the `a.jaccard` method, which already does the
    reduction to a dictionary.  However, the Jaccard method requires the input
    to be sorted.  Here, we specify `sort=True` to sort each shuffled BedTool
    before calling its `jaccard` method.

    >>> from pybedtools.parallel import parallel_apply
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> results = parallel_apply(a, method='jaccard', method_args=(b,),
    ... genome=genome, iterations=3, processes=1, sort=True, debug=True)
    >>> for i in results:
    ...     print(sorted(i.items()))
    [('intersection', 15), ('jaccard', 0.0238095), ('n_intersections', 2), ('union-intersection', 630)]
    [('intersection', 15), ('jaccard', 0.0238095), ('n_intersections', 2), ('union-intersection', 630)]
    [('intersection', 45), ('jaccard', 0.0818182), ('n_intersections', 1), ('union-intersection', 550)]

    Parameters
    ----------
    orig_bedtool : BedTool

    method : str
        The method of `orig_bedtool` to run

    method_args : tuple
        Passed directly to getattr(orig_bedtool, method)()

    method_kwargs : dict
        Passed directly to getattr(orig_bedtool, method)()

    shuffle : bool
        If True, then `orig_bedtool` will be shuffled at each iteration and
        that shuffled version's `method` will be called with `method_args` and
        `method_kwargs`.

    shuffle_kwargs : dict
        If `shuffle` is True, these are passed to `orig_bedtool.shuffle()`.
        You do not need to pass the genome here; that's handled separately by
        the `genome` and `genome_fn` kwargs.  The dict is not modified.

    iterations : int
        Number of iterations to perform

    genome : string or dict
        If string, then assume it is the assembly name (e.g., hg19) and get
        a dictionary of chromsizes for that assembly, then converts to
        a filename.

    genome_fn : str
        Mutually exclusive with `genome`; `genome_fn` must be an existing
        filename with the chromsizes.  Use the `genome` kwarg instead if you'd
        rather supply an assembly or dict.

    reduce_func : callable
        Function or other callable object that accepts, as its only argument,
        the results from `orig_bedtool.method()`.  For example, if you care
        about the number of results, then you can use `reduce_func=len`.

    processes : int
        Number of processes to run.  If `processes=1`, then multiprocessing is
        not used (making it much easier to debug).  This argument is ignored if
        `_orig_pool` is provided.

    sort : bool
        If both `shuffle` and `sort` are True, then the shuffled BedTool will
        then be sorted.  Use this if `method` requires sorted input.

    _orig_pool : multiprocessing.Pool instance
        If provided, uses `_orig_pool` instead of creating one.  In this case,
        `processes` will be ignored.

    debug : bool
        If True, then use the current iteration index as the seed to shuffle.

    report_iterations : bool
        If True, then report the number of iterations to stderr.

    Raises
    ------
    ValueError
        If `method_args` is not a list or tuple, if both `genome` and
        `genome_fn` are given, or if `shuffle` is True and neither is given.
        Because this is a generator, these are raised when iteration starts.
    """

    shuffle_kwargs = shuffle_kwargs or {}
    method_args = method_args or ()
    if not isinstance(method_args, (list, tuple)):
        raise ValueError(
            "method_args must be a list or tuple, got %s" % type(method_args))
    method_kwargs = method_kwargs or {}

    if genome_fn and genome:
        raise ValueError("only of of genome_fn or genome should be provided")

    if shuffle:
        if not genome_fn:
            if not genome:
                raise ValueError("shuffle=True, so either genome_fn"
                                 " or genome must be provided")
            genome_fn = pybedtools.chromsizes_to_file(genome)

    _parallel_wrap_kwargs = dict(
        orig_bedtool=orig_bedtool,
        shuffle_kwargs=shuffle_kwargs,
        genome_fn=genome_fn,
        method=method,
        method_args=method_args,
        method_kwargs=method_kwargs,
        shuffle=shuffle,
        reduce_func=reduce_func,
        sort=sort,
    )

    def add_seed(i, kwargs):
        # Build per-iteration copies instead of mutating the shared dicts:
        # queued pool tasks must each keep their own seed, and the caller's
        # `shuffle_kwargs` must not be modified.
        if debug and shuffle:
            kwargs = dict(kwargs)
            kwargs['shuffle_kwargs'] = dict(kwargs['shuffle_kwargs'], seed=i)
        return kwargs

    if processes == 1:
        for it in range(iterations):
            yield _parallel_wrap(**add_seed(it, _parallel_wrap_kwargs))
        # A plain return ends the generator; raising StopIteration inside a
        # generator is turned into RuntimeError by PEP 479.
        return

    if _orig_pool:
        p = _orig_pool
    else:
        p = multiprocessing.Pool(processes)

    results = [
        p.apply_async(_parallel_wrap, (), add_seed(it, _parallel_wrap_kwargs))
        for it in range(iterations)]
    for i, r in enumerate(results):
        yield r.get()
        if report_iterations:
            sys.stderr.write('%s\r' % i)
            sys.stderr.flush()
Example #30
0
def parallel_apply(orig_bedtool,
                   method,
                   genome=None,
                   genome_fn=None,
                   method_args=None,
                   method_kwargs=None,
                   shuffle_kwargs=None,
                   shuffle=True,
                   reduce_func=None,
                   processes=1,
                   sort=False,
                   _orig_pool=None,
                   iterations=1000,
                   debug=False,
                   report_iterations=False):
    """
    Call an arbitrary BedTool method many times in parallel.

    An example use-case is to generate a null distribution of intersections,
    and then compare this to the actual intersections.

    This is a generator: nothing (including argument validation) runs until
    it is first advanced, and results are yielded one per iteration, in
    iteration order.

    **Important:** due to a known file handle leak in BedTool.__len__, it's
    best to simply check the number of lines in the file, as in the below
    function. This works because BEDTools programs strip any non-interval lines
    in the results.

    >>> # set up example BedTools
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> b = pybedtools.example_bedtool('b.bed')

    >>> # Method of `a` to call:
    >>> method = 'intersect'

    >>> # Kwargs provided to `a.intersect` each iteration
    >>> method_kwargs = dict(b=b, u=True)

    >>> # Function that will be called on the results of
    >>> # `a.intersect(**method_kwargs)`.
    >>> def reduce_func(x):
    ...     return sum(1 for _ in open(x.fn))

    >>> # Create a small artificial genome for this test (generally you'd
    >>> # use an assembly name, like "hg19"):
    >>> genome = dict(chr1=(0, 1000))

    >>> # Do 10 iterations using 1 process for this test (generally you'd
    >>> # use 1000+ iterations, and as many processes as you have CPUs)
    >>> results = pybedtools.parallel.parallel_apply(a, method, genome=genome,
    ... method_kwargs=method_kwargs, iterations=10, processes=1,
    ... reduce_func=reduce_func, debug=True, report_iterations=True)

    >>> # get results
    >>> print(list(results))
    [2, 2, 3, 0, 3, 3, 0, 0, 2, 4]

    >>> # We can compare this to the actual intersection:
    >>> reduce_func(a.intersect(**method_kwargs))
    3

    Alternatively, we could use the `a.jaccard` method, which already does the
    reduction to a dictionary.  However, the Jaccard method requires the input
    to be sorted.  Here, we specify `sort=True` to sort each shuffled BedTool
    before calling its `jaccard` method.

    >>> from pybedtools.parallel import parallel_apply
    >>> a = pybedtools.example_bedtool('a.bed')
    >>> results = parallel_apply(a, method='jaccard', method_args=(b,),
    ... genome=genome, iterations=3, processes=1, sort=True, debug=True)
    >>> for i in results:
    ...     print(sorted(i.items()))
    [('intersection', 15), ('jaccard', 0.0238095), ('n_intersections', 2), ('union-intersection', 630)]
    [('intersection', 15), ('jaccard', 0.0238095), ('n_intersections', 2), ('union-intersection', 630)]
    [('intersection', 45), ('jaccard', 0.0818182), ('n_intersections', 1), ('union-intersection', 550)]

    Parameters
    ----------
    orig_bedtool : BedTool

    method : str
        The method of `orig_bedtool` to run

    method_args : tuple
        Passed directly to getattr(orig_bedtool, method)()

    method_kwargs : dict
        Passed directly to getattr(orig_bedtool, method)()

    shuffle : bool
        If True, then `orig_bedtool` will be shuffled at each iteration and
        that shuffled version's `method` will be called with `method_args` and
        `method_kwargs`.

    shuffle_kwargs : dict
        If `shuffle` is True, these are passed to `orig_bedtool.shuffle()`.
        You do not need to pass the genome here; that's handled separately by
        the `genome` and `genome_fn` kwargs.  The dict you pass in is never
        modified.

    iterations : int
        Number of iterations to perform

    genome : string or dict
        If string, then assume it is the assembly name (e.g., hg19) and get
        a dictionary of chromsizes for that assembly, then converts to
        a filename.

    genome_fn : str
        Mutually exclusive with `genome`; `genome_fn` must be an existing
        filename with the chromsizes.  Use the `genome` kwarg instead if you'd
        rather supply an assembly or dict.

    reduce_func : callable
        Function or other callable object that accepts, as its only argument,
        the results from `orig_bedtool.method()`.  For example, if you care
        about the number of results, then you can use `reduce_func=len`.

    processes : int
        Number of processes to run.  If `processes=1`, then multiprocessing is
        not used (making it much easier to debug), even when `_orig_pool` is
        supplied.  For any other value, the number is ignored if `_orig_pool`
        is provided.

    sort : bool
        If both `shuffle` and `sort` are True, then the shuffled BedTool will
        then be sorted.  Use this if `method` requires sorted input.

    _orig_pool : multiprocessing.Pool instance
        If provided (and `processes` is not 1), uses `_orig_pool` instead of
        creating one.  A supplied pool is left open for the caller to manage;
        a pool created here is shut down when the generator finishes.

    debug : bool
        If True, then use the current iteration index as the seed to shuffle.

    report_iterations : bool
        If True, then report the number of iterations to stderr.  Only the
        multiprocessing path reports.

    Yields
    ------
    The value returned by `_parallel_wrap` for each iteration.

    Raises
    ------
    ValueError
        If `method_args` is not a list or tuple, if both `genome` and
        `genome_fn` are given, or if `shuffle` is True and neither is given.
        Raised when the generator is first advanced.
    """

    shuffle_kwargs = shuffle_kwargs or {}
    method_args = method_args or ()
    if not isinstance(method_args, (list, tuple)):
        raise ValueError("method_args must be a list or tuple, got %s" %
                         type(method_args))
    method_kwargs = method_kwargs or {}

    if genome_fn and genome:
        raise ValueError("only one of genome_fn or genome should be provided")

    if shuffle and not genome_fn:
        if not genome:
            raise ValueError("shuffle=True, so either genome_fn"
                             " or genome must be provided")
        genome_fn = pybedtools.chromsizes_to_file(genome)

    _parallel_wrap_kwargs = dict(
        orig_bedtool=orig_bedtool,
        shuffle_kwargs=shuffle_kwargs,
        genome_fn=genome_fn,
        method=method,
        method_args=method_args,
        method_kwargs=method_kwargs,
        shuffle=shuffle,
        reduce_func=reduce_func,
        sort=sort,
    )

    def add_seed(i, kwargs):
        # Hand every iteration its own kwargs (and its own shuffle_kwargs).
        # Pool tasks are serialized some time after apply_async() returns, so
        # writing the seed into one shared dict could give several iterations
        # the same seed, and would also leak `seed` into the caller's dict.
        kwargs = dict(kwargs)
        if debug and shuffle:
            kwargs['shuffle_kwargs'] = dict(kwargs['shuffle_kwargs'], seed=i)
        return kwargs

    if processes == 1:
        for it in range(iterations):
            yield _parallel_wrap(**add_seed(it, _parallel_wrap_kwargs))
        # A plain return ends the generator; `raise StopIteration` here is
        # converted into RuntimeError on Python 3.7+ (PEP 479).
        return

    owns_pool = not _orig_pool
    if owns_pool:
        p = multiprocessing.Pool(processes)
    else:
        p = _orig_pool

    try:
        results = [
            p.apply_async(_parallel_wrap, (),
                          add_seed(it, _parallel_wrap_kwargs))
            for it in range(iterations)
        ]
        for i, r in enumerate(results):
            yield r.get()
            if report_iterations:
                sys.stderr.write('%s\r' % i)
                sys.stderr.flush()
    finally:
        # Only tear down a pool we created; a caller-supplied pool may still
        # be in use elsewhere.
        if owns_pool:
            p.terminate()
            p.join()
Example #31
0
def main():
    """
    Creates a pairwise matrix containing overlapping feature counts for many
    BED files

    Command-line arguments select what each matrix cell holds: raw
    intersection counts (default), fraction overlapped (``--frac``) or an
    enrichment score from randomizations (``--enrichment``, which needs
    ``--genome``).  The matrix is written to stdout as tab-delimited text
    with a header row, rows and columns sorted by key.

    Exits with status 1 after printing help if no BED files and no ``--test``
    were given, and with a usage error if ``--enrichment`` is used without
    ``--genome``.
    """
    ap = argparse.ArgumentParser(usage=usage)
    ap.add_argument('beds', nargs="*", help='BED/GTF/GFF/VCF filenames, e.g., '
                    'in a directory of bed files, you can use *.bed')
    ap.add_argument('--frac', action='store_true',
                    help='Instead of counts, report fraction overlapped')
    ap.add_argument('--enrichment', action='store_true',
                    help='Run randomizations (default 1000, specify otherwise '
                    'with --iterations) on each pairwise comparison and '
                    'compute the enrichment score as '
                    '(actual intersection count + 1) / (median randomized + 1)'
                    )
    ap.add_argument('--genome', help='Required argument if --enrichment is '
                    'used. Needs to be a string assembly name like "dm3" or '
                    '"hg19"')
    ap.add_argument('--iterations', default=1000, type=int,
                    help='Number of randomizations to perform for enrichement '
                    'scores')
    ap.add_argument('--processes', default=None, type=int,
                    help='Number of CPUs to use for randomization')
    ap.add_argument('--test', action='store_true', help='Ignore any input BED '
                    'files and use test BED files')
    ap.add_argument('-v', '--verbose', action='store_true',
                    help='Be verbose: print which files are '
                    'currently being intersected and timing info at the end.')
    args = ap.parse_args()

    if not args.beds and not args.test:
        ap.print_help()
        sys.exit(1)

    # --genome is documented as required for --enrichment; fail with a clear
    # usage message instead of a traceback from the chromsizes lookup.
    if args.enrichment and not args.genome:
        ap.error('--genome is required when --enrichment is used')

    if args.test:
        # insulator binding sites from ChIP-chip -- 4 proteins, 2 cell types
        # Genes Dev. 2009 23(11):1338-1350
        test_names = [
            'Cp190_Kc_Bushey_2009.bed',
            'Cp190_Mbn2_Bushey_2009.bed',
            'CTCF_Kc_Bushey_2009.bed',
            'CTCF_Mbn2_Bushey_2009.bed',
            'SuHw_Kc_Bushey_2009.bed',
            'SuHw_Mbn2_Bushey_2009.bed',
            'BEAF_Mbn2_Bushey_2009.bed',
            'BEAF_Kc_Bushey_2009.bed',
        ]
        args.beds = [example_filename(i) for i in test_names]

    # Pick the per-pair function and any extra kwargs it needs.
    if args.enrichment:
        FUNC = enrichment_score
        genome_fn = pybedtools.chromsizes_to_file(
            pybedtools.chromsizes(args.genome))
        kwargs = dict(genome_fn=genome_fn, iterations=args.iterations,
                      processes=args.processes)
    elif args.frac:
        FUNC = frac_of_a
        kwargs = {}
    else:
        FUNC = actual_intersection
        kwargs = {}

    t0 = time.time()
    matrix = create_matrix(beds=args.beds, func=FUNC, verbose=args.verbose,
                           **kwargs)
    t1 = time.time()

    nfiles = len(args.beds)

    if args.verbose:
        sys.stderr.write('Time to construct %s x %s matrix: %.1fs'
                         % (nfiles, nfiles, (t1 - t0)) + '\n')
    keys = sorted(matrix.keys())

    # Header row starts with an empty cell above the row labels.
    sys.stdout.write("\t" + "\t".join(keys) + '\n')
    for k in keys:
        row = matrix[k]
        sys.stdout.write(k + ''.join('\t' + str(row[j]) for j in keys) + '\n')
# Write one FASTA record per chromosome, filled with random sequence emitted
# in 80-base pieces plus one final piece for the remainder.
for chrom, size in chromsizes.items():
    fasta.write(">" + chrom + "\n")
    # size[1] is used as the chromosome length (presumably pybedtools-style
    # (start, stop) tuples -- confirm against where `chromsizes` is built).
    n, r = divmod(size[1], 80)
    for _ in range(n):
        # NOTE(review): no newline is added here, so random_dna() presumably
        # terminates its own lines -- confirm.
        fasta.write(random_dna(80))
    fasta.write(random_dna(r))

# The converter below reads the file by name, so push any buffered sequence
# to disk first; otherwise it may see a truncated FASTA.
fasta.flush()

# Output name: drop the last two characters of the FASTA name and append
# "2bit" (e.g. "x.fa" -> "x.2bit").
cmds = ["faToTwoBit", fasta.name, fasta.name[0:-2] + "2bit"]

# check_call raises CalledProcessError if the tool exits non-zero.
p = subprocess.check_call(cmds)

fasta.close()
os.unlink(fasta.name)

g = pybedtools.chromsizes_to_file(chromsizes)

# Make some randomized bigBed files
for i in range(3):
    # Tile chr1:0-100000 into windows of 100, 200, 300 bp, shuffle them with
    # a fixed seed so the output is reproducible, then sort.
    x = pybedtools.BedTool(
        "chr1 0 100000", from_string=True)\
        .window_maker(g=g, w=100 * (i + 1))\
        .shuffle(g=g, seed=i)\
        .sort()

    out = os.path.join(data_dir, 'random-no1-%s.bigBed' % i)

    cmds = ['bedToBigBed', x.fn, g, out]
    p = subprocess.check_call(cmds)

Example #33
0
def enrichment(id, a, b, background, organism, name=None, score=None, strand=None, n=10, run=None):
    """Perform enrichment analysis between two BED files.

    a - path to Feature of Interest BED file (FOI)
    b - path to Genomic Feature BED file (GF)
    n - number of Monte-Carlo iterations
    id - run identifier, used only in log and progress messages
    background - path to a custom background BED file; it is loaded only if
        the path exists
    organism - assembly name used to fetch chromosome sizes from UCSC
    name, score, strand - passed to make_filter() to build the filter that
        is applied to the GF file
    run - collection of analysis names to execute; recognized names are
        'pvalue', 'pybedtool', 'jaccard', 'kolmogorov', 'proximity' and
        'hypergeometric'.  Analyses not listed are reported as "NA".
        Defaults to running none of them.

    Returns an Enrichment record.  If either file has no features after
    filtering, every statistic in the record is "NA".
    """
    if run is None:
        run = ()

    write_debug("START", True)
    r = {}
    e = Enrichment_Par(a=a, b=b, organism=organism, n=n, background=background)

    # Guard against an empty/None background before touching the filesystem,
    # and update the record with _replace like every other field below.
    if e.background and os.path.exists(e.background):
        e = e._replace(Background=BedTool(e.background))

    e = e._replace(A=BedTool(str(e.a)))
    e = e._replace(B=BedTool(str(e.b)))
    e = e._replace(genome=pybedtools.get_chromsizes_from_ucsc(e.organism))
    e = e._replace(genome_fn=pybedtools.chromsizes_to_file(e.genome))

    e = e._replace(organism=str(e.organism))
    flt = make_filter(name, score, strand)
    e = e._replace(B=e.B.filter(flt).saveas())
    e = e._replace(nA=len(e.A))
    e = e._replace(nB=len(e.B))
    # Exits if there are 0 GFs or 0 FOI
    if not e.nA or not e.nB:
        logger.info("Filter resulted in 0 Features of Interest. Terminating Run. {} (id={})".format(b, id))
        # 13 statistic fields follow the four identifying fields.
        return Enrichment(e.a, basename(e.b), e.nA, e.nB, *(("NA",) * 13))

    e.A.set_chromsizes(e.genome)
    e.B.set_chromsizes(e.genome)
    e = e._replace(obs=len(e.A.intersect(e.B, u=True)))

    # This is the Monte-Carlo step.  If custom background present, it is used
    if 'pvalue' in run:
        logger.info("Running Monte Carlo ({}): (id={})".format(b, id))
        write_progress(id, "Running Monte Carlo {}".format(b))
        r.update(run_montecarlo(e))
    else:
        r['p_value'], r['exp'] = "NA", "NA"
        logger.info("Skipping Monte Carlo ({}): (id={})".format(b, id))

    # expected calculated using pybed method CANNOT use custom background
    if 'pybedtool' in run:
        logger.info("Running Random Intersections ({}): (id={})".format(b, id))
        write_progress(id, "Running Random Intersections: {0}".format(b))
        r.update(run_pybedtool(e))
    else:
        r['pybedp_value'], r['pybed_exp'] = "NA", "NA"
        logger.info("Skipping Random Intersections")

    # expected calculated using jaccard method
    if 'jaccard' in run:
        logger.info("Running Jaccard ({}): (id={})".format(e.b, id))
        write_progress(id, "Running Jaccard {}".format(e.b))
        r.update(run_jaccard(e))
    else:
        r['jaccardp_value'], r['jaccard_obs'], r['jaccard_exp'] = "NA", "NA", "NA"
        logger.info("Skipping Jaccard ({}): (id={})".format(b, id))

    # run Kolmogorov-Smirnov test
    if 'kolmogorov' in run:
        logger.info("Running Kolmogorov-Smornov {} (id={})".format(b, id))
        write_progress(id, "Running Kolmogorov-Smornov{}".format(b))
        r.update(run_kolmogorov(e))
    else:
        r['kol_smor_p_value'] = "NA"
        logger.info("Skipping Kolmogorov-Smornov {} (id={})".format(b, id))

    # run proximity analysis
    if 'proximity' in run:
        logger.info("Running proximity {} (id={})".format(b, id))
        write_progress(id, "Running proximity analysis{}".format(b))
        r.update(run_proximity(e))
    else:
        logger.info("Skipping Proximity")
        r['obsprox'], r['expprox'], r['proximityp_value'] = "NA", "NA", "NA"

    # run hypergeometric distribution analysis
    if 'hypergeometric' in run:
        write_progress(id, "Running")
        logger.info("Running hypergeometric analysis {} (id={})".format(b, id))
        r.update(run_hypergeometric(e))
    else:
        logger.info("Skipping hypergeometric")
        r['hypergeometric_p_value'] = "NA"

    ## Uncomment to print global parameters in debug file
    #write_debug("Global parameters",a = e.a,b=e.b,A= e.A,B=e.B,background = e.background,
    #        n=e.n,flt = e.flt,genome = e.genome,genome_fn = e.genome_fn,organism = e.organism,obs = e.obs)

    # the order of these arguments IS IMPORTANT
    return Enrichment(
        e.a, basename(e.b), e.nA, e.nB, e.obs,
        r['exp'], r['p_value'],
        r['obsprox'], r['expprox'],
        r['pybedp_value'], r['pybed_exp'],
        r['jaccard_obs'], r['jaccardp_value'], r['jaccard_exp'],
        r['proximityp_value'], r['kol_smor_p_value'],
        r['hypergeometric_p_value'])