# Configure root logging: show everything from DEBUG upward, and prefix each
# message with the logger name and a timestamp.
logging.basicConfig(
    level=logging.DEBUG, format='[%(name)s] [%(asctime)s]: %(message)s')
logger = logging.getLogger('metaseq data download')

# Chromosome sizes for the hg19 assembly as reported by pybedtools.
# NOTE(review): presumably a mapping of chromosome name -> (start, end);
# only the last element of the chr17 entry is used below -- confirm.
hg19 = pybedtools.chromsizes('hg19')
# NOTE(review): presumably writes the hg19 sizes to a file named 'hg19' and
# returns that filename -- confirm against the installed pybedtools.
genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19')

usage = """
Downloads data from UCSC, GEO, and Ensembl.
"""

import argparse
# The text above describes the script rather than its command-line syntax.
# Passing it as `usage=` would replace argparse's auto-generated synopsis
# (program name plus options) in --help and in error messages, so it is
# supplied as the description instead.
ap = argparse.ArgumentParser(description=usage)
ap.add_argument(
    '--data-dir',
    default=metaseq.data_dir(),
    help='Location to store downloaded and prepped data.  '
    'Default is %(default)s')
# Parses sys.argv at import time; this module is meant to be run as a script.
args = ap.parse_args()

# Region string "chr17:0-<end>", where <end> is the last element of the
# hg19 entry for CHROM.
CHROM = 'chr17'
COORD = "%s:%s-%s" % (CHROM, 0, hg19[CHROM][-1])

def download(url, dest):
    """
    Platform-agnostic downloader.

    Logs the URL through the module-level `logger`, then fetches `url` and
    writes the payload to `dest`.

    Parameters
    ----------
    url : str
        Address of the resource to retrieve.
    dest : str
        Local filename the downloaded content is saved to.
    """
    # `urllib.FancyURLopener` is only reachable as an attribute of `urllib`
    # on Python 2.  `urlretrieve` exists on both major versions (under
    # different modules), so import it locally from whichever is available.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Hand the argument to the logger instead of pre-formatting, so the
    # string is only built when the record is actually emitted.
    logger.info("Downloading %s...", url)
    urlretrieve(url, dest)
#! /usr/bin/python

# Download large data for running metaseq tests.
#
# > 1.6 GB of downloads

import sys
import os
import hashlib
import gffutils
import pybedtools
import metaseq

# Data directory as returned by metaseq.data_dir().
DATA_DIR = metaseq.data_dir()

# md5 hex digests for example files ===========================================
#
# Each non-empty line of DATA has the form "<md5 hex digest>  <source URL>"
# (two spaces between the fields).
# NOTE(review): the code that consumes DATA is outside this view; presumably
# the digest is used to verify each downloaded file -- confirm.

# ENCODE data
DATA = """
ff6979ace9befe82e71b6a05609d36e1  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562RxlchV0416101AlnRep1.bam
ab2f3d2efd5a0281092e7ad542dfad36  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562RxlchV0416101AlnRep1.bam.bai
fa20b05ea082dcb063463b73b6a5af2f  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562RxlchV0416101RawRep1.bigWig
b0716bd81170efe1fd0a8e411fb669d8  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101AlnRep1.bam
6e8f85d3ab428ef95e3382237b1b2419  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101PkRep1.broadPeak.gz
cf869424dc915e59d9f1b3f73d720883  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101AlnRep1.bam.bai
fb3b9dc8e85636a3a1226d22f8c1dbec  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101RawRep1.bigWig
"""

# Ensembl annotations
DATA += """
25e76f628088daabd296447d06abe16b  ftp://ftp.ensembl.org/pub/release-66/gtf/homo_sapiens/Homo_sapiens.GRCh37.66.gtf.gz
# Configure root logging: show everything from DEBUG upward, and prefix each
# message with the logger name and a timestamp.
logging.basicConfig(level=logging.DEBUG,
                    format='[%(name)s] [%(asctime)s]: %(message)s')
logger = logging.getLogger('metaseq data download')

# Chromosome sizes for the hg19 assembly as reported by pybedtools.
# NOTE(review): presumably a mapping of chromosome name -> (start, end);
# only the last element of the chr17 entry is used below -- confirm.
hg19 = pybedtools.chromsizes('hg19')
# NOTE(review): presumably writes the hg19 sizes to a file named 'hg19' and
# returns that filename -- confirm against the installed pybedtools.
genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19')

usage = """
Downloads data from UCSC, GEO, and Ensembl.
"""

import argparse
# The text above describes the script rather than its command-line syntax.
# Passing it as `usage=` would replace argparse's auto-generated synopsis
# (program name plus options) in --help and in error messages, so it is
# supplied as the description instead.
ap = argparse.ArgumentParser(description=usage)
ap.add_argument('--data-dir',
                default=metaseq.data_dir(),
                help='Location to store downloaded and prepped data.  '
                'Default is %(default)s')
# Parses sys.argv at import time; this module is meant to be run as a script.
args = ap.parse_args()

# Region string "chr17:0-<end>", where <end> is the last element of the
# hg19 entry for CHROM.
CHROM = 'chr17'
COORD = "%s:%s-%s" % (CHROM, 0, hg19[CHROM][-1])

def download(url, dest):
    """
    Platform-agnostic downloader.

    Logs the URL through the module-level `logger`, then fetches `url` and
    writes the payload to `dest`.

    Parameters
    ----------
    url : str
        Address of the resource to retrieve.
    dest : str
        Local filename the downloaded content is saved to.
    """
    # `urllib.FancyURLopener` is only reachable as an attribute of `urllib`
    # on Python 2.  `urlretrieve` exists on both major versions (under
    # different modules), so import it locally from whichever is available.
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve  # Python 2

    # Hand the argument to the logger instead of pre-formatting, so the
    # string is only built when the record is actually emitted.
    logger.info("Downloading %s...", url)
    urlretrieve(url, dest)
Exemple #4
0
#! /usr/bin/python

# Download large data for running metaseq tests.
#
# > 1.6 GB of downloads

import sys
import os
import hashlib
import gffutils
import pybedtools
import metaseq

# Data directory as returned by metaseq.data_dir().
DATA_DIR = metaseq.data_dir()

# md5 hex digests for example files ===========================================
#
# Each non-empty line of DATA has the form "<md5 hex digest>  <source URL>"
# (two spaces between the fields).
# NOTE(review): the code that consumes DATA is outside this view; presumably
# the digest is used to verify each downloaded file -- confirm.

# ENCODE data
DATA = """
ff6979ace9befe82e71b6a05609d36e1  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562RxlchV0416101AlnRep1.bam
ab2f3d2efd5a0281092e7ad542dfad36  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562RxlchV0416101AlnRep1.bam.bai
fa20b05ea082dcb063463b73b6a5af2f  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562RxlchV0416101RawRep1.bigWig
b0716bd81170efe1fd0a8e411fb669d8  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101AlnRep1.bam
6e8f85d3ab428ef95e3382237b1b2419  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101PkRep1.broadPeak.gz
cf869424dc915e59d9f1b3f73d720883  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101AlnRep1.bam.bai
fb3b9dc8e85636a3a1226d22f8c1dbec  http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101RawRep1.bigWig
"""

# Ensembl annotations
DATA += """
25e76f628088daabd296447d06abe16b  ftp://ftp.ensembl.org/pub/release-66/gtf/homo_sapiens/Homo_sapiens.GRCh37.66.gtf.gz
Exemple #5
0
    '--stop',
    type=int,
    default=5000000,
    help='stop coord of possible windows. See --nfeatures help regarding size.'
)
# Chromosome on which the windows are generated.
ap.add_argument('--chrom',
                default='chr2L',
                help='chromsome to make windows on')
# Restricts which file formats are used; validated after parsing below.
ap.add_argument('--type',
                default='all',
                help='Only use the specified file types.  Either "all" '
                '(default) or a comma-separated list of [bam, bigwig, bed, '
                'bigbed].')
# Common filename prefix of the input files; defaults to "x" inside the
# directory returned by metaseq.data_dir().
ap.add_argument(
    '--prefix',
    default=os.path.join(metaseq.data_dir(), 'x'),
    help='Prefix of filenames to use.  Expects files '
    'with this prefix, and the following suffixes: .bam, .bigwig, '
    '.bed.gz (should already be tabixed), .bigbed.  Default: %(default)s')
ap.add_argument(
    '--plot-prefix',
    default='./speedtest',
    help='Filename used to save the resulting plot. Default is %(default)s')
# type=int keeps a user-supplied value consistent with the integer default;
# without it, "--bins 50" would reach the rest of the script as the string
# "50" while the default stays the int 100.
ap.add_argument('--bins', type=int, default=100,
                help='Number of bins for each feature')
# Parses sys.argv at import time; this module is meant to be run as a script.
args = ap.parse_args()

# Reject any entry of the comma-separated --type value that is not a known
# file type; the first unknown entry raises ValueError.
requested = args.type.split(',')
allowed = ['bam', 'bed', 'bigwig', 'bigbed', 'all']
for req in requested:
    if req not in allowed:
        raise ValueError("%s not in %s" % (req, allowed))
Exemple #6
0
# Start/stop bound the coordinate range from which windows are drawn.
ap.add_argument(
    '--start', type=int, default=10000,
    help='start coord of possible windows. See --nfeatures help regarding size.')
ap.add_argument(
    '--stop', type=int, default=5000000,
    help='stop coord of possible windows. See --nfeatures help regarding size.')
# Chromosome on which the windows are generated.
ap.add_argument(
    '--chrom', default='chr2L',
    help='chromsome to make windows on')
# Restricts which file formats are used; `requested` below holds the parsed
# list.
ap.add_argument(
    '--type', default='all',
    help='Only use the specified file types.  Either "all" '
    '(default) or a comma-separated list of [bam, bigwig, bed, '
    'bigbed].')
# Common filename prefix of the input files; defaults to "x" inside the
# directory returned by metaseq.data_dir().
ap.add_argument(
    '--prefix', default=os.path.join(metaseq.data_dir(), 'x'),
    help='Prefix of filenames to use.  Expects files '
    'with this prefix, and the following suffixes: .bam, .bigwig, '
    '.bed.gz (should already be tabixed), .bigbed.  Default: %(default)s')
ap.add_argument(
    '--plot-prefix', default='./speedtest',
    help='Filename used to save the resulting plot. Default is %(default)s')
# type=int keeps a user-supplied value consistent with the integer default
# (and with --start/--stop above); without it, "--bins 50" would reach the
# rest of the script as the string "50" while the default stays the int 100.
ap.add_argument(
    '--bins', type=int, default=100,
    help='Number of bins for each feature')
# Parses sys.argv at import time; this module is meant to be run as a script.
args = ap.parse_args()


# Split the comma-separated --type value into individual entries and list
# the entries that are recognised.
requested = args.type.split(',')
allowed = ['bam', 'bed', 'bigwig', 'bigbed', 'all']
for req in requested: