# Emit everything from DEBUG upward, tagging each record with the logger name
# and a timestamp.
logging.basicConfig(
    format='[%(name)s] [%(asctime)s]: %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger('metaseq data download')

# hg19 chromosome sizes as returned by pybedtools.
hg19 = pybedtools.chromsizes('hg19')
# NOTE(review): presumably writes the hg19 chromsizes to a file named 'hg19'
# and returns its path -- confirm against the pybedtools documentation.
genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19')

usage = """ Downloads data from UCSC, GEO, and Ensembl. """

import argparse

# Command-line interface: the only option is where downloaded data is stored.
ap = argparse.ArgumentParser(usage=usage)
ap.add_argument(
    '--data-dir',
    default=metaseq.data_dir(),
    help='Location to store downloaded and prepped data. Default is %(default)s',
)
args = ap.parse_args()

# Region string "<chrom>:0-<last element of the hg19 entry for that chrom>".
CHROM = 'chr17'
COORD = "%s:%s-%s" % (CHROM, 0, hg19[CHROM][-1])


def download(url, dest):
    """
    Retrieve `url` and store it at `dest`.

    The transfer goes through ``urllib.FancyURLopener``; the URL is logged
    at INFO level before the retrieval starts.

    Parameters
    ----------
    url : str
        Address of the resource to fetch.
    dest : str
        Local filename the retrieved content is written to.
    """
    opener = urllib.FancyURLopener()
    message = "Downloading %s..." % url
    logger.info(message)
    opener.retrieve(url, dest)
#! /usr/bin/python # Download large data for running metaseq tests. # # > 1.6 GB of downloads import sys import os import hashlib import gffutils import pybedtools import metaseq DATA_DIR = metaseq.data_dir() # md5 hex digests for example files =========================================== # ENCODE data DATA = """ ff6979ace9befe82e71b6a05609d36e1 http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562RxlchV0416101AlnRep1.bam ab2f3d2efd5a0281092e7ad542dfad36 http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562RxlchV0416101AlnRep1.bam.bai fa20b05ea082dcb063463b73b6a5af2f http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562RxlchV0416101RawRep1.bigWig b0716bd81170efe1fd0a8e411fb669d8 http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101AlnRep1.bam 6e8f85d3ab428ef95e3382237b1b2419 http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101PkRep1.broadPeak.gz cf869424dc915e59d9f1b3f73d720883 http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101AlnRep1.bam.bai fb3b9dc8e85636a3a1226d22f8c1dbec http://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeHaibTfbs/wgEncodeHaibTfbsK562Atf3V0416101RawRep1.bigWig """ # Ensembl annotations DATA += """ 25e76f628088daabd296447d06abe16b ftp://ftp.ensembl.org/pub/release-66/gtf/homo_sapiens/Homo_sapiens.GRCh37.66.gtf.gz
# Root logging setup: DEBUG and above, each record prefixed with the logger
# name and a timestamp.
logging.basicConfig(
    format='[%(name)s] [%(asctime)s]: %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger('metaseq data download')

# Chromosome sizes for the hg19 assembly, obtained from pybedtools.
hg19 = pybedtools.chromsizes('hg19')
# NOTE(review): presumably saves those sizes to a file called 'hg19' and
# returns the filename -- verify against the pybedtools documentation.
genomes_file = pybedtools.chromsizes_to_file('hg19', 'hg19')

usage = """ Downloads data from UCSC, GEO, and Ensembl. """

import argparse

# A single option controls the directory that receives the downloaded data.
ap = argparse.ArgumentParser(usage=usage)
ap.add_argument(
    '--data-dir',
    default=metaseq.data_dir(),
    help='Location to store downloaded and prepped data. Default is %(default)s',
)
args = ap.parse_args()

# COORD is "<CHROM>:0-<last element of hg19[CHROM]>".
CHROM = 'chr17'
COORD = "%s:%s-%s" % (CHROM, 0, hg19[CHROM][-1])


def download(url, dest):
    """
    Fetch the resource at `url` and save it as `dest`.

    Uses ``urllib.FancyURLopener`` for the transfer and logs the URL at
    INFO level just before retrieving it.

    Parameters
    ----------
    url : str
        Location of the resource to download.
    dest : str
        Local path that receives the downloaded content.
    """
    fetcher = urllib.FancyURLopener()
    notice = "Downloading %s..." % url
    logger.info(notice)
    fetcher.retrieve(url, dest)
'--stop', type=int, default=5000000, help='stop coord of possible windows. See --nfeatures help regarding size.' ) ap.add_argument('--chrom', default='chr2L', help='chromsome to make windows on') ap.add_argument('--type', default='all', help='Only use the specified file types. Either "all" ' '(default) or a comma-separated list of [bam, bigwig, bed, ' 'bigbed].') ap.add_argument( '--prefix', default=os.path.join(metaseq.data_dir(), 'x'), help='Prefix of filenames to use. Expects files ' 'with this prefix, and the following suffixes: .bam, .bigwig, ' '.bed.gz (should already be tabixed), .bigbed. Default: %(default)s') ap.add_argument( '--plot-prefix', default='./speedtest', help='Filename used to save the resulting plot. Default is %(default)s') ap.add_argument('--bins', default=100, help='Number of bins for each feature') args = ap.parse_args() requested = args.type.split(',') allowed = ['bam', 'bed', 'bigwig', 'bigbed', 'all'] for req in requested: if req not in allowed: raise ValueError("%s not in %s" % (req, allowed))
ap.add_argument( '--start', type=int, default=10000, help='start coord of possible windows. See --nfeatures help regarding size.') ap.add_argument( '--stop', type=int, default=5000000, help='stop coord of possible windows. See --nfeatures help regarding size.') ap.add_argument( '--chrom', default='chr2L', help='chromsome to make windows on') ap.add_argument( '--type', default='all', help='Only use the specified file types. Either "all" ' '(default) or a comma-separated list of [bam, bigwig, bed, ' 'bigbed].') ap.add_argument( '--prefix', default=os.path.join(metaseq.data_dir(), 'x'), help='Prefix of filenames to use. Expects files ' 'with this prefix, and the following suffixes: .bam, .bigwig, ' '.bed.gz (should already be tabixed), .bigbed. Default: %(default)s') ap.add_argument( '--plot-prefix', default='./speedtest', help='Filename used to save the resulting plot. Default is %(default)s') ap.add_argument( '--bins', default=100, help='Number of bins for each feature') args = ap.parse_args() requested = args.type.split(',') allowed = ['bam', 'bed', 'bigwig', 'bigbed', 'all'] for req in requested: