accession_pat = re.compile('ACC.*ID="([^"]*)";') with DeleteOnError(outputfile, gzip.open) as output: for line in urllib.urlopen(Paths.MIRBASE_URL): if line.startswith('#'): continue fields = line[:-1].split('\t') name = accession_pat.findall(fields[8])[0] print >> output, '\t'.join([ 'chr'+fields[0], str(int(fields[3])-1), fields[4], 'miRNA|%s|%s' % (name, name), '.', fields[6] ]) @files([(None, Paths.repeatmasker_original(chrom), chrom) for chrom in Options.UCSC_REPEATMASKER_CHROMOSOMES]) @jobs_limit(Options.MAX_PARALLEL_DOWNLOADING, 'download') def download_repeatmasker_catalogs(inputfile, outputfile, chrom): import urllib url = Paths.REPEATMASKER_URL % (Options.GENOME, chrom) print 'Downloading %s ...' % url urllib.urlretrieve(url, outputfile) @files([Paths.repeatmasker_original(chrom) for chrom in Options.UCSC_REPEATMASKER_CHROMOSOMES], Paths.repeatmasker_catalog) @follows(download_repeatmasker_catalogs) def merge_repeatmasker_catalogs(inputfiles, outputfile):