def importRepeatsFromUCSC(infile, outfile, ucsc_database, repeattypes, genome):
    '''import repeats from a UCSC formatted file.

    The repeats are stored as a :term:`gff` formatted file.

    Every table whose name ends in ``rmsk`` is queried in the UCSC
    mysql database `ucsc_database`. Matching rows are written to a
    temporary tab-separated file, which is then sorted, sanitized
    against `genome` and gzip-compressed into `outfile`.

    Arguments
    ---------
    infile : string
        Not referenced in the body of this function.
    outfile : string
        Name of the gzip-compressed output file. A log is written
        to ``<outfile>.log``.
    ucsc_database : string
        Name of the UCSC database to connect to.
    repeattypes : string
        Comma-separated list of ``repClass`` values to select.
    genome : string
        Value passed as ``--genome-file`` to ``cgat gff2gff``.

    Raises
    ------
    ValueError
        If the database has no table ending in ``rmsk`` or if no
        repeat of the requested classes was found in any table.
    '''

    # Turn "a,b,c" into "a','b','c" so that it can be dropped between
    # the surrounding quotes of the SQL ``IN ('...')`` clause below.
    # NOTE(review): the values are interpolated into the SQL text
    # verbatim - this presumes `repeattypes` comes from trusted
    # pipeline configuration; confirm against the callers.
    repclasses = "','".join(repeattypes.split(","))

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    dbhandle = PipelineUCSC.connectToUCSC(
        host=PARAMS["ucsc_host"],
        user=PARAMS["ucsc_user"],
        database=ucsc_database)

    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if not tables:
        raise ValueError("could not find any `rmsk` tables")

    tmpfile = P.getTempFile(shared=True)
    # `tmpfilename` must stay a local of this function: it is
    # referenced by name in `statement` below.
    tmpfilename = tmpfile.name

    # The try/finally guarantees that the temporary file is closed and
    # removed on every exit path, including failed queries, the "no
    # repeats" error and a failing P.run().
    try:
        total_repeats = 0
        for table in tables:
            E.info("%s: loading repeats from %s" % (ucsc_database, table))
            # Columns are selected in gff order; genoStart+1 presumably
            # converts UCSC's 0-based start into a 1-based gff start.
            cc = dbhandle.execute(
                """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd,
                '.', strand, '.',
                CONCAT('class \\"', repClass, '\\"; family \\"',
                       repFamily, '\\";')
                FROM %(table)s
                WHERE repClass in ('%(repclasses)s') """ %
                {"table": table, "repclasses": repclasses})

            n = 0
            for data in cc.fetchall():
                n += 1
                tmpfile.write("\t".join(map(str, data)) + "\n")

            E.info("%s: %s=%i repeats downloaded" %
                   (ucsc_database, table, n))
            total_repeats += n

        if total_repeats == 0:
            raise ValueError(
                "did not find any repeats for %s" % ucsc_database)

        # Close before running the statement so that all rows are
        # flushed to disk before ``cat`` reads the file.
        tmpfile.close()

        # P.run() is called without arguments; the ``%(...)s``
        # placeholders in `statement` are presumably resolved by it
        # from this function's locals and the pipeline parameters
        # (e.g. ``pipeline_scriptsdir``) - TODO confirm.
        statement = '''cat %(tmpfilename)s
        | %(pipeline_scriptsdir)s/gff_sort pos
        | cgat gff2gff
        --method=sanitize
        --sanitize-method=genome
        --skip-missing
        --genome-file=%(genome)s
        --log=%(outfile)s.log
        | gzip
        > %(outfile)s
        '''
        P.run()
    finally:
        # Closing an already closed file is a no-op.
        tmpfile.close()
        os.unlink(tmpfilename)