def hgLoadSqlTab(db, tableName, sqlName, tabFname, optString=""):
    """ load the tab-separated file tabFname into tableName of db, using the
    schema file sqlName, via the hgLoadSqlTab command line tool.

    optString is appended verbatim to the hgLoadSqlTab command line.
    If tabFname does not exist, only logs a warning and does nothing.
    """
    if isfile(tabFname):
        cmd = "hgLoadSqlTab %s %s %s %s %s" % (db, tableName, sqlName, tabFname, optString)
        maxCommon.runCommand(cmd, verbose=False)
    else:
        # logging.warn is a deprecated alias; logging.warning is the
        # documented spelling
        logging.warning("file %s not found" % tabFname)
def allResults(self):
    """ run the sentences stored as the -1 field of self.rows through the
    MSR pipeline and return each input row joined with the MSR output
    fields that were derived from it """
    startTime = datetime.now()
    msrInFh, msrInName = writeMsrIn(self.rows)
    logging.info("Running MSR pipeline on %d sentences" % len(self.rows))
    #logging.info("Running MSR pipeline on %s " % sentences)
    msrOutFh, msrOutName = pubGeneric.makeTempFile("msrNlpOut", ".txt")
    maxCommon.runCommand("%s/runMsr.sh %s %s" % (msrDir, msrInName, msrOutName))
    logging.info("Parsing MSR output")
    # glue every MSR result row onto the text row it was derived from;
    # chunkSentId indexes back into self.rows
    results = []
    for outRow in parseMsrOut(msrOutName):
        combined = list(self.rows[int(outRow.chunkSentId)]) + list(outRow)
        results.append(combined)
    msrInFh.close()
    msrOutFh.close()
    logging.debug("results " + repr(results))
    runSecs = (datetime.now() - startTime).seconds
    logging.info("msr runtime: %d" % runSecs)
    return results
def allResults(self):
    """ given a list of rows with sentences as their -1 field, run these through the MSR pipeline """
    # NOTE(review): this is a duplicate of an identical allResults definition
    # earlier in this file
    tstart = datetime.now()
    # write all input sentences to a temp file the MSR pipeline can read
    inFh, tempFnameIn = writeMsrIn(self.rows)
    logging.info("Running MSR pipeline on %d sentences" % len(self.rows))
    #logging.info("Running MSR pipeline on %s " % sentences)
    # temp file that receives the pipeline's output
    ofh2, tempFnameOut = pubGeneric.makeTempFile("msrNlpOut", ".txt")
    cmd = "%s/runMsr.sh %s %s" % (msrDir, tempFnameIn, tempFnameOut)
    maxCommon.runCommand(cmd)
    joinedRows = []
    logging.info("Parsing MSR output")
    # join each MSR output row back onto the input row it came from,
    # using its chunkSentId field as the index into self.rows
    for msrRow in parseMsrOut(tempFnameOut):
        textRow = list(self.rows[int(msrRow.chunkSentId)])
        textRow.extend(msrRow)
        joinedRows.append(textRow)
    inFh.close()
    ofh2.close()
    logging.debug("results " + repr(joinedRows))
    tend = datetime.now()
    secs = (tend-tstart).seconds
    logging.info("msr runtime: %d" % secs)
    return joinedRows
def _gzipAndMove(self, fname, finalName):
    """ compress fname with gzip, then copy the resulting .gz file to
    finalName and delete the local compressed copy """
    compressedName = fname + ".gz"
    # gzip refuses to overwrite; clear any stale .gz first
    if isfile(compressedName):
        os.remove(compressedName)
    maxCommon.runCommand("gzip %s" % fname)
    logging.debug("compressing and copying files table to %s" % finalName)
    shutil.copyfile(compressedName, finalName)
    os.remove(compressedName)
def listTables(db, expr):
    """ return list of table names in db matching the mysql LIKE
    expression expr """
    outFile = tempfile.NamedTemporaryFile(prefix="pubBlat.dropTables")
    outName = outFile.name
    # hgsql writes one table name per line into the temp file
    cmd = """hgsql %s -NB -e 'show tables like "%s"' > %s """ % (db, expr, outName)
    maxCommon.runCommand(cmd)
    tableNames = [line.strip() for line in open(outName)]
    return tableNames
def startRedis(dbFname):
    """ starts redis on current server as daemon.
    Creates status files with filename dbName".pid" and dbName".host". Returns the port.
    >>> import pubGeneric
    >>> pubGeneric.setupLogging(__file__, None)
    >>> h, p = startRedis("/tmp/test.tab.gz")
    >>> r = redis.Redis(port=p)
    >>> r.set("hello", "world")
    True
    >>> r.get("hello")
    'world'
    >>> r.get("world")
    >>> r.shutdown()
    """
    dbFname = abspath(dbFname)
    pidFname = dbFname + ".pid"
    port = findFreePort()
    dirName = dirname(dbFname)
    # redis stores its dump next to the input file, as <name>.rdb
    baseName = basename(dbFname) + ".rdb"
    hostFname = dbFname + ".host"
    hostname = socket.gethostbyname("localhost")
    hostDesc = hostname + ":" + str(port)
    # write "<ip>:<port>" so other processes can find this server
    open(hostFname, "w").write(hostDesc)
    logging.info("Wrote redis host info %s to %s" % (hostDesc, hostFname))
    # remove the status files when this process exits
    maxCommon.delOnExit(hostFname)
    maxCommon.delOnExit(pidFname)
    atexit.register(shutdownRedisServers)
    # record the port so shutdownRedisServers can stop this instance later
    global redisPorts
    redisPorts.append(port)
    cmd = ["redis-server", "--daemonize", "yes", "--pidfile", pidFname, \
        "--port", str(port), "--rdbchecksum", "no", "--dir", dirName, "--dbfilename", baseName, "--maxmemory", "200gb"]
    logging.info("Starting up redis server on localhost")
    maxCommon.runCommand(cmd)
    # wait until startup is complete: poll with a connection attempt once
    # per second until the server answers a dbsize query
    redisStart = True
    while redisStart:
        try:
            r = redis.Redis(port=port)
            dbSize = r.dbsize()
            redisStart = False
        except redis.ConnectionError:
            logging.info("Waiting for 1 sec for redis startup completion")
            time.sleep(1)
            pass
    logging.info("Redis startup completed, dbSize=%d" % dbSize)
    return "localhost", port
def startRedis(dbFname):
    """ starts redis on current server as daemon.
    Creates status files with filename dbName".pid" and dbName".host". Returns the port.
    >>> import pubGeneric
    >>> pubGeneric.setupLogging(__file__, None)
    >>> h, p = startRedis("/tmp/test.tab.gz")
    >>> r = redis.Redis(port=p)
    >>> r.set("hello", "world")
    True
    >>> r.get("hello")
    'world'
    >>> r.get("world")
    >>> r.shutdown()
    """
    # NOTE(review): duplicate of an identical startRedis definition earlier
    # in this file
    dbFname = abspath(dbFname)
    pidFname = dbFname+".pid"
    port = findFreePort()
    dirName = dirname(dbFname)
    baseName = basename(dbFname)+".rdb"
    hostFname = dbFname+".host"
    hostname = socket.gethostbyname("localhost")
    # advertise the server location as "<ip>:<port>" in the .host file
    hostDesc = hostname+":"+str(port)
    open(hostFname, "w").write(hostDesc)
    logging.info("Wrote redis host info %s to %s" % (hostDesc, hostFname))
    # clean up status files and registered servers when this process exits
    maxCommon.delOnExit(hostFname)
    maxCommon.delOnExit(pidFname)
    atexit.register(shutdownRedisServers)
    global redisPorts
    redisPorts.append(port)
    cmd = ["redis-server", "--daemonize", "yes", "--pidfile", pidFname, \
        "--port", str(port), "--rdbchecksum", "no", "--dir", dirName, "--dbfilename", baseName, "--maxmemory", "200gb"]
    logging.info("Starting up redis server on localhost")
    maxCommon.runCommand(cmd)
    # wait until startup is complete: retry a dbsize query every second
    # until the daemon accepts connections
    redisStart = True
    while redisStart:
        try:
            r = redis.Redis(port=port)
            dbSize = r.dbsize()
            redisStart=False
        except redis.ConnectionError:
            logging.info("Waiting for 1 sec for redis startup completion")
            time.sleep(1)
            pass
    logging.info("Redis startup completed, dbSize=%d" % dbSize)
    return "localhost", port
def hgGetAllRows(db, tableName, tempDir):
    """ return all rows of table as a list of field lists, by dumping the
    table through hgsql into a temp file in tempDir """
    query = "SELECT * from %s" % tableName
    tempFile = tempfile.NamedTemporaryFile(prefix="maxMysql_hgGetAllRows", dir=tempDir)
    cmd = 'hgsql %s -NB -e "%s" > %s' % (db, query, tempFile.name)
    maxCommon.runCommand(cmd)
    # hgsql -NB output is one tab-separated row per line
    data = [line.strip("\n").split("\t") for line in open(tempFile.name, "r")]
    return data
def renameTables(db, fromList, toList, checkExists=False):
    """ rename mysql tables, the old names in fromList mapping pairwise to
    the new names in toList. With checkExists, tables that do not exist
    are skipped instead of passed to hgsql. """
    assert(len(fromList)==len(toList))
    logging.debug("Renaming mysql tables %s to %s" % (fromList, toList))
    renameClauses = []
    for srcTable, destTable in zip(fromList, toList):
        if checkExists and not tableExists(db, srcTable):
            logging.debug("Could not find table %s, %s" % (db, srcTable))
            continue
        renameClauses.append("%s TO %s" % (srcTable, destTable))
    # a RENAME TABLE without clauses would be a syntax error, so bail out
    if not renameClauses:
        logging.debug("No table found, not renaming anything")
        return
    sqlCmd = "RENAME TABLE "+", ".join(renameClauses)
    cmd = """hgsql %s -NB -e '%s'""" % (db, sqlCmd)
    maxCommon.runCommand(cmd, verbose=False)
def renameTables(db, fromList, toList, checkExists=False):
    " rename tables from old to new, fromToList is a list of 2-tuples "
    # NOTE(review): duplicate of an identical renameTables definition earlier
    # in this file
    assert (len(fromList) == len(toList))
    logging.debug("Renaming mysql tables %s to %s" % (fromList, toList))
    parts = []
    for oldName, newName in zip(fromList, toList):
        # with checkExists, skip tables that are not present in db
        if (not checkExists) or (checkExists and tableExists(db, oldName)):
            parts.append("%s TO %s" % (oldName, newName))
        else:
            logging.debug("Could not find table %s, %s" % (db, oldName))
    # nothing to rename: avoid running an empty RENAME TABLE statement
    if len(parts) == 0:
        logging.debug("No table found, not renaming anything")
        return
    # one RENAME TABLE statement renames all pairs atomically
    sqlCmd = "RENAME TABLE " + ", ".join(parts)
    cmd = """hgsql %s -NB -e '%s'""" % (db, sqlCmd)
    maxCommon.runCommand(cmd, verbose=False)
def indexTsv(zipFname, tsvName, outFname):
    """ unzip a zipfile, recompress all the tsvs inside with gzip and create
    an .index.gz for them.

    The index written to outFname maps the first tab-separated field of each
    line to the byte offset of that line in the uncompressed tsv. The
    recompressed <tsvName>.gz is placed next to zipFname.
    """
    #def indexTsv(zipFname, tsvName, outFname, bgzipPath):
    # extract to local disk
    tmpDir = pubGeneric.makeTempDir("bingData")
    maxCommon.delOnExit(tmpDir)
    logging.info("Extracting to %s" % tmpDir)
    cmd = ["unzip", "-d", tmpDir, zipFname]
    maxCommon.runCommand(cmd)

    tempFname = join(tmpDir, tsvName)
    logging.info("Indexing %s to %s" % (tempFname, outFname))

    # index lines
    ofh = gzip.open(outFname, "w")
    ifh = open(tempFname, "rb")
    offset = 0
    # the file iterator does not work with tell()!! - so read line by line
    #for line in ifh:
    while True:
        line = ifh.readline()
        # was 'line == ""': readline on a binary stream returns b"" at EOF,
        # which never equals "" - 'not line' detects EOF in both cases
        if not line:
            break
        url = line[0:line.find("\t")]
        ofh.write("%s\t%d\n" % (url, offset))
        #logging.debug("url %s, offset %d" % (url, offset))
        offset = ifh.tell()
    # the input handle was previously leaked; close both handles
    ifh.close()
    ofh.close()

    # re-compress with gzip
    tmpFnames = glob.glob(join(tmpDir, "*.tsv"))
    assert(len(tmpFnames)==1)
    tmpFname = tmpFnames[0]
    zipDir = dirname(zipFname)
    finalFname = join(zipDir, tsvName+".gz")
    logging.info("Compressing to %s" % finalFname)
    #cmd = "%s %s -c > %s" % (bgzipPath, tmpFname, finalFname)
    cmd = "gzip %s -c > %s" % (tmpFname, finalFname)
    maxCommon.runCommand(cmd)
    shutil.rmtree(tmpDir)
def indexTsv(zipFname, tsvName, outFname): """ unzip a zipfile, recompress all the tsvs inside with gzip and create an .index.gz for them""" #def indexTsv(zipFname, tsvName, outFname, bgzipPath): # extract to local disk tmpDir = pubGeneric.makeTempDir("bingData") maxCommon.delOnExit(tmpDir) logging.info("Extracting to %s" % tmpDir) cmd =["unzip", "-d",tmpDir, zipFname] maxCommon.runCommand(cmd) tempFname = join(tmpDir, tsvName) logging.info("Indexing %s to %s" % (tempFname, outFname)) # index lines ofh = gzip.open(outFname, "w") ifh = open(tempFname, "rb") offset = 0 # the file iterator does not work with tell()!! #for line in ifh: while True: line = ifh.readline() if line=="": break url = line[0:line.find("\t")] ofh.write("%s\t%d\n" % (url, offset)) #logging.debug("url %s, offset %d" % (url, offset)) offset = ifh.tell() ofh.close() # re-compress with gzip tmpFnames = glob.glob(join(tmpDir, "*.tsv")) assert(len(tmpFnames)==1) tmpFname = tmpFnames[0] zipDir = dirname(zipFname) finalFname = join(zipDir, tsvName+".gz") logging.info("Compressing to %s" % finalFname) #cmd = "%s %s -c > %s" % (bgzipPath, tmpFname, finalFname) cmd = "gzip %s -c > %s" % (tmpFname, finalFname) maxCommon.runCommand(cmd) shutil.rmtree(tmpDir)
def blatFasta(self, db, faFname, params=[]):
    """ blat a fasta file against a db via its gfServer, sort the psl output
    and filter it with pslCDnaFilter into a temporary psl file.

    params: extra command line arguments appended to the gfClient call.
    Returns a tuple (fileObject, filename) of the temporary psl file.
    """
    seqDir = join(self.seqDir, db)
    # (removed: unused local 'outFnames = defaultdict(list)')
    logging.debug("Blatting %s against %s" % (faFname, seqDir))
    server, port = self.blatServers[db]
    tmpFh, tmpFname = pubGeneric.makeTempFile("blatOut.")
    # build a shell pipe: gfClient | sort by query name | pslCDnaFilter
    cmd1 = ["gfClient", server, str(port), seqDir, faFname, "stdout", "-nohead"]
    cmd1.extend(params)
    cmd2 = ["sort", "-k10,10 "]
    cmd3 = ["pslCDnaFilter", "stdin", tmpFname, \
        "-globalNearBest=0", "-filterWeirdOverlapped", "-ignoreIntrons"]
    cmds = []
    cmds.append(" ".join(cmd1))
    cmds.append(" ".join(cmd2))
    cmds.append(" ".join(cmd3))
    cmd = "|".join(cmds)
    maxCommon.runCommand(cmd)
    return tmpFh, tmpFname
def zipExtract(tmpDir, zipName, filename):
    """ extract filename in zipName to tmpDir, delete the temp file and
    return its content as a string; returns None if unzip failed.

    (thought that this was faster than python's zipfile, but it isn't)
    """
    cmd = ["unzip", "-d", tmpDir, zipName, filename]
    ret = maxCommon.runCommand(cmd, ignoreErrors=True)
    if ret!=0:
        return None
    tmpFname = join(tmpDir, filename)
    # close the handle explicitly instead of leaking it
    fh = open(tmpFname)
    data = fh.read()
    fh.close()
    os.remove(tmpFname)
    return data
def zipExtract(tmpDir, zipName, filename):
    """ unzip a single member 'filename' from zipName into tmpDir, return
    its contents as a string and remove the extracted file again.
    Returns None when unzip reports an error.

    (thought that this was faster than python's zipfile, but it isn't)
    """
    unzipCmd = ["unzip", "-d", tmpDir, zipName, filename]
    retCode = maxCommon.runCommand(unzipCmd, ignoreErrors=True)
    if retCode != 0:
        return None
    extractedPath = join(tmpDir, filename)
    contents = open(extractedPath).read()
    os.remove(extractedPath)
    return contents
def renameTablesRegex(db, exprOrList, fromStr, toStr):
    """ rename tables that match mysql expr or are given as a list, applying
    the regex substitution fromStr -> toStr to each old name.

    Only tables that currently exist (per listTables) are renamed.
    """
    if isinstance(exprOrList, str):
        tables = listTables(db, exprOrList)
    else:
        tables = exprOrList

    reFrom = re.compile(fromStr)
    renameDesc = []
    for oldTable in tables:
        newTable = reFrom.sub(toStr, oldTable)
        existTables = listTables(db, oldTable)
        if len(existTables) != 0:
            renameDesc.append([oldTable, newTable])
            logging.debug("Renaming table %s -> %s" % (oldTable, newTable))

    parts = []
    for oldName, newName in renameDesc:
        parts.append("%s TO %s" % (oldName, newName))
    # guard against an empty rename list, consistent with renameTables():
    # "RENAME TABLE" with no clauses is a mysql syntax error
    if len(parts) == 0:
        logging.debug("No table found, not renaming anything")
        return
    sqlCmd = "RENAME TABLE " + ", ".join(parts)
    cmd = """hgsql %s -NB -e '%s'""" % (db, sqlCmd)
    maxCommon.runCommand(cmd, verbose=False)
def renameTablesRegex(db, exprOrList, fromStr, toStr):
    " rename tables that match mysql expr or are given as a list from regex fromStr to toStr "
    # NOTE(review): duplicate of a renameTablesRegex definition earlier in
    # this file
    # exprOrList may be a mysql LIKE expression or an explicit name list
    if isinstance(exprOrList, str):
        tables = listTables(db, exprOrList)
    else:
        tables = exprOrList
    reFrom = re.compile(fromStr)
    renameDesc = []
    for oldTable in tables:
        # new name = regex substitution applied to the old name
        newTable = reFrom.sub(toStr, oldTable)
        # only rename tables that actually exist in db
        existTables = listTables(db, oldTable)
        if len(existTables)!=0:
            renameDesc.append( [oldTable, newTable] )
            logging.debug("Renaming table %s -> %s" % (oldTable, newTable))
    parts = []
    for oldName, newName in renameDesc:
        parts.append("%s TO %s" % (oldName, newName))
    # NOTE(review): unlike renameTables(), there is no guard for an empty
    # parts list - if nothing matched, hgsql gets a bare "RENAME TABLE"
    # and will fail; confirm callers always pass at least one match
    sqlCmd = "RENAME TABLE "+", ".join(parts)
    cmd = """hgsql %s -NB -e '%s'""" % (db, sqlCmd)
    maxCommon.runCommand(cmd, verbose=False)
def getImages(pdfName):
    """ returns a list of tuples
    (imgId (int), isThumbnail (int), width, height, md5sum, PNGBinarydataBlob)
    extracted from pdfName. returns two tuples per image, one is the
    original, one is the thumbnail.
    """
    logging.debug("Extracting images from %s" % pdfName)
    # extract all embedded images as .ppm files into a temp directory
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        x, y = pbmSize(open(fname))
        # skip decorative images (rulers, spacers etc.)
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue
        logging.debug("Loading image into sqlite")
        # full-size image: convert ppm -> png via imagemagick
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)
        pngBlob = open(outFname).read()
        md5Str = makeMd5(pngBlob)
        # isThumbnail=0 marks the original image
        data.append( (imgId, 0, x, y, md5Str, pngBlob) )

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)
        x, y = pngDimensions(thumbFName)
        pngBlob = open(thumbFName).read()
        md5Str = makeMd5(pngBlob)
        # isThumbnail=1 marks the scaled-down copy of the same imgId
        data.append( (imgId, 1, x, y, md5Str, pngBlob) )
        imgId += 1

    shutil.rmtree(tempDir)
    # already removed manually, no need to delete again at exit
    maxCommon.ignoreOnExit(tempDir)
    return data
def truncateTable(db, table):
    """ remove all rows from table in db, keeping its schema """
    logging.debug("Truncating table %s" % table)
    hgsqlCmd = """hgsql %s -NB -e 'truncate table %s'""" % (db, table)
    maxCommon.runCommand(hgsqlCmd, verbose=False)
def dropTable(db, table):
    """ drop table from db; a no-op on the mysql side if it does not exist """
    logging.debug("Dropping table %s" % table)
    hgsqlCmd = """hgsql %s -NB -e 'drop table if exists %s'""" % (db, table)
    maxCommon.runCommand(hgsqlCmd, verbose=False)
def getImages(pdfName):
    """ returns a list of tuples
    (imgId (int), isThumbnail (int), width, height, md5sum, PNGBinarydataBlob)
    extracted from pdfName. returns two tuples per image, one is the
    original, one is the thumbnail.

    Returns None if pdfName is actually an HTML file. Images whose md5 is on
    the blacklist are skipped entirely.
    """
    loadBlacklist()
    # some "PDFs" are really HTML error pages; detect and skip them
    head = open(pdfName).read(30)
    if "<html" in head or "<HTML" in head:
        logging.info("PDF %s is an HTML file, skipping" % pdfName)
        return None

    logging.debug("Extracting images from %s" % pdfName)
    # extract all embedded images as .ppm files into a temp directory
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        x, y = pbmSize(open(fname))
        # skip decorative images (rulers, spacers etc.)
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue
        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)
        pngBlob = open(outFname).read()
        md5Str = makeMd5(pngBlob)
        # (removed: leftover debug 'print "XX", ...' statement that dumped
        # the md5 and blacklist sample to stdout on every image)
        if md5Str in md5Blacklist:
            logging.debug("Image MD5 is blacklisted")
            continue
        # isThumbnail=0 marks the original image
        data.append((imgId, 0, x, y, md5Str, pngBlob))

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)
        x, y = pngDimensions(thumbFName)
        pngBlob = open(thumbFName).read()
        md5Str = makeMd5(pngBlob)
        # isThumbnail=1 marks the scaled-down copy of the same imgId
        data.append((imgId, 1, x, y, md5Str, pngBlob))
        imgId += 1

    shutil.rmtree(tempDir)
    # already removed manually, no need to delete again at exit
    maxCommon.ignoreOnExit(tempDir)
    return data