Beispiel #1
0
def hgLoadSqlTab(db, tableName, sqlName, tabFname, optString=""):
    """ load the file tabFname into table tableName of db with hgLoadSqlTab

    sqlName is passed as the third argument of the hgLoadSqlTab command
    (presumably the .sql file with the table definition - confirm against the
    tool's usage message). optString is appended verbatim to the command line.

    If tabFname does not exist, a warning is logged and nothing is run.
    """
    if not isfile(tabFname):
        # logging.warn is only a deprecated alias of logging.warning
        logging.warning("file %s not found" % tabFname)
        return

    cmd = "hgLoadSqlTab %s %s %s %s %s" % (db, tableName, sqlName,
                                           tabFname, optString)
    maxCommon.runCommand(cmd, verbose=False)
Beispiel #2
0
    def allResults(self):
        """ run the sentences in self.rows through the MSR pipeline

        self.rows is a list of rows with sentences as their -1 field. The rows
        are written out with writeMsrIn(), runMsr.sh from msrDir is run on that
        file and its output is parsed with parseMsrOut(). Every parsed row is
        appended to a copy of the input row at index chunkSentId.

        Returns the list of joined rows.
        """
        tstart = datetime.now()
        inFh, tempFnameIn = writeMsrIn(self.rows)
        ofh2 = None
        try:
            logging.info("Running MSR pipeline on %d sentences" % len(self.rows))
            ofh2, tempFnameOut = pubGeneric.makeTempFile("msrNlpOut", ".txt")

            cmd = "%s/runMsr.sh %s %s" % (msrDir, tempFnameIn, tempFnameOut)
            maxCommon.runCommand(cmd)

            joinedRows = []
            logging.info("Parsing MSR output")
            for msrRow in parseMsrOut(tempFnameOut):
                # chunkSentId is used as the index of the matching input row
                textRow = list(self.rows[int(msrRow.chunkSentId)])
                textRow.extend(msrRow)
                joinedRows.append(textRow)
        finally:
            # close both handles also when the pipeline or the parser fails
            inFh.close()
            if ofh2 is not None:
                ofh2.close()
        # lazy argument: the repr is only built if debug logging is enabled
        logging.debug("results %r", joinedRows)

        tend = datetime.now()
        # total_seconds() includes the days part, .seconds alone restarts at 0
        # after 24 hours
        secs = (tend - tstart).total_seconds()
        logging.info("msr runtime: %d" % secs)
        return joinedRows
Beispiel #3
0
    def allResults(self):
        """ run the sentences in self.rows through the MSR pipeline

        self.rows is a list of rows with sentences as their -1 field. The rows
        are written out with writeMsrIn(), runMsr.sh from msrDir is run on that
        file and its output is parsed with parseMsrOut(). Every parsed row is
        appended to a copy of the input row at index chunkSentId.

        Returns the list of joined rows.
        """
        tstart = datetime.now()
        inFh, tempFnameIn = writeMsrIn(self.rows)
        ofh2 = None
        try:
            logging.info("Running MSR pipeline on %d sentences" % len(self.rows))
            ofh2, tempFnameOut = pubGeneric.makeTempFile("msrNlpOut", ".txt")

            cmd = "%s/runMsr.sh %s %s" % (msrDir, tempFnameIn, tempFnameOut)
            maxCommon.runCommand(cmd)

            joinedRows = []
            logging.info("Parsing MSR output")
            for msrRow in parseMsrOut(tempFnameOut):
                # chunkSentId is used as the index of the matching input row
                textRow = list(self.rows[int(msrRow.chunkSentId)])
                textRow.extend(msrRow)
                joinedRows.append(textRow)
        finally:
            # close both handles also when the pipeline or the parser fails
            inFh.close()
            if ofh2 is not None:
                ofh2.close()
        # lazy argument: the repr is only built if debug logging is enabled
        logging.debug("results %r", joinedRows)

        tend = datetime.now()
        # total_seconds() includes the days part, .seconds alone restarts at 0
        # after 24 hours
        secs = (tend - tstart).total_seconds()
        logging.info("msr runtime: %d" % secs)
        return joinedRows
Beispiel #4
0
 def _gzipAndMove(self, fname, finalName):
     """ compress fname with gzip and place the result at finalName

     An already existing fname.gz is deleted before gzip is run. The
     compressed file is then copied to finalName and the local .gz removed.
     """
     compressedName = fname+".gz"
     alreadyThere = isfile(compressedName)
     if alreadyThere:
         os.remove(compressedName)

     gzipCmd = "gzip %s" % fname
     maxCommon.runCommand(gzipCmd)

     logging.debug("compressing and copying files table to %s" % finalName)
     # copy + delete, equivalent to a move of the compressed file
     shutil.copyfile(compressedName, finalName)
     os.remove(compressedName)
Beispiel #5
0
def listTables(db, expr):
    """ return list of table names in db that match the mysql LIKE pattern expr

    Runs 'show tables like "expr"' through hgsql, redirects the output into a
    temporary file and returns its lines, stripped of surrounding whitespace.
    """
    # the with-block closes the temp file on every exit path
    with tempfile.NamedTemporaryFile(prefix="pubBlat.dropTables") as tmpFile:
        tmpName = tmpFile.name
        cmd = """hgsql %s -NB -e 'show tables like "%s"' > %s """ % (db, expr, tmpName)
        maxCommon.runCommand(cmd)

        # hgsql wrote to the file by name, so read it through a second handle
        with open(tmpName) as resultFh:
            return [line.strip() for line in resultFh]
Beispiel #6
0
def listTables(db, expr):
    """ return list of table names in db that match the mysql LIKE pattern expr

    Runs 'show tables like "expr"' through hgsql, redirects the output into a
    temporary file and returns its lines, stripped of surrounding whitespace.
    """
    # the with-block closes the temp file on every exit path
    with tempfile.NamedTemporaryFile(prefix="pubBlat.dropTables") as tmpFile:
        tmpName = tmpFile.name
        cmd = """hgsql %s -NB -e 'show tables like "%s"' > %s """ % (db, expr,
                                                                     tmpName)
        maxCommon.runCommand(cmd)

        # hgsql wrote to the file by name, so read it through a second handle
        with open(tmpName) as resultFh:
            return [line.strip() for line in resultFh]
Beispiel #7
0
def startRedis(dbFname):
    """ starts redis on current server as daemon.
    Creates status files with filename dbFname+".pid" (written by redis-server
    via --pidfile) and dbFname+".host" (written here, content is "ip:port").
    Both files are registered for deletion on exit and the port is added to
    the global redisPorts list.

    Blocks until the server accepts connections.
    Returns a tuple (host, port), host is always the string "localhost".

    >>> import pubGeneric
    >>> pubGeneric.setupLogging(__file__, None)
    >>> h, p = startRedis("/tmp/test.tab.gz")
    >>> r = redis.Redis(port=p)
    >>> r.set("hello", "world")
    True
    >>> r.get("hello")
    'world'
    >>> r.get("world")
    >>> r.shutdown()
    """
    dbFname = abspath(dbFname)
    pidFname = dbFname + ".pid"
    port = findFreePort()
    dirName = dirname(dbFname)
    baseName = basename(dbFname) + ".rdb"

    hostFname = dbFname + ".host"
    hostname = socket.gethostbyname("localhost")
    hostDesc = hostname + ":" + str(port)
    # explicit close, so the host info is on disk before the server is started
    with open(hostFname, "w") as hostFh:
        hostFh.write(hostDesc)
    logging.info("Wrote redis host info %s to %s" % (hostDesc, hostFname))
    maxCommon.delOnExit(hostFname)
    maxCommon.delOnExit(pidFname)
    atexit.register(shutdownRedisServers)
    global redisPorts
    redisPorts.append(port)

    cmd = ["redis-server", "--daemonize", "yes", "--pidfile", pidFname,
           "--port", str(port), "--rdbchecksum", "no", "--dir", dirName,
           "--dbfilename", baseName, "--maxmemory", "200gb"]
    logging.info("Starting up redis server on localhost")
    maxCommon.runCommand(cmd)

    # wait until startup is complete: poll once per second until a command
    # goes through. NOTE(review): there is no upper limit on the wait time.
    while True:
        try:
            dbSize = redis.Redis(port=port).dbsize()
            break
        except redis.ConnectionError:
            logging.info("Waiting for 1 sec for redis startup completion")
            time.sleep(1)
    logging.info("Redis startup completed, dbSize=%d" % dbSize)

    return "localhost", port
Beispiel #8
0
def startRedis(dbFname):
    """ starts redis on current server as daemon.
    Creates status files with filename dbFname+".pid" (written by redis-server
    via --pidfile) and dbFname+".host" (written here, content is "ip:port").
    Both files are registered for deletion on exit and the port is added to
    the global redisPorts list.

    Blocks until the server accepts connections.
    Returns a tuple (host, port), host is always the string "localhost".

    >>> import pubGeneric
    >>> pubGeneric.setupLogging(__file__, None)
    >>> h, p = startRedis("/tmp/test.tab.gz")
    >>> r = redis.Redis(port=p)
    >>> r.set("hello", "world")
    True
    >>> r.get("hello")
    'world'
    >>> r.get("world")
    >>> r.shutdown()
    """
    dbFname = abspath(dbFname)
    pidFname = dbFname+".pid"
    port = findFreePort()
    dirName = dirname(dbFname)
    baseName = basename(dbFname)+".rdb"

    hostFname = dbFname+".host"
    hostname = socket.gethostbyname("localhost")
    hostDesc = hostname+":"+str(port)
    # explicit close, so the host info is on disk before the server is started
    with open(hostFname, "w") as hostFh:
        hostFh.write(hostDesc)
    logging.info("Wrote redis host info %s to %s" % (hostDesc, hostFname))
    maxCommon.delOnExit(hostFname)
    maxCommon.delOnExit(pidFname)
    atexit.register(shutdownRedisServers)
    global redisPorts
    redisPorts.append(port)

    cmd = ["redis-server", "--daemonize", "yes", "--pidfile", pidFname,
           "--port", str(port), "--rdbchecksum", "no", "--dir", dirName,
           "--dbfilename", baseName, "--maxmemory", "200gb"]
    logging.info("Starting up redis server on localhost")
    maxCommon.runCommand(cmd)

    # wait until startup is complete: poll once per second until a command
    # goes through. NOTE(review): there is no upper limit on the wait time.
    while True:
        try:
            dbSize = redis.Redis(port=port).dbsize()
            break
        except redis.ConnectionError:
            logging.info("Waiting for 1 sec for redis startup completion")
            time.sleep(1)
    logging.info("Redis startup completed, dbSize=%d" % dbSize)

    return "localhost", port
Beispiel #9
0
def hgGetAllRows(db, tableName, tempDir):
    """ return all rows of table as a list of rows, every row is a list of strings

    Runs "SELECT * from tableName" through hgsql, redirects the output into a
    temporary file in tempDir and splits every line of it on tabs.
    """
    query = "SELECT * from %s" % tableName
    # the with-block closes the temp file on every exit path
    with tempfile.NamedTemporaryFile(prefix="maxMysql_hgGetAllRows", dir=tempDir) as tempFile:
        cmd = 'hgsql %s -NB -e "%s" > %s' % (db, query, tempFile.name)
        maxCommon.runCommand(cmd)

        # hgsql wrote to the file by name, so read it through a second handle
        with open(tempFile.name, "r") as rowFh:
            return [line.strip("\n").split("\t") for line in rowFh]
Beispiel #10
0
def hgGetAllRows(db, tableName, tempDir):
    """ return all rows of table as a list of rows, every row is a list of strings

    Runs "SELECT * from tableName" through hgsql, redirects the output into a
    temporary file in tempDir and splits every line of it on tabs.
    """
    query = "SELECT * from %s" % tableName
    # the with-block closes the temp file on every exit path
    with tempfile.NamedTemporaryFile(prefix="maxMysql_hgGetAllRows",
                                     dir=tempDir) as tempFile:
        cmd = 'hgsql %s -NB -e "%s" > %s' % (db, query, tempFile.name)
        maxCommon.runCommand(cmd)

        # hgsql wrote to the file by name, so read it through a second handle
        with open(tempFile.name, "r") as rowFh:
            return [line.strip("\n").split("\t") for line in rowFh]
Beispiel #11
0
def renameTables(db, fromList, toList, checkExists=False):
    """ rename mysql tables in db, the i-th table of fromList gets the i-th name of toList

    fromList and toList must have the same length. If checkExists is set,
    tables for which tableExists() is false are skipped. Nothing is run when
    no table is left to rename.
    """
    assert(len(fromList)==len(toList))
    logging.debug("Renaming mysql tables %s to %s" % (fromList, toList))

    renameClauses = []
    for srcTable, dstTable in zip(fromList, toList):
        # the existence lookup is only made when the caller asked for it
        tableMissing = checkExists and not tableExists(db, srcTable)
        if tableMissing:
            logging.debug("Could not find table %s, %s" % (db, srcTable))
            continue
        renameClauses.append("%s TO %s" % (srcTable, dstTable))

    if not renameClauses:
        logging.debug("No table found, not renaming anything")
        return

    # all renames go into a single RENAME TABLE statement
    sqlCmd = "RENAME TABLE %s" % ", ".join(renameClauses)
    hgsqlCmd = "hgsql %s -NB -e '%s'" % (db, sqlCmd)
    maxCommon.runCommand(hgsqlCmd, verbose=False)
Beispiel #12
0
def renameTables(db, fromList, toList, checkExists=False):
    """ rename mysql tables in db, the i-th table of fromList gets the i-th name of toList

    fromList and toList must have the same length. If checkExists is set,
    tables for which tableExists() is false are skipped. Nothing is run when
    no table is left to rename.
    """
    assert (len(fromList) == len(toList))
    logging.debug("Renaming mysql tables %s to %s" % (fromList, toList))

    renameClauses = []
    for srcTable, dstTable in zip(fromList, toList):
        # the existence lookup is only made when the caller asked for it
        tableMissing = checkExists and not tableExists(db, srcTable)
        if tableMissing:
            logging.debug("Could not find table %s, %s" % (db, srcTable))
            continue
        renameClauses.append("%s TO %s" % (srcTable, dstTable))

    if not renameClauses:
        logging.debug("No table found, not renaming anything")
        return

    # all renames go into a single RENAME TABLE statement
    sqlCmd = "RENAME TABLE %s" % ", ".join(renameClauses)
    hgsqlCmd = "hgsql %s -NB -e '%s'" % (db, sqlCmd)
    maxCommon.runCommand(hgsqlCmd, verbose=False)
Beispiel #13
0
def indexTsv(zipFname, tsvName, outFname):
    """ unzip zipFname, index the file tsvName inside it and recompress it with gzip

    The zip file is extracted to a temporary directory. For every line of
    tsvName, a line with the text before the first tab and the byte offset of
    the line is written to the gzipped index outFname. The single .tsv file
    found in the temporary directory is then gzipped to tsvName+".gz" in the
    directory of zipFname and the temporary directory is removed.
    """
    # extract to local disk
    tmpDir = pubGeneric.makeTempDir("bingData")
    maxCommon.delOnExit(tmpDir)
    logging.info("Extracting to %s" % tmpDir)
    cmd = ["unzip", "-d", tmpDir, zipFname]
    maxCommon.runCommand(cmd)

    tempFname = join(tmpDir, tsvName)
    logging.info("Indexing %s to %s" % (tempFname, outFname))
    # index lines
    ofh = gzip.open(outFname, "w")
    try:
        with open(tempFname, "rb") as ifh:
            offset = 0
            # the file iterator does not work with tell(), so the lines are
            # read with readline()
            while True:
                line = ifh.readline()
                if not line:
                    break
                # NOTE(review): for a line without a tab, find() returns -1
                # and only the last character of the line is cut off
                url = line[0:line.find("\t")]
                ofh.write("%s\t%d\n" % (url, offset))
                offset = ifh.tell()
    finally:
        # close the index also when reading the tsv fails
        ofh.close()

    # re-compress with gzip
    tmpFnames = glob.glob(join(tmpDir, "*.tsv"))
    assert (len(tmpFnames) == 1)
    tmpFname = tmpFnames[0]
    zipDir = dirname(zipFname)
    finalFname = join(zipDir, tsvName + ".gz")
    logging.info("Compressing to %s" % finalFname)
    cmd = "gzip %s -c > %s" % (tmpFname, finalFname)
    maxCommon.runCommand(cmd)
    shutil.rmtree(tmpDir)
Beispiel #14
0
def indexTsv(zipFname, tsvName, outFname):
    """ unzip zipFname, index the file tsvName inside it and recompress it with gzip

    The zip file is extracted to a temporary directory. For every line of
    tsvName, a line with the text before the first tab and the byte offset of
    the line is written to the gzipped index outFname. The single .tsv file
    found in the temporary directory is then gzipped to tsvName+".gz" in the
    directory of zipFname and the temporary directory is removed.
    """
    # extract to local disk
    tmpDir = pubGeneric.makeTempDir("bingData")
    maxCommon.delOnExit(tmpDir)
    logging.info("Extracting to %s" % tmpDir)
    cmd = ["unzip", "-d", tmpDir, zipFname]
    maxCommon.runCommand(cmd)

    tempFname = join(tmpDir, tsvName)
    logging.info("Indexing %s to %s" % (tempFname, outFname))
    # index lines
    ofh = gzip.open(outFname, "w")
    try:
        with open(tempFname, "rb") as ifh:
            offset = 0
            # the file iterator does not work with tell(), so the lines are
            # read with readline()
            while True:
                line = ifh.readline()
                if not line:
                    break
                # NOTE(review): for a line without a tab, find() returns -1
                # and only the last character of the line is cut off
                url = line[0:line.find("\t")]
                ofh.write("%s\t%d\n" % (url, offset))
                offset = ifh.tell()
    finally:
        # close the index also when reading the tsv fails
        ofh.close()

    # re-compress with gzip
    tmpFnames = glob.glob(join(tmpDir, "*.tsv"))
    assert(len(tmpFnames)==1)
    tmpFname = tmpFnames[0]
    zipDir = dirname(zipFname)
    finalFname = join(zipDir, tsvName+".gz")
    logging.info("Compressing to %s" % finalFname)
    cmd = "gzip %s -c > %s" % (tmpFname, finalFname)
    maxCommon.runCommand(cmd)
    shutil.rmtree(tmpDir)
Beispiel #15
0
 def blatFasta(self, db, faFname, params=None):
     """ blat the fasta file faFname against db and write the filtered hits to a temp psl file

     Runs gfClient against the server registered for db in self.blatServers,
     sorts its output on field 10 and pipes it through pslCDnaFilter into a
     temp file. params is an optional list of additional gfClient arguments.

     returns a tuple (file object, filename) of the temp file
     """
     seqDir = join(self.seqDir, db)
     logging.debug("Blatting %s against %s" % (faFname, seqDir))
     server, port = self.blatServers[db]
     tmpFh, tmpFname = pubGeneric.makeTempFile("blatOut.")

     cmd1 = ["gfClient", server, str(port), seqDir, faFname, "stdout", "-nohead"]
     # None instead of a mutable [] default, a default list is shared between calls
     if params is not None:
         cmd1.extend(params)
     cmd2 = ["sort", "-k10,10 "]
     cmd3 = ["pslCDnaFilter", "stdin", tmpFname,
             "-globalNearBest=0", "-filterWeirdOverlapped", "-ignoreIntrons"]

     # the three commands are joined into a single shell pipeline
     cmd = "|".join(" ".join(cmdParts) for cmdParts in (cmd1, cmd2, cmd3))
     maxCommon.runCommand(cmd)
     return tmpFh, tmpFname
Beispiel #16
0
def zipExtract(tmpDir, zipName, filename):
    """ extract filename in zipName to tmpDir, delete tmpfile and return as string

    Uses the external unzip command. Returns None if unzip exits with a
    non-zero return code.
    (thought that this was faster than python's zipfile, but it isn't)
    """
    cmd = ["unzip", "-d", tmpDir, zipName, filename]
    ret = maxCommon.runCommand(cmd, ignoreErrors=True)
    if ret != 0:
        return None

    tmpFname = join(tmpDir, filename)
    # with-block: the handle is closed before the file is deleted
    with open(tmpFname) as extractedFh:
        data = extractedFh.read()
    os.remove(tmpFname)
    return data
Beispiel #17
0
def zipExtract(tmpDir, zipName, filename):
    """ extract filename in zipName to tmpDir, delete tmpfile and return as string

    Uses the external unzip command. Returns None if unzip exits with a
    non-zero return code.
    (thought that this was faster than python's zipfile, but it isn't)
    """
    cmd = ["unzip", "-d", tmpDir, zipName, filename]
    ret = maxCommon.runCommand(cmd, ignoreErrors=True)
    if ret != 0:
        return None

    tmpFname = join(tmpDir, filename)
    # with-block: the handle is closed before the file is deleted
    with open(tmpFname) as extractedFh:
        data = extractedFh.read()
    os.remove(tmpFname)
    return data
Beispiel #18
0
def renameTablesRegex(db, exprOrList, fromStr, toStr):
    """ rename tables by replacing the regex fromStr with toStr in their names

    exprOrList is either a mysql expression, then the tables are looked up
    with listTables(), or a list of table names. Tables for which listTables()
    finds nothing are skipped. Nothing is run when no table is left to rename.
    """
    if isinstance(exprOrList, str):
        tables = listTables(db, exprOrList)
    else:
        tables = exprOrList

    reFrom = re.compile(fromStr)
    parts = []
    for oldTable in tables:
        newTable = reFrom.sub(toStr, oldTable)
        existTables = listTables(db, oldTable)
        if len(existTables) != 0:
            parts.append("%s TO %s" % (oldTable, newTable))
            logging.debug("Renaming table %s -> %s" % (oldTable, newTable))

    if len(parts) == 0:
        # a RENAME TABLE statement without any table pair is not valid SQL
        logging.debug("No table found, not renaming anything")
        return
    sqlCmd = "RENAME TABLE " + ", ".join(parts)

    cmd = """hgsql %s -NB -e '%s'""" % (db, sqlCmd)
    maxCommon.runCommand(cmd, verbose=False)
Beispiel #19
0
def renameTablesRegex(db, exprOrList, fromStr, toStr):
    """ rename tables by replacing the regex fromStr with toStr in their names

    exprOrList is either a mysql expression, then the tables are looked up
    with listTables(), or a list of table names. Tables for which listTables()
    finds nothing are skipped. Nothing is run when no table is left to rename.
    """
    if isinstance(exprOrList, str):
        tables = listTables(db, exprOrList)
    else:
        tables = exprOrList

    reFrom = re.compile(fromStr)
    parts = []
    for oldTable in tables:
        newTable = reFrom.sub(toStr, oldTable)
        existTables = listTables(db, oldTable)
        if len(existTables)!=0:
            parts.append("%s TO %s" % (oldTable, newTable))
            logging.debug("Renaming table %s -> %s" % (oldTable, newTable))

    if len(parts)==0:
        # a RENAME TABLE statement without any table pair is not valid SQL
        logging.debug("No table found, not renaming anything")
        return
    sqlCmd = "RENAME TABLE "+", ".join(parts)

    cmd = """hgsql %s -NB -e '%s'""" % (db, sqlCmd)
    maxCommon.runCommand(cmd, verbose=False)
Beispiel #20
0
def getImages(pdfName):
    """ returns a list of tuples
    (imgId (int), isThumbnail (int), width, height, md5sum, PNGBinarydataBlob) extracted from pdfName.
    returns two tuples per image, one is the original, one is the thumbnail.

    The images are extracted with pdfimages into a temporary directory and
    converted with ImageMagick's convert. Images for which looksInteresting()
    is false are skipped and do not get an imgId.
    """
    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        # with-block so the handle does not stay open for the rest of the loop
        with open(fname) as ppmFh:
            x, y = pbmSize(ppmFh)
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)

        # PNG is binary data, so it must not go through text-mode translation
        with open(outFname, "rb") as pngFh:
            pngBlob = pngFh.read()
        md5Str = makeMd5(pngBlob)

        data.append((imgId, 0, x, y, md5Str, pngBlob))

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)

        x, y = pngDimensions(thumbFName)
        with open(thumbFName, "rb") as thumbFh:
            pngBlob = thumbFh.read()
        md5Str = makeMd5(pngBlob)

        # original and thumbnail share the same imgId
        data.append((imgId, 1, x, y, md5Str, pngBlob))

        imgId += 1

    # the directory is removed here, so take it off the delete-on-exit list
    shutil.rmtree(tempDir)
    maxCommon.ignoreOnExit(tempDir)
    return data
Beispiel #21
0
def truncateTable(db, table):
    """ run 'truncate table' on table in db through hgsql """
    logging.debug("Truncating table %s" % table)
    sqlCmd = "truncate table %s" % (table,)
    hgsqlCmd = "hgsql %s -NB -e '%s'" % (db, sqlCmd)
    maxCommon.runCommand(hgsqlCmd, verbose=False)
Beispiel #22
0
def truncateTable(db, table):
    """ run 'truncate table' on table in db through hgsql """
    logging.debug("Truncating table %s" % table)
    sqlCmd = "truncate table %s" % (table,)
    hgsqlCmd = "hgsql %s -NB -e '%s'" % (db, sqlCmd)
    maxCommon.runCommand(hgsqlCmd, verbose=False)
Beispiel #23
0
def dropTable(db, table):
    """ run 'drop table if exists' on table in db through hgsql """
    logging.debug("Dropping table %s" % table)
    sqlCmd = "drop table if exists %s" % (table,)
    hgsqlCmd = "hgsql %s -NB -e '%s'" % (db, sqlCmd)
    maxCommon.runCommand(hgsqlCmd, verbose=False)
Beispiel #24
0
def hgLoadSqlTab(db, tableName, sqlName, tabFname, optString=""):
    """ load the file tabFname into table tableName of db with hgLoadSqlTab

    sqlName is passed as the third argument of the hgLoadSqlTab command
    (presumably the .sql file with the table definition - confirm against the
    tool's usage message). optString is appended verbatim to the command line.

    If tabFname does not exist, a warning is logged and nothing is run.
    """
    if not isfile(tabFname):
        # logging.warn is only a deprecated alias of logging.warning
        logging.warning("file %s not found" % tabFname)
        return

    cmd = "hgLoadSqlTab %s %s %s %s %s" % (db, tableName, sqlName, tabFname, optString)
    maxCommon.runCommand(cmd, verbose=False)
Beispiel #25
0
def dropTable(db, table):
    """ run 'drop table if exists' on table in db through hgsql """
    logging.debug("Dropping table %s" % table)
    sqlCmd = "drop table if exists %s" % (table,)
    hgsqlCmd = "hgsql %s -NB -e '%s'" % (db, sqlCmd)
    maxCommon.runCommand(hgsqlCmd, verbose=False)
Beispiel #26
0
def getImages(pdfName):
    """ returns a list of tuples
    (imgId (int), isThumbnail (int), width, height, md5sum, PNGBinarydataBlob) extracted from pdfName.
    returns two tuples per image, one is the original, one is the thumbnail.

    Returns None if the first 30 characters of pdfName contain an html tag.
    The images are extracted with pdfimages into a temporary directory and
    converted with ImageMagick's convert. Images for which looksInteresting()
    is false or whose md5 is in md5Blacklist are skipped and do not get an
    imgId.
    """
    loadBlacklist()

    # some downloaded "PDFs" are really HTML pages
    with open(pdfName) as pdfFh:
        head = pdfFh.read(30)
    if "<html" in head or "<HTML" in head:
        logging.info("PDF %s is an HTML file, skipping" % pdfName)
        return None

    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        # with-block so the handle does not stay open for the rest of the loop
        with open(fname) as ppmFh:
            x, y = pbmSize(ppmFh)
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)

        # PNG is binary data, so it must not go through text-mode translation
        with open(outFname, "rb") as pngFh:
            pngBlob = pngFh.read()
        md5Str = makeMd5(pngBlob)

        # goes to the debug log instead of a print to stdout
        logging.debug("Image MD5 is %s", md5Str)
        if md5Str in md5Blacklist:
            logging.debug("Image MD5 is blacklisted")
            continue

        data.append((imgId, 0, x, y, md5Str, pngBlob))

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)

        x, y = pngDimensions(thumbFName)
        with open(thumbFName, "rb") as thumbFh:
            pngBlob = thumbFh.read()
        md5Str = makeMd5(pngBlob)

        # original and thumbnail share the same imgId
        data.append((imgId, 1, x, y, md5Str, pngBlob))

        imgId += 1

    # the directory is removed here, so take it off the delete-on-exit list
    shutil.rmtree(tempDir)
    maxCommon.ignoreOnExit(tempDir)
    return data