Example #1
0
    def allResults(self):
        """ Run the rows in self.rows through the MSR pipeline.

        The rows are written to an input file by writeMsrIn(), runMsr.sh from
        msrDir is run on that file and writes to a temp output file, and the
        output is parsed with parseMsrOut(). Every parsed output row is appended
        to a copy of the input row at index int(msrRow.chunkSentId).

        Returns a list of lists: the fields of the input row followed by the
        fields of the matching MSR output row.

        NOTE(review): the original docstring said the sentence is the -1 field of
        each row; which field is actually used is decided by writeMsrIn() - confirm.
        """
        tstart = datetime.now()
        inFh, tempFnameIn = writeMsrIn(self.rows)
        ofh2 = None
        try:
            logging.info("Running MSR pipeline on %d sentences" % len(self.rows))
            ofh2, tempFnameOut = pubGeneric.makeTempFile("msrNlpOut", ".txt")

            cmd = "%s/runMsr.sh %s %s" % (msrDir, tempFnameIn, tempFnameOut)
            maxCommon.runCommand(cmd)

            joinedRows = []
            logging.info("Parsing MSR output")
            for msrRow in parseMsrOut(tempFnameOut):
                # chunkSentId is used as the index of the input row this output belongs to
                textRow = list(self.rows[int(msrRow.chunkSentId)])
                textRow.extend(msrRow)
                joinedRows.append(textRow)
        finally:
            # release both file handles even if the pipeline or the parser fails
            inFh.close()
            if ofh2 is not None:
                ofh2.close()
        # lazy formatting: repr() of all results is only built if debug logging is on
        logging.debug("results %r", joinedRows)

        elapsed = datetime.now() - tstart
        # timedelta.seconds alone wraps around after one day, so add the days part
        secs = elapsed.days * 86400 + elapsed.seconds
        logging.info("msr runtime: %d" % secs)
        return joinedRows
Example #2
0
def loadImages(con, artDict, fileDict):
    """ load all relevant images from PDF into an sqlite db

    con      -- db connection; execute() and commit() are called on it and it
                needs a table 'img' that accepts the 18 values inserted below
    artDict  -- article metadata; title, authors, journal, year, pmid, doi and
                pmcId are optional and default to ""
    fileDict -- file data; content, fileType, desc, url and fileId are required,
                a missing key raises KeyError

    The PDF content is written to a temp file, getImages() is run on it and one
    row per returned image is inserted and committed. Nothing is inserted when
    getImages() returns None. Returns None.
    """
    pdfData = fileDict["content"]
    tmpFile, tmpPdfName = pubGeneric.makeTempFile("pubImgLoad", ".pdf")
    try:
        tmpFile.write(pdfData)
        # flush, as getImages() gets only the file name and not this handle
        tmpFile.flush()

        metaFields = [
            artDict.get("title", ""),
            artDict.get("authors", ""),
            artDict.get("journal", ""),
            artDict.get("year", ""),
            artDict.get("pmid", ""),
            artDict.get("doi", ""),
            artDict.get("pmcId", ""),
            fileDict["fileType"],
            fileDict["desc"],
            fileDict["url"],
            str(fileDict["fileId"]),
        ]

        imgRows = getImages(tmpPdfName)
        # the other loadImages variant in this file treats None as "no images",
        # iterating over None would raise a TypeError here
        if imgRows is None:
            return

        for imgId, isThumbnail, width, height, md5, pngData in imgRows:
            logging.debug("Adding image %d" % imgId)
            size = len(pngData)
            # buffer() wraps the png data, presumably so sqlite stores it as a blob - TODO confirm
            fileInfo = metaFields + [imgId, isThumbnail, width, height, md5, size, buffer(pngData)]
            qStr = ",".join(["?"] * len(fileInfo))
            con.execute('INSERT INTO img values (%s)' % qStr, fileInfo)
            # committed per image, so images added before a failure stay in the db
            con.commit()
    finally:
        # also runs on errors, so the temp file handle is never left open
        tmpFile.close() # = delete
Example #3
0
    def allResults(self):
        """ Run the rows in self.rows through the MSR pipeline.

        The rows are written to an input file by writeMsrIn(), runMsr.sh from
        msrDir is run on that file and writes to a temp output file, and the
        output is parsed with parseMsrOut(). Every parsed output row is appended
        to a copy of the input row at index int(msrRow.chunkSentId).

        Returns a list of lists: the fields of the input row followed by the
        fields of the matching MSR output row.

        NOTE(review): the original docstring said the sentence is the -1 field of
        each row; which field is actually used is decided by writeMsrIn() - confirm.
        """
        tstart = datetime.now()
        inFh, tempFnameIn = writeMsrIn(self.rows)
        ofh2 = None
        try:
            logging.info("Running MSR pipeline on %d sentences" % len(self.rows))
            ofh2, tempFnameOut = pubGeneric.makeTempFile("msrNlpOut", ".txt")

            cmd = "%s/runMsr.sh %s %s" % (msrDir, tempFnameIn, tempFnameOut)
            maxCommon.runCommand(cmd)

            joinedRows = []
            logging.info("Parsing MSR output")
            for msrRow in parseMsrOut(tempFnameOut):
                # chunkSentId is used as the index of the input row this output belongs to
                textRow = list(self.rows[int(msrRow.chunkSentId)])
                textRow.extend(msrRow)
                joinedRows.append(textRow)
        finally:
            # release both file handles even if the pipeline or the parser fails
            inFh.close()
            if ofh2 is not None:
                ofh2.close()
        # lazy formatting: repr() of all results is only built if debug logging is on
        logging.debug("results %r", joinedRows)

        elapsed = datetime.now() - tstart
        # timedelta.seconds alone wraps around after one day, so add the days part
        secs = elapsed.days * 86400 + elapsed.seconds
        logging.info("msr runtime: %d" % secs)
        return joinedRows
Example #4
0
def writeMsrIn(sentRows):
    """ Write one <txt> element per row of sentRows to a file.

    The text of each element is the second-to-last field of the row (row[-2]),
    its pmid attribute is the 0-based position of the row in sentRows.

    If pubAlg.debugMode is set, the file is msrNlpIn.tmp.txt, opened relative to
    the current directory. Otherwise it is a temp file created with
    pubGeneric.makeTempFile.

    Returns a tuple (fileObject, fileName). The file object has been flushed
    but is returned open.
    """
    if pubAlg.debugMode:
        tempFnameIn = "msrNlpIn.tmp.txt"
        ofh = open(tempFnameIn, "w")
    else:
        ofh, tempFnameIn = pubGeneric.makeTempFile("msrNlpIn", ".txt")

    # write sentences to temp file
    logging.info("Writing input sentences to %s" % tempFnameIn)
    for i, row in enumerate(sentRows):
        text = row[-2]
        ofh.write('<txt pmid="%d">\n' % i)
        ofh.write("%s\n" % text)
        ofh.write('</txt>\n\n')
    # flush so the data is on disk for readers that open the file by name
    ofh.flush()

    # only read the whole file back if the debug message will really be emitted,
    # and close the extra read handle afterwards
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        with open(tempFnameIn) as ifh:
            logging.debug("in file: %s" % ifh.read())
    return ofh, tempFnameIn
Example #5
0
def writeMsrIn(sentRows):
    """ Write one <txt> element per row of sentRows to a file.

    The text of each element is the second-to-last field of the row (row[-2]),
    its pmid attribute is the 0-based position of the row in sentRows.

    If pubAlg.debugMode is set, the file is msrNlpIn.tmp.txt, opened relative to
    the current directory. Otherwise it is a temp file created with
    pubGeneric.makeTempFile.

    Returns a tuple (fileObject, fileName). The file object has been flushed
    but is returned open.
    """
    if pubAlg.debugMode:
        tempFnameIn = "msrNlpIn.tmp.txt"
        ofh = open(tempFnameIn, "w")
    else:
        ofh, tempFnameIn = pubGeneric.makeTempFile("msrNlpIn", ".txt")

    # write sentences to temp file
    logging.info("Writing input sentences to %s" % tempFnameIn)
    for i, row in enumerate(sentRows):
        text = row[-2]
        ofh.write('<txt pmid="%d">\n' % i)
        ofh.write("%s\n" % text)
        ofh.write('</txt>\n\n')
    # flush so the data is on disk for readers that open the file by name
    ofh.flush()

    # only read the whole file back if the debug message will really be emitted,
    # and close the extra read handle afterwards
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        with open(tempFnameIn) as ifh:
            logging.debug("in file: %s" % ifh.read())
    return ofh, tempFnameIn
Example #6
0
 def blatFasta(self, db, faFname, params=[]):
     """ blat a fasta file against a db and write the filtered psl to a temp file.

     db      -- key into self.blatServers (gives server and port) and name of
                the subdirectory of self.seqDir that is passed to gfClient
     faFname -- name of the fasta file with the query sequences
     params  -- additional command line arguments, appended to the gfClient
                call; the list is only read, never modified

     Runs the shell pipeline gfClient | sort | pslCDnaFilter through
     maxCommon.runCommand.

     returns a (file, filename) of temp file, the file object is still open
     """
     seqDir = join(self.seqDir, db)
     logging.debug("Blatting %s against %s" % (faFname, seqDir))
     server, port = self.blatServers[db]
     tmpFh, tmpFname = pubGeneric.makeTempFile("blatOut.")

     cmd1 = ["gfClient", server, str(port), seqDir, faFname, "stdout", "-nohead"]
     cmd1.extend(params)
     # sort on field 10, which is the query name column in psl format;
     # presumably pslCDnaFilter needs the alignments grouped by query - TODO confirm
     cmd2 = ["sort", "-k10,10 "]
     cmd3 = ["pslCDnaFilter", "stdin", tmpFname,
             "-globalNearBest=0", "-filterWeirdOverlapped", "-ignoreIntrons"]
     cmd = "|".join([" ".join(cmd1), " ".join(cmd2), " ".join(cmd3)])

     try:
         maxCommon.runCommand(cmd)
     except Exception:
         # nobody gets the handle if the pipeline fails, so close it here
         tmpFh.close()
         raise
     return tmpFh, tmpFname
Example #7
0
def loadImages(con, artDict, fileDict):
    """ load all relevant images from PDF into an sqlite db

    con      -- db connection; execute() and commit() are called on it and it
                needs a table 'img' that accepts the 18 values inserted below
    artDict  -- article metadata; title, authors, journal, year, pmid, doi and
                pmcId are optional and default to ""
    fileDict -- file data; content, fileType, desc, url and fileId are required,
                a missing key raises KeyError

    The PDF content is written to a temp file, getImages() is run on it and one
    row per returned image is inserted and committed. Nothing is inserted when
    getImages() returns None. Returns None.
    """
    pdfData = fileDict["content"]
    tmpFile, tmpPdfName = pubGeneric.makeTempFile("pubImgLoad", ".pdf")
    try:
        tmpFile.write(pdfData)
        # flush, as getImages() gets only the file name and not this handle
        tmpFile.flush()

        metaFields = [
            artDict.get("title", ""),
            artDict.get("authors", ""),
            artDict.get("journal", ""),
            artDict.get("year", ""),
            artDict.get("pmid", ""),
            artDict.get("doi", ""),
            artDict.get("pmcId", ""),
            fileDict["fileType"],
            fileDict["desc"],
            fileDict["url"],
            str(fileDict["fileId"]),
        ]

        imgRows = getImages(tmpPdfName)
        # None means there is nothing to load for this PDF
        if imgRows is None:
            return

        for imgId, isThumbnail, width, height, md5, pngData in imgRows:
            logging.debug("Adding image %d" % imgId)
            size = len(pngData)
            # buffer() wraps the png data, presumably so sqlite stores it as a blob - TODO confirm
            fileInfo = metaFields + [imgId, isThumbnail, width, height, md5, size, buffer(pngData)]
            qStr = ",".join(["?"] * len(fileInfo))
            con.execute('INSERT INTO img values (%s)' % qStr, fileInfo)
            # committed per image, so images added before a failure stay in the db
            con.commit()
    finally:
        # also runs on errors and on the early return, so the handle is never left open
        tmpFile.close()  # = delete
Example #8
0
 def _writeSeqsToFile(self, seqIter):
     """ write the (seqId, seq) pairs from seqIter to a temp file, one
     ">seqId" line followed by one sequence line per pair.

     returns a (file, filename) tuple, the file is flushed but still open
     """
     faFh, faName = pubGeneric.makeTempFile(prefix="findGenes", suffix=".fa")
     recordFmt = ">%s\n%s\n"
     for seqId, seq in seqIter:
         faFh.write(recordFmt % (seqId, seq))
     # flush so the records are on disk for readers that open the file by name
     faFh.flush()
     return faFh, faName