def allResults(self):
    """ given a list of rows with sentences as their -1 field, run these
    through the MSR pipeline.

    Writes the sentences to a temp file, runs runMsr.sh on it, then joins
    each MSR output row back onto its input row (looked up via the
    chunkSentId field). Returns the list of joined rows.
    """
    tstart = datetime.now()
    inFh, tempFnameIn = writeMsrIn(self.rows)
    logging.info("Running MSR pipeline on %d sentences" % len(self.rows))
    ofh2, tempFnameOut = pubGeneric.makeTempFile("msrNlpOut", ".txt")
    try:
        cmd = "%s/runMsr.sh %s %s" % (msrDir, tempFnameIn, tempFnameOut)
        maxCommon.runCommand(cmd)
        joinedRows = []
        logging.info("Parsing MSR output")
        for msrRow in parseMsrOut(tempFnameOut):
            # chunkSentId indexes back into the input row list
            textRow = list(self.rows[int(msrRow.chunkSentId)])
            textRow.extend(msrRow)
            joinedRows.append(textRow)
    finally:
        # close (and thereby delete) the temp files even if the external
        # pipeline or the parser raises -- the original leaked them on error
        inFh.close()
        ofh2.close()
    logging.debug("results " + repr(joinedRows))
    tend = datetime.now()
    secs = (tend - tstart).seconds
    logging.info("msr runtime: %d" % secs)
    return joinedRows
def loadImages(con, artDict, fileDict):
    " load all relevant images from PDF into an sqlite db "
    pdfData = fileDict["content"]
    tmpFile, tmpPdfName = pubGeneric.makeTempFile("pubImgLoad", ".pdf")
    tmpFile.write(pdfData)
    tmpFile.flush()

    imgId = 0
    # article-level metadata, defaulting to "" when absent
    title = artDict.get("title", "")
    authors = artDict.get("authors", "")
    journal = artDict.get("journal", "")
    year = artDict.get("year", "")
    pmid = artDict.get("pmid", "")
    doi = artDict.get("doi", "")
    pmcId = artDict.get("pmcId", "")
    # file-level metadata: these keys are required
    fileType = fileDict["fileType"]
    desc = fileDict["desc"]
    url = fileDict["url"]
    fileId = str(fileDict["fileId"])

    # getImages returns None when the PDF cannot be parsed; bail out then
    # instead of crashing on iteration (matches the other loadImages variant)
    imgRows = getImages(tmpPdfName)
    if imgRows is None:
        tmpFile.close()
        return

    for imgId, isThumbnail, width, height, md5, pngData in imgRows:
        logging.debug("Adding image %d" % imgId)
        size = len(pngData)
        fileInfo = [ title, authors, journal, year, pmid, doi, pmcId, fileType, \
            desc, url, fileId, imgId, isThumbnail, width, height, md5, size, buffer(pngData) ]
        # parameterized insert, one "?" placeholder per column
        qStr = ",".join( (["?"] * len(fileInfo)) )
        con.execute('INSERT INTO img values (%s)' % qStr, fileInfo)
    con.commit()
    tmpFile.close() # = delete
def allResults(self):
    """ given a list of rows with sentences as their -1 field, run these through the MSR pipeline """
    startTime = datetime.now()
    inFile, inFname = writeMsrIn(self.rows)
    logging.info("Running MSR pipeline on %d sentences" % len(self.rows))
    outFile, outFname = pubGeneric.makeTempFile("msrNlpOut", ".txt")
    maxCommon.runCommand("%s/runMsr.sh %s %s" % (msrDir, inFname, outFname))

    # join every MSR result row onto the input row it came from
    results = []
    logging.info("Parsing MSR output")
    for outRow in parseMsrOut(outFname):
        merged = list(self.rows[int(outRow.chunkSentId)])
        merged.extend(outRow)
        results.append(merged)

    inFile.close()
    outFile.close()
    logging.debug("results " + repr(results))
    elapsed = (datetime.now() - startTime).seconds
    logging.info("msr runtime: %d" % elapsed)
    return results
def writeMsrIn(sentRows):
    """ write the sentences (field -2 of each row) to a temp file in the
    simple XML format expected by the MSR pipeline, one <txt> element per
    sentence, with the row index as the pmid attribute.

    Returns (fileObject, fileName); the file object is returned open so the
    temporary file is not deleted before the pipeline has consumed it.
    """
    if pubAlg.debugMode:
        # fixed filename in debug mode so the input can be inspected later
        tempFnameIn = "msrNlpIn.tmp.txt"
        ofh = open(tempFnameIn, "w")
    else:
        ofh, tempFnameIn = pubGeneric.makeTempFile("msrNlpIn", ".txt")

    # write sentences to temp file
    logging.info("Writing input sentences to %s" % tempFnameIn)
    for i, row in enumerate(sentRows):
        # NOTE(review): sentence is taken from field -2 here, while the
        # caller's docstring says field -1 -- confirm which is correct
        text = row[-2]
        ofh.write('<txt pmid="%d">\n' % i)
        ofh.write("%s\n" % text)
        ofh.write('</txt>\n\n')
    ofh.flush()

    # only dump the whole file into the log when debug logging is active;
    # the original read it unconditionally and leaked the file handle
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        dbgFh = open(tempFnameIn)
        logging.debug("in file: %s" % dbgFh.read())
        dbgFh.close()
    return ofh, tempFnameIn
def blatFasta(self, db, faFname, params=None):
    """ blat a fasta file against a db: pipe gfClient output through sort
    and pslCDnaFilter into a temporary psl file.

    params is an optional list of extra gfClient command line options.
    Returns a (fileObject, fileName) tuple for the temporary psl file.
    """
    # params=None instead of a mutable default list (shared-state pitfall)
    if params is None:
        params = []
    seqDir = join(self.seqDir, db)
    logging.debug("Blatting %s against %s" % (faFname, seqDir))
    server, port = self.blatServers[db]
    tmpFh, tmpFname = pubGeneric.makeTempFile("blatOut.")

    cmd1 = ["gfClient", server, str(port), seqDir, faFname, "stdout", "-nohead"]
    cmd1.extend(params)
    cmd2 = ["sort", "-k10,10 "]
    cmd3 = ["pslCDnaFilter", "stdin", tmpFname,\
        "-globalNearBest=0", "-filterWeirdOverlapped", "-ignoreIntrons"]
    # assemble a single shell pipeline string
    cmds = []
    cmds.append(" ".join(cmd1))
    cmds.append(" ".join(cmd2))
    cmds.append(" ".join(cmd3))
    cmd = "|".join(cmds)
    maxCommon.runCommand(cmd)
    return tmpFh, tmpFname
def loadImages(con, artDict, fileDict):
    " load all relevant images from PDF into an sqlite db "
    pdfData = fileDict["content"]
    tmpFh, tmpPdfName = pubGeneric.makeTempFile("pubImgLoad", ".pdf")
    tmpFh.write(pdfData)
    tmpFh.flush()

    imgId = 0
    # article-level metadata, empty string when a key is missing
    artFields = [artDict.get(key, "") for key in
                 ("title", "authors", "journal", "year", "pmid", "doi", "pmcId")]
    # file-level metadata: these keys are required
    fileFields = [fileDict["fileType"], fileDict["desc"], fileDict["url"],
                  str(fileDict["fileId"])]

    # getImages returns None when the PDF could not be parsed
    imgRows = getImages(tmpPdfName)
    if imgRows is None:
        tmpFh.close()
        return

    for imgId, isThumbnail, width, height, md5, pngData in imgRows:
        logging.debug("Adding image %d" % imgId)
        rowVals = artFields + fileFields + \
            [imgId, isThumbnail, width, height, md5, len(pngData), buffer(pngData)]
        placeholders = ",".join(["?"] * len(rowVals))
        con.execute('INSERT INTO img values (%s)' % placeholders, rowVals)
    con.commit()
    tmpFh.close() # = delete
def _writeSeqsToFile(self, seqIter):
    """ dump (seqId, seq) pairs to a temporary fasta file;
    returns the open file object and its name """
    faFh, faFname = pubGeneric.makeTempFile(prefix="findGenes", suffix=".fa")
    records = [">%s\n%s\n" % (seqId, seq) for seqId, seq in seqIter]
    faFh.write("".join(records))
    faFh.flush()
    return faFh, faFname