def makeLocalTempFile():
    " create tmp file on local harddisk, delete on program exit "
    fd, tmpOutFname = tempfile.mkstemp(dir=pubConf.getTempDir(), prefix="pubRun", suffix=".tab")
    os.close(fd)
    logging.debug("local temporary file is %s" % tmpOutFname)
    maxCommon.delOnExit(tmpOutFname)
    return tmpOutFname
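# Usage sketch (illustrative, not part of the original module): stage tab-separated
# rows in a local temp file before copying them to their final destination. The
# helper name and the row format are assumptions made only for this example.
def _exampleWriteChunkLocally(rows):
    tmpOutFname = makeLocalTempFile()
    ofh = open(tmpOutFname, "w")
    for row in rows:
        ofh.write("\t".join(row) + "\n")
    ofh.close()
    # no explicit cleanup needed: maxCommon.delOnExit() removes the file at program exit
    return tmpOutFname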
def __init__(self, fileDataFilename):
    self.articlesWritten = 0
    self.filesWritten = 0
    self.tempDir = pubConf.getTempDir()

    # very convoluted way to find output filenames
    # needed because of parasol
    outDir = os.path.dirname(fileDataFilename)
    fileDataBasename = os.path.basename(fileDataFilename)
    chunkId = fileDataBasename.split(".")[0]
    self.fileBaseName = chunkId+".files"
    articleBaseName = chunkId+".articles"
    refBaseName = chunkId+".refs"

    self.finalArticleName = join(outDir, articleBaseName+".gz")
    self.finalFileDataName = join(outDir, self.fileBaseName+".gz")

    # setup reference table handle
    self.refDir = join(outDir, "refs")
    self.finalRefFname = join(self.refDir, refBaseName+".gz")
    self.tempRefName = join(self.tempDir, refBaseName)
    self.refFh = None

    # setup file and article table handles
    fileFname = os.path.join(self.tempDir, self.fileBaseName)
    self.fileFh = codecs.open(fileFname, "w", encoding="utf8")
    self.fileFh.write("#"+"\t".join(fileDataFields)+"\n")
    maxCommon.delOnExit(fileFname)

    articleFname = os.path.join(self.tempDir, articleBaseName)
    self.articleFh = codecs.open(articleFname, "w", encoding="utf8")
    self.articleFh.write("#"+"\t".join(articleFields)+"\n")
    maxCommon.delOnExit(articleFname)

    self.outFilename = os.path.join(outDir, fileDataBasename)
def runConverter(cmdLine, fileContent, fileExt, tempDir):
    """ create local temp files for input and output, write the data to the input file and
    run the command. The file can be supplied as a str in fileContent["content"] or
    alternatively as a pathname via 'locFname' """
    # create temp file
    fd, inFname = tempfile.mkstemp(suffix="."+fileExt, dir=tempDir, prefix="pubConvPmc.in.")
    logging.debug("Created %s" % inFname)
    maxCommon.delOnExit(inFname)
    inFile = os.fdopen(fd, "wb")
    inFile.write(fileContent)
    inFile.close()
    logging.debug("Created in temp file %s" % (inFname))

    fd, outFname = tempfile.mkstemp(suffix=".txt", dir=tempDir, prefix="pubConvPmc.out.")
    maxCommon.delOnExit(outFname)
    os.close(fd)
    logging.debug("Created out temp file %s" % (outFname))

    # allow %(name)s syntax in cmdLine string to use variables from pubConf
    cmdLine = cmdLine % pubConf.__dict__

    # build cmd line and run
    cmdLine = cmdLine.replace("$in", inFname)
    cmdLine = cmdLine.replace("$out", outFname)
    logging.debug("running "+cmdLine)

    skipFile = False
    stdout, stderr, ret = runCommandTimeout(cmdLine, bufSize=10000000, timeout=30)

    asciiData = None
    if ret==2 and "docx2txt" not in cmdLine:
        # docx2txt returns exit code 2 in some cases
        logging.error("stopped on errno 2: looks like you pressed ctrl-c")
        sys.exit(2)
    if ret!=0:
        logging.error("error %d occurred while executing %s" % (ret, cmdLine))
        printOut(stdout, stderr)
        skipFile = True
    if os.path.getsize(outFname)==0:
        logging.error("zero file size of output file after command %s" % (cmdLine))
        printOut(stdout, stderr)
        skipFile = True

    if not skipFile:
        asciiData = open(outFname).read()

    logging.debug("Removing %s" % inFname)
    os.remove(inFname)
    os.remove(outFname)

    asciiData = forceToUnicode(asciiData)

    if skipFile:
        return None
    else:
        return asciiData
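# Usage sketch (illustrative): convert a PDF to plain text via an external tool.
# runConverter replaces $in and $out with the temp file names it creates; the
# pdftotext command line and the wrapper name here are assumptions, not values
# taken from the original pubConf configuration.
def _examplePdfToText(pdfPath):
    pdfData = open(pdfPath, "rb").read()
    text = runConverter("pdftotext -enc UTF-8 $in $out", pdfData, "pdf", pubConf.getTempDir())
    # returns None if the converter failed or produced an empty output file
    return text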
def getImages(pdfName):
    """ returns a list of tuples (imgId (int), isThumbnail (int), width, height, md5sum,
    PNGBinarydataBlob) extracted from pdfName. returns two tuples per image, one is the
    original, one is the thumbnail. """
    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        x, y = pbmSize(open(fname))
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)
        pngBlob = open(outFname).read()
        md5Str = makeMd5(pngBlob)
        data.append( (imgId, 0, x, y, md5Str, pngBlob) )

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)
        x, y = pngDimensions(thumbFName)
        pngBlob = open(thumbFName).read()
        md5Str = makeMd5(pngBlob)
        data.append( (imgId, 1, x, y, md5Str, pngBlob) )
        imgId += 1

    shutil.rmtree(tempDir)
    maxCommon.ignoreOnExit(tempDir)
    return data
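# Usage sketch (illustrative): split the tuples returned by getImages into originals
# and thumbnails. The helper name and the grouping are assumptions based on the
# docstring above, not code from the original module.
def _exampleSplitImages(pdfName):
    originals, thumbnails = [], []
    for imgId, isThumbnail, width, height, md5Str, pngBlob in getImages(pdfName):
        if isThumbnail:
            thumbnails.append((imgId, width, height, md5Str, pngBlob))
        else:
            originals.append((imgId, width, height, md5Str, pngBlob))
    return originals, thumbnails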
def startRedis(dbFname):
    """ starts redis on the current server as a daemon. Creates status files with the
    filenames dbFname+".pid" and dbFname+".host". Returns the hostname and the port.

    >>> import pubGeneric
    >>> pubGeneric.setupLogging(__file__, None)
    >>> h, p = startRedis("/tmp/test.tab.gz")
    >>> r = redis.Redis(port=p)
    >>> r.set("hello", "world")
    True
    >>> r.get("hello")
    'world'
    >>> r.get("world")
    >>> r.shutdown()
    """
    dbFname = abspath(dbFname)
    pidFname = dbFname + ".pid"
    port = findFreePort()
    dirName = dirname(dbFname)
    baseName = basename(dbFname) + ".rdb"

    hostFname = dbFname + ".host"
    hostname = socket.gethostbyname("localhost")
    hostDesc = hostname + ":" + str(port)
    open(hostFname, "w").write(hostDesc)
    logging.info("Wrote redis host info %s to %s" % (hostDesc, hostFname))

    maxCommon.delOnExit(hostFname)
    maxCommon.delOnExit(pidFname)
    atexit.register(shutdownRedisServers)
    global redisPorts
    redisPorts.append(port)

    cmd = ["redis-server", "--daemonize", "yes", "--pidfile", pidFname,
        "--port", str(port), "--rdbchecksum", "no", "--dir", dirName,
        "--dbfilename", baseName, "--maxmemory", "200gb"]
    logging.info("Starting up redis server on localhost")
    maxCommon.runCommand(cmd)

    # wait until startup is complete
    redisStart = True
    while redisStart:
        try:
            r = redis.Redis(port=port)
            dbSize = r.dbsize()
            redisStart = False
        except redis.ConnectionError:
            logging.info("Waiting for 1 sec for redis startup completion")
            time.sleep(1)

    logging.info("Redis startup completed, dbSize=%d" % dbSize)
    return "localhost", port
def __init__(self, fname, singleProcess=False, newDb=False, tmpDir=None, onlyKey=False,
             compress=False, keyIsInt=False, eightBit=False, onlyUnique=False):
    self.onlyUnique = onlyUnique
    self.compress = compress
    self.batchMaxSize = 100000
    self.batch = []
    self.finalDbName = None
    self.onlyKey = onlyKey
    self.dbName = "%s.sqlite" % fname
    if newDb and isfile(self.dbName):
        os.remove(self.dbName)

    isolLevel = None
    self.singleProcess = singleProcess
    if singleProcess:
        isolLevel = "exclusive"

    self.con = None
    if not os.path.isfile(self.dbName) and tmpDir!=None:
        # create a new temp db on ramdisk
        self.finalDbName = self.dbName
        #self.dbName = join(pubConf.getFastTempDir(), basename(self.dbName))
        self.dbName = join(tmpDir, basename(self.dbName))
        logging.debug("Creating new temp db on ramdisk %s" % self.dbName)
        if isfile(self.dbName):
            os.remove(self.dbName)
        maxCommon.delOnExit(self.dbName) # make sure this is deleted on exit

    try:
        self.con = sqlite3.connect(self.dbName)
    except sqlite3.OperationalError:
        logging.error("Could not open %s" % self.dbName)
        raise
    logging.debug("Opening sqlite DB %s" % self.dbName)

    keyType = "TEXT"
    if keyIsInt:
        keyType = "INT"
    if onlyKey:
        self.con.execute("CREATE TABLE IF NOT EXISTS data (key %s PRIMARY KEY)" % keyType)
    else:
        self.con.execute("CREATE TABLE IF NOT EXISTS data (key %s PRIMARY KEY,value BLOB)" % keyType)
    self.con.commit()

    self.cur = self.con
    if singleProcess:
        self.cur.execute("PRAGMA synchronous=OFF") # recommended by
        self.cur.execute("PRAGMA count_changes=OFF") # http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/
        self.cur.execute("PRAGMA cache_size=8000000") # http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html
        self.cur.execute("PRAGMA journal_mode=OFF") # http://www.sqlite.org/pragma.html#pragma_journal_mode
        self.cur.execute("PRAGMA temp_store=memory")
    self.con.commit()

    if eightBit:
        self.con.text_factory = str
def indexTsv(zipFname, tsvName, outFname):
    """ unzip a zipfile, recompress all the tsvs inside with gzip and create an
    .index.gz for them """
    #def indexTsv(zipFname, tsvName, outFname, bgzipPath):
    # extract to local disk
    tmpDir = pubGeneric.makeTempDir("bingData")
    maxCommon.delOnExit(tmpDir)
    logging.info("Extracting to %s" % tmpDir)
    cmd = ["unzip", "-d", tmpDir, zipFname]
    maxCommon.runCommand(cmd)

    tempFname = join(tmpDir, tsvName)
    logging.info("Indexing %s to %s" % (tempFname, outFname))

    # index lines
    ofh = gzip.open(outFname, "w")
    ifh = open(tempFname, "rb")
    offset = 0
    # the file iterator does not work with tell()!!
    #for line in ifh:
    while True:
        line = ifh.readline()
        if line=="":
            break
        url = line[0:line.find("\t")]
        ofh.write("%s\t%d\n" % (url, offset))
        #logging.debug("url %s, offset %d" % (url, offset))
        offset = ifh.tell()
    ofh.close()

    # re-compress with gzip
    tmpFnames = glob.glob(join(tmpDir, "*.tsv"))
    assert(len(tmpFnames)==1)
    tmpFname = tmpFnames[0]
    zipDir = dirname(zipFname)
    finalFname = join(zipDir, tsvName+".gz")
    logging.info("Compressing to %s" % finalFname)
    #cmd = "%s %s -c > %s" % (bgzipPath, tmpFname, finalFname)
    cmd = "gzip %s -c > %s" % (tmpFname, finalFname)
    maxCommon.runCommand(cmd)
    shutil.rmtree(tmpDir)
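# Usage sketch (illustrative): load the URL -> byte-offset index written by indexTsv
# back into a dict. The helper name is an assumption; the line format follows the
# "%s\t%d\n" records written above.
def _exampleLoadIndex(indexFname):
    offsets = {}
    for line in gzip.open(indexFname):
        url, offset = line.rstrip("\n").split("\t")
        offsets[url] = int(offset)
    return offsets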
def writeRefs(self, artDict, refRows):
    " write references to table in refs/ subdir "
    if self.refFh==None:
        # lazily open ref file, add headers
        if not os.path.isdir(self.refDir):
            try:
                os.makedirs(self.refDir)
            except OSError:
                logging.info("makedir %s failed, probably just race condition" % self.refDir)
        #self.refFh = tempfile.NamedTemporaryFile(dir=self.tempDir, suffix=".gz", prefix=self.fileBaseName+".")
        self.refFh = open(self.tempRefName, "w")
        logging.info("Created tempfile for refs %s" % self.refFh.name)
        maxCommon.delOnExit(self.tempRefName)

        refHeaders = copy.copy(refArtFields)
        refHeaders.extend(refFields)
        self.refFh.write("#"+"\t".join(refHeaders)+"\n")

    # prepare a list of article IDs of the source article
    srcArtFields = []
    for artField in refArtFields:
        if artField=="artDoi":
            artField="doi"
        if artField=="artPmid":
            artField="pmid"
        artVal = artDict[artField]
        srcArtFields.append(artVal)
    srcPrefix = "\t".join(srcArtFields)+"\t"

    # output all references
    logging.debug("Writing %d references for article %s" % (len(refRows), artDict["externalId"]))
    for ref in refRows:
        # output the source article IDs
        self.refFh.write(srcPrefix.encode("utf8"))
        # output the reference article fields
        self.refFh.write(u'\t'.join(ref).encode("utf8"))
        self.refFh.write("\n")
def getImages(pdfName):
    """ returns a list of tuples (imgId (int), isThumbnail (int), width, height, md5sum,
    PNGBinarydataBlob) extracted from pdfName. returns two tuples per image, one is the
    original, one is the thumbnail. """
    loadBlacklist()

    head = open(pdfName).read(30)
    if "<html" in head or "<HTML" in head:
        logging.info("PDF %s is an HTML file, skipping" % pdfName)
        return None

    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        x, y = pbmSize(open(fname))
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)
        pngBlob = open(outFname).read()
        md5Str = makeMd5(pngBlob)
        logging.debug("image md5 %s, blacklist sample %s" % (md5Str, list(md5Blacklist)[:10]))
        if md5Str in md5Blacklist:
            logging.debug("Image MD5 is blacklisted")
            continue
        data.append((imgId, 0, x, y, md5Str, pngBlob))

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)
        x, y = pngDimensions(thumbFName)
        pngBlob = open(thumbFName).read()
        md5Str = makeMd5(pngBlob)
        data.append((imgId, 1, x, y, md5Str, pngBlob))
        imgId += 1

    shutil.rmtree(tempDir)
    maxCommon.ignoreOnExit(tempDir)
    return data
def getFastUniqueTempFname():
    " create a unique temp filename on the ramdisk, delete it on program exit "
    tempFname = tempfile.mktemp(dir=pubConf.getFastTempDir())
    maxCommon.delOnExit(tempFname)
    return tempFname
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ get files from inIndexFile, parse Xml, write everything to outfile in ascii format """
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u'\xbf' in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename=="":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename+":"+filename

        if pdfString==None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue

        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue
        articleData = parseXml(xmlTree, articleData)

        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"] = zipFilename+"/"+filename
        articleData["externalId"] = articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")
        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()