def __init__(self, fileDataFilename):
    """
    Open the temporary article and file tables for one output chunk.

    The chunk id is the part of fileDataFilename's basename before the first
    dot. Both tables are opened as utf8 files in the pubtools temp dir and
    get a '#'-prefixed, tab-separated header line. The names of the final
    gzipped outputs (next to fileDataFilename) are only recorded here.
    """
    self.articlesWritten = 0
    self.filesWritten = 0
    tmpDirName = pubConf.getTempDir()

    # the output names are derived from the requested filename, the
    # roundabout way is needed because of parasol
    targetDir, requestedName = os.path.split(fileDataFilename)
    chunkPrefix = requestedName.partition(".")[0]
    filesTableName = chunkPrefix + ".files"
    articlesTableName = chunkPrefix + ".articles"

    self.finalArticleName = join(targetDir, articlesTableName + ".gz")
    self.finalFileDataName = join(targetDir, filesTableName + ".gz")

    # file table: temp file plus header line
    filesTablePath = os.path.join(tmpDirName, filesTableName)
    filesHeader = "#" + "\t".join(fileDataFields) + "\n"
    self.fileFh = codecs.open(filesTablePath, "w", encoding="utf8")
    self.fileFh.write(filesHeader)

    # article table: temp file plus header line
    articlesTablePath = os.path.join(tmpDirName, articlesTableName)
    articlesHeader = "#" + "\t".join(articleFields) + "\n"
    self.articleFh = codecs.open(articlesTablePath, "w", encoding="utf8")
    self.articleFh.write(articlesHeader)

    self.outFilename = os.path.join(targetDir, requestedName)
def makeLocalTempFile():
    """
    Create an empty 'pubRun*.tab' file in the pubtools temp dir and return
    its path. The path is registered with maxCommon.delOnExit.
    """
    localTmpDir = pubConf.getTempDir()
    handle, localPath = tempfile.mkstemp(suffix=".tab", prefix="pubRun", dir=localTmpDir)
    # only the name is needed, callers open the file themselves
    os.close(handle)
    logging.debug("local temporary file is %s" % localPath)
    maxCommon.delOnExit(localPath)
    return localPath
def getFromCache(fname):
    """
    Given a network path, try to find a copy of this file in the "fileCache"
    subdirectory of the pubtools temp dir. Return the path of the local copy.
    If there is no copy yet, copy the file over first and return the path.

    Several processes may call this for the same file at the same time: each
    one copies into its own temporary file and only the first one to finish
    moves its copy to the final name, the others discard theirs.
    """
    locCacheDir = join(pubConf.getTempDir(), "fileCache")
    createDirRace(locCacheDir)
    locPath = join(locCacheDir, basename(fname))
    logging.debug("Getting a local cache path for %s" % fname)
    if isfile(locPath):
        return locPath

    # stagger concurrent jobs a little, then check again so that a copy that
    # appeared in the meantime is used instead of copying a second time
    time.sleep(random.randint(1, 3) + random.random())
    if isfile(locPath):
        return locPath

    # mkstemp creates the file atomically, unlike mktemp which only returns a
    # name that someone else may take. The temp file lives in the cache dir
    # itself so the final move is a rename on the same filesystem and a
    # half-copied file is never visible under the final name.
    tmpFd, locTmpName = tempfile.mkstemp(dir=locCacheDir, prefix=basename(fname), suffix=".tmp")
    os.close(tmpFd)
    try:
        # copy over to local temp name, takes a while
        logging.debug("Copying %s to %s" % (fname, locTmpName))
        shutil.copy(fname, locTmpName)

        # if another process copied it over by now: roll back
        if isfile(locPath):
            return locPath

        # move on local node to final name
        logging.debug("Moving %s to %s" % (locTmpName, locPath))
        shutil.move(locTmpName, locPath)
        return locPath
    finally:
        # still present after a roll back or a failed copy, gone after the move
        if isfile(locTmpName):
            os.remove(locTmpName)
def getFromCache(fname):
    """
    Given a network path, try to find a copy of this file in the "fileCache"
    subdirectory of the pubtools temp dir. Return the path of the local copy.
    If there is no copy yet, copy the file over first and return the path.

    Several processes may call this for the same file at the same time: each
    one copies into its own temporary file and only the first one to finish
    moves its copy to the final name, the others discard theirs.
    """
    locCacheDir = join(pubConf.getTempDir(), "fileCache")
    createDirRace(locCacheDir)
    locPath = join(locCacheDir, basename(fname))
    logging.debug("Getting a local cache path for %s" % fname)
    if isfile(locPath):
        return locPath

    # stagger concurrent jobs a little, then check again so that a copy that
    # appeared in the meantime is used instead of copying a second time
    time.sleep(random.randint(1, 3) + random.random())
    if isfile(locPath):
        return locPath

    # mkstemp creates the file atomically, unlike mktemp which only returns a
    # name that someone else may take. The temp file lives in the cache dir
    # itself so the final move is a rename on the same filesystem and a
    # half-copied file is never visible under the final name.
    tmpFd, locTmpName = tempfile.mkstemp(dir=locCacheDir, prefix=basename(fname), suffix=".tmp")
    os.close(tmpFd)
    try:
        # copy over to local temp name, takes a while
        logging.debug("Copying %s to %s" % (fname, locTmpName))
        shutil.copy(fname, locTmpName)

        # if another process copied it over by now: roll back
        if isfile(locPath):
            return locPath

        # move on local node to final name
        logging.debug("Moving %s to %s" % (locTmpName, locPath))
        shutil.move(locTmpName, locPath)
        return locPath
    finally:
        # still present after a roll back or a failed copy, gone after the move
        if isfile(locTmpName):
            os.remove(locTmpName)
def __init__(self, fileDataFilename):
    """
    Open the temporary article and file tables for one output chunk and
    record where the final outputs will go.

    The chunk id is the part of fileDataFilename's basename before the first
    dot. The article and file tables are opened as utf8 files in the pubtools
    temp dir, get a '#'-prefixed, tab-separated header line and are
    registered with maxCommon.delOnExit. The reference table is only
    prepared here: its paths are stored and its handle starts out as None.
    """
    self.articlesWritten = 0
    self.filesWritten = 0
    self.tempDir = pubConf.getTempDir()

    # the output names are derived from the requested filename, the
    # roundabout way is needed because of parasol
    targetDir, requestedName = os.path.split(fileDataFilename)
    chunkPrefix = requestedName.partition(".")[0]
    self.fileBaseName = chunkPrefix + ".files"
    articlesTableName = chunkPrefix + ".articles"
    refsTableName = chunkPrefix + ".refs"

    self.finalArticleName = join(targetDir, articlesTableName + ".gz")
    self.finalFileDataName = join(targetDir, self.fileBaseName + ".gz")

    # reference table: paths only, nothing is opened yet
    self.refDir = join(targetDir, "refs")
    self.finalRefFname = join(self.refDir, refsTableName + ".gz")
    self.tempRefName = join(self.tempDir, refsTableName)
    self.refFh = None

    # file table: temp file plus header line
    filesTablePath = os.path.join(self.tempDir, self.fileBaseName)
    filesHeader = "#" + "\t".join(fileDataFields) + "\n"
    self.fileFh = codecs.open(filesTablePath, "w", encoding="utf8")
    self.fileFh.write(filesHeader)
    maxCommon.delOnExit(filesTablePath)

    # article table: temp file plus header line
    articlesTablePath = os.path.join(self.tempDir, articlesTableName)
    articlesHeader = "#" + "\t".join(articleFields) + "\n"
    self.articleFh = codecs.open(articlesTablePath, "w", encoding="utf8")
    self.articleFh.write(articlesHeader)
    maxCommon.delOnExit(articlesTablePath)

    self.outFilename = os.path.join(targetDir, requestedName)
def makeTempDir(prefix, tmpDir=None):
    """
    Create a unique temp subdirectory named '<prefix>.<random>' and return
    its path.

    prefix: start of the new directory's name, a dot is appended to it.
    tmpDir: parent directory, defaults to the pubtools temp dir, which is
            usually located on a local disk or ramdisk. It is created if it
            does not exist yet.
    """
    if tmpDir is None:
        tmpDir = pubConf.getTempDir()

    # tolerate a missing parent directory, and another process creating it
    # at the same moment
    if not isdir(tmpDir):
        try:
            os.makedirs(tmpDir)
        except OSError:
            if not isdir(tmpDir):
                raise

    # mkdtemp picks the name and creates the directory in one atomic step,
    # so two concurrent callers can never end up with the same directory
    dirName = tempfile.mkdtemp(dir=tmpDir, prefix=prefix + ".")
    logging.debug("Created temporary dir %s" % dirName)
    return dirName
def getImages(pdfName):
    """
    Return a list of tuples (imgId (int), isThumbnail (int), width, height,
    md5sum, PNGBinarydataBlob) extracted from pdfName.

    Two tuples are returned per image, first the original (isThumbnail=0),
    then the thumbnail (isThumbnail=1), both with the same imgId. Images for
    which looksInteresting() is false are skipped.

    The external programs 'pdfimages' and 'convert' are run via
    maxCommon.runCommand in a temporary directory that is removed before
    returning.
    """
    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    # safety net: if anything below fails, the directory is registered for cleanup
    maxCommon.delOnExit(tempDir)

    outStem = join(tempDir, "img")
    # NOTE(review): pdfName is pasted into the command line without quoting;
    # confirm that runCommand cannot receive names with spaces or shell characters
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        # close the handle right away instead of leaking one per image
        with open(fname) as ppmFh:
            x, y = pbmSize(ppmFh)
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)

        # PNG is binary data: read it in binary mode
        with open(outFname, "rb") as pngFh:
            pngBlob = pngFh.read()
        md5Str = makeMd5(pngBlob)
        data.append((imgId, 0, x, y, md5Str, pngBlob))

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)

        x, y = pngDimensions(thumbFName)
        with open(thumbFName, "rb") as thumbFh:
            pngBlob = thumbFh.read()
        md5Str = makeMd5(pngBlob)
        data.append((imgId, 1, x, y, md5Str, pngBlob))

        imgId += 1

    # all went well: remove the directory now and take it off the exit list
    shutil.rmtree(tempDir)
    maxCommon.ignoreOnExit(tempDir)
    return data
def makeTempDir(prefix, tmpDir=None):
    """
    Create a unique temp subdirectory named '<prefix>.<random>' and return
    its path.

    prefix: start of the new directory's name, a dot is appended to it.
    tmpDir: parent directory, defaults to the pubtools temp dir, which is
            usually located on a local disk or ramdisk. It is created if it
            does not exist yet.
    """
    if tmpDir is None:
        tmpDir = pubConf.getTempDir()

    # tolerate a missing parent directory, and another process creating it
    # at the same moment
    if not isdir(tmpDir):
        try:
            os.makedirs(tmpDir)
        except OSError:
            if not isdir(tmpDir):
                raise

    # mkdtemp picks the name and creates the directory in one atomic step,
    # so two concurrent callers can never end up with the same directory
    dirName = tempfile.mkdtemp(dir=tmpDir, prefix=prefix + ".")
    logging.debug("Created temporary dir %s" % dirName)
    return dirName
def makeTempFile(prefix, suffix=".psl"):
    """
    Create a temp file named '<prefix>.<random><suffix>' in the pubtools temp
    dir, opened for writing in text mode.

    Returns a tuple (file object, name). The file is deleted automatically
    when the file object is closed or destructed, unless pubConf.debug is
    set: in debug mode the file is left on disk for inspection.
    """
    tmpDir = pubConf.getTempDir()
    keepFile = bool(pubConf.debug)
    # NamedTemporaryFile creates the file atomically in both modes, and the
    # same kind of object is returned whether or not debug mode is on
    tf = tempfile.NamedTemporaryFile(dir=tmpDir, prefix=prefix + ".", mode="w",
        suffix=suffix, delete=not keepFile)
    if keepFile:
        logging.debug("Created tempfile %s, debug-mode: no auto-deletion" % tf.name)
    return tf, tf.name
def toAscii(
    fileData,
    mimeType=None,
    maxBinFileSize=pubConf.maxBinFileSize,
    maxTxtFileSize=pubConf.maxTxtFileSize,
    minTxtFileSize=pubConf.minTxtFileSize,
):
    """
    Convert fileData["content"] to text and put the result back into the
    content field.

    The converter is looked up by file extension. The extension comes from
    the mime type (the mimeType argument if given, otherwise
    fileData["mimeType"]) via pubConf.MIMEMAP, or, failing that, from the
    extension of fileData["url"].

    Returns the fileData passed through dictToUnicode if successful,
    otherwise None: input bigger than maxBinFileSize, no converter for the
    extension, failed conversion, or output longer than maxTxtFileSize or
    shorter than minTxtFileSize.
    Returns only unicode strings (despite the name).
    """
    converterByExt = pubConf.getConverters()
    workDir = pubConf.getTempDir()
    rawContent = fileData["content"]

    # refuse overly big input before doing any work
    if len(rawContent) > maxBinFileSize:
        fileLabel = fileData["url"] + fileData["desc"] + fileData["fileId"] + fileData["articleId"]
        logging.warn("binary file size before conversion %d > %d, skipping file %s" %
            (len(rawContent), maxBinFileSize, fileLabel))
        return None

    sourceUrl = fileData["url"]

    # an explicit mimeType argument wins over the one stored with the file
    if mimeType is None and "mimeType" in fileData:
        storedMimeType = fileData["mimeType"]
        if storedMimeType is not None:
            mimeType = storedMimeType

    extension = None
    if mimeType:
        extension = pubConf.MIMEMAP.get(mimeType, None)
        logging.debug("File extension determined as %s" % extension)
    if extension is None:
        # fall back to the extension of the url, lowercased and without dots
        extension = os.path.splitext(sourceUrl)[1].lower().strip(".")

    if extension not in converterByExt:
        logging.debug("Could not convert file %s, no converter for extension %s" % (sourceUrl, extension))
        return None
    converterCmd = converterByExt[extension]

    # for "COPY" the content is used as it is
    if converterCmd != "COPY":
        if converterCmd in ("XMLTEXT", "NXMLTEXT"):
            logging.debug("stripping XML tags")
            if converterCmd == "NXMLTEXT":
                plainText = pubXml.stripXmlTags(rawContent, isNxmlFormat=True)
            else:
                plainText = pubXml.stripXmlTags(rawContent)
            if plainText is None:
                logging.warn("Could not convert xml to ascii")
                return None
            fileData["content"] = plainText
        else:
            plainText = runConverter(converterCmd, rawContent, extension, workDir)
            # give PDFs a second chance with the alternative converter
            if extension == "pdf" and (plainText is None or countBadChars(plainText) >= 10):
                logging.debug("No data or too many non printable characters in PDF, trying alternative program")
                converterCmd = converterByExt["pdf2"]
                plainText = runConverter(converterCmd, rawContent, extension, workDir)
            if plainText is None:
                return None
            fileData["content"] = removeBadChars(plainText)

    fileData = dictToUnicode(fileData)

    convertedSize = len(fileData["content"])
    if convertedSize > maxTxtFileSize:
        logging.warn("ascii file size after conversion too big, ignoring file")
        return None
    if convertedSize < minTxtFileSize:
        logging.warn("ascii file size after conversion too small, ignoring file")
        return None
    return fileData
def toAscii(fileData, mimeType=None, \
        maxBinFileSize=pubConf.maxBinFileSize, maxTxtFileSize=pubConf.maxTxtFileSize, \
        minTxtFileSize=pubConf.minTxtFileSize):
    """
    Convert fileData["content"] to text and put the result back into the
    content field.

    fileData is a dictionary. If it has a "locFname" key, the file is
    described in log messages as externalId:locFname, otherwise as
    url,desc,fileId,articleId. The converter is picked by the file extension
    that getFileExt() returns for fileData, locFname and mimeType.

    Returns the fileData passed through dictToUnicode if successful,
    otherwise None: input bigger than maxBinFileSize, no converter for the
    extension, failed conversion, or output longer than maxTxtFileSize or
    shorter than minTxtFileSize.
    Returns only unicode strings (despite the name).
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)

    # short description of the file for log messages
    if "locFname" in fileData:
        locFname = fileData["locFname"]
        fileDebugDesc = fileData["externalId"] + ":" + locFname
    else:
        locFname = None
        fileDebugDesc = ",".join([fileData["url"], fileData["desc"],
            fileData["fileId"], fileData["articleId"]])

    if fileSize > maxBinFileSize:
        logging.warning("binary file size before conversion %d > %d, skipping file %s" % \
            (fileSize, maxBinFileSize, fileDebugDesc))
        return None

    fileExt = getFileExt(fileData, locFname, mimeType)

    if fileExt not in converters:
        logging.debug("Could not convert file %s, no converter for extension %s" % \
            (fileDebugDesc, fileExt))
        return None
    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # fileData["content"] already contains ASCII text
        pass

    elif cmdLine == "XMLTEXT" or cmdLine == "NXMLTEXT":
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            # use the description built above: the "url" key is only read
            # when there is no locFname, so it may be missing for local files
            logging.debug("Could not convert xml to ascii, file %s" % fileDebugDesc)
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted pdf2text output and run second converter
        looksBroken = asciiData is None or len(asciiData) < minTxtFileSize \
            or countBadChars(asciiData) >= 10
        if fileExt == "pdf" and looksBroken:
            if "pdf2" in converters:
                logging.debug("No data or too many non printable characters in PDF, trying alternative program")
                cmdLine = converters["pdf2"]
                asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)
            else:
                # no alternative configured: go on with the first result
                logging.debug("no alternative pdf converter (pdf2) configured, file %s" % fileDebugDesc)

        if asciiData is None:
            logging.info("conversion failed for %s" % fileDebugDesc)
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    textSize = len(fileData["content"])
    if textSize > maxTxtFileSize:
        logging.info("ascii file size after conversion too big, ignoring file %s" % fileDebugDesc)
        return None

    if textSize < minTxtFileSize:
        logging.debug("ascii file size only %d bytes < %d, ignoring %s" % \
            (textSize, minTxtFileSize, fileDebugDesc))
        return None

    return fileData
def toAscii(fileData, mimeType=None, \
        maxBinFileSize=pubConf.maxBinFileSize, maxTxtFileSize=pubConf.maxTxtFileSize, \
        minTxtFileSize=pubConf.minTxtFileSize):
    """
    Convert fileData["content"] to text and put the result back into the
    content field.

    fileData is a dictionary. If it has a "locFname" key, the file is
    described in log messages as externalId:locFname, otherwise as
    url,desc,fileId,articleId. The converter is picked by the file extension
    that getFileExt() returns for fileData, locFname and mimeType.

    Returns the fileData passed through dictToUnicode if successful,
    otherwise None: input bigger than maxBinFileSize, no converter for the
    extension, failed conversion, or output longer than maxTxtFileSize or
    shorter than minTxtFileSize.
    Returns only unicode strings (despite the name).
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)

    # short description of the file for log messages
    if "locFname" in fileData:
        locFname = fileData["locFname"]
        fileDebugDesc = fileData["externalId"] + ":" + locFname
    else:
        locFname = None
        fileDebugDesc = ",".join([
            fileData["url"], fileData["desc"], fileData["fileId"],
            fileData["articleId"]
        ])

    if fileSize > maxBinFileSize:
        logging.warning("binary file size before conversion %d > %d, skipping file %s" % \
            (fileSize, maxBinFileSize, fileDebugDesc))
        return None

    fileExt = getFileExt(fileData, locFname, mimeType)

    if fileExt not in converters:
        logging.debug("Could not convert file %s, no converter for extension %s" % \
            (fileDebugDesc, fileExt))
        return None
    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # fileData["content"] already contains ASCII text
        pass

    elif cmdLine == "XMLTEXT" or cmdLine == "NXMLTEXT":
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            # use the description built above: the "url" key is only read
            # when there is no locFname, so it may be missing for local files
            logging.debug("Could not convert xml to ascii, file %s" % fileDebugDesc)
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted pdf2text output and run second converter
        looksBroken = asciiData is None or len(asciiData) < minTxtFileSize \
            or countBadChars(asciiData) >= 10
        if fileExt == "pdf" and looksBroken:
            if "pdf2" in converters:
                logging.debug(
                    "No data or too many non printable characters in PDF, trying alternative program"
                )
                cmdLine = converters["pdf2"]
                asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)
            else:
                # no alternative configured: go on with the first result
                logging.debug("no alternative pdf converter (pdf2) configured, file %s" % fileDebugDesc)

        if asciiData is None:
            logging.info("conversion failed for %s" % fileDebugDesc)
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    textSize = len(fileData["content"])
    if textSize > maxTxtFileSize:
        logging.info(
            "ascii file size after conversion too big, ignoring file %s" % fileDebugDesc)
        return None

    if textSize < minTxtFileSize:
        logging.debug("ascii file size only %d bytes < %d, ignoring %s" % \
            (textSize, minTxtFileSize, fileDebugDesc))
        return None

    return fileData
def getImages(pdfName):
    """
    Return a list of tuples (imgId (int), isThumbnail (int), width, height,
    md5sum, PNGBinarydataBlob) extracted from pdfName.

    Two tuples are returned per image, first the original (isThumbnail=0),
    then the thumbnail (isThumbnail=1), both with the same imgId. Images for
    which looksInteresting() is false and images whose MD5 is in
    md5Blacklist are skipped.

    Returns None, not a list, if the first 30 characters of the file contain
    an "<html" or "<HTML" tag.

    The external programs 'pdfimages' and 'convert' are run via
    maxCommon.runCommand in a temporary directory that is removed before
    returning.
    """
    loadBlacklist()

    # some "PDFs" are really HTML pages: look at the start of the file
    with open(pdfName) as pdfFh:
        head = pdfFh.read(30)
    if "<html" in head or "<HTML" in head:
        logging.info("PDF %s is an HTML file, skipping" % pdfName)
        return None

    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    # safety net: if anything below fails, the directory is registered for cleanup
    maxCommon.delOnExit(tempDir)

    outStem = join(tempDir, "img")
    # NOTE(review): pdfName is pasted into the command line without quoting;
    # confirm that runCommand cannot receive names with spaces or shell characters
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        # close the handle right away instead of leaking one per image
        with open(fname) as ppmFh:
            x, y = pbmSize(ppmFh)
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)

        # PNG is binary data: read it in binary mode
        with open(outFname, "rb") as pngFh:
            pngBlob = pngFh.read()
        md5Str = makeMd5(pngBlob)
        # goes to the debug log, not to stdout
        logging.debug("Image MD5 is %s" % md5Str)
        if md5Str in md5Blacklist:
            logging.debug("Image MD5 is blacklisted")
            continue

        data.append((imgId, 0, x, y, md5Str, pngBlob))

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)

        x, y = pngDimensions(thumbFName)
        with open(thumbFName, "rb") as thumbFh:
            pngBlob = thumbFh.read()
        md5Str = makeMd5(pngBlob)
        data.append((imgId, 1, x, y, md5Str, pngBlob))

        imgId += 1

    # all went well: remove the directory now and take it off the exit list
    shutil.rmtree(tempDir)
    maxCommon.ignoreOnExit(tempDir)
    return data