Example #1
0
    def __init__(self, fileDataFilename):
        """ Open the temporary file and article tables for one output chunk.

        fileDataFilename: path of the file-data output file. Its directory is
        used for the final output names, and the part of its basename before
        the first dot is used as the chunk ID.

        Both tables are opened for writing as utf8 in the pubConf temp
        directory and get a "#"-prefixed, tab-separated header line. The names
        of the final .gz files are only recorded on the object here.
        """
        self.articlesWritten = 0
        self.filesWritten = 0
        localTmpDir = pubConf.getTempDir()

        # roundabout way of deriving all output names from the one filename
        # we are handed; needed because of parasol
        targetDir, dataBasename = os.path.split(fileDataFilename)
        chunk = dataBasename.partition(".")[0]
        filesTable = chunk + ".files"
        articlesTable = chunk + ".articles"

        # final, compressed outputs next to fileDataFilename
        self.finalArticleName = join(targetDir, articlesTable + ".gz")
        self.finalFileDataName = join(targetDir, filesTable + ".gz")

        # uncompressed working copy of the files table
        filesTmpPath = os.path.join(localTmpDir, filesTable)
        self.fileFh = codecs.open(filesTmpPath, "w", encoding="utf8")
        filesHeader = "\t".join(fileDataFields)
        self.fileFh.write("#" + filesHeader + "\n")

        # uncompressed working copy of the articles table
        articlesTmpPath = os.path.join(localTmpDir, articlesTable)
        self.articleFh = codecs.open(articlesTmpPath, "w", encoding="utf8")
        articlesHeader = "\t".join(articleFields)
        self.articleFh.write("#" + articlesHeader + "\n")

        self.outFilename = os.path.join(targetDir, dataBasename)
Example #2
0
def makeLocalTempFile():
    """ Create an empty "pubRun*.tab" file in the pubtools temp dir and return its path.

    The path is registered with maxCommon.delOnExit (presumably so that it is
    removed when the program exits -- see maxCommon).
    """
    localTmpDir = pubConf.getTempDir()
    handle, localPath = tempfile.mkstemp(suffix=".tab", prefix="pubRun", dir=localTmpDir)
    # only the name is handed back, so release the descriptor right away
    os.close(handle)
    logging.debug("local temporary file is %s" % localPath)
    maxCommon.delOnExit(localPath)
    return localPath
Example #3
0
def getFromCache(fname):
    """ Given a network path, try to find a copy of this file on the local temp disk of a cluster node.
    Return the path on the local disk. If there is no copy yet, copy it over and return the path.

    fname: path of the file to cache. Only its basename is used as the name of
           the cached copy, so files with the same basename share one entry.

    The cache is the directory "fileCache" inside the pubConf temp directory.
    A missing file is first copied to a uniquely named ".tmp" file inside the
    cache directory and then moved to its final name. Errors raised while
    copying propagate to the caller; the temporary file is removed in that case.
    """
    locCacheDir = join(pubConf.getTempDir(), "fileCache")
    createDirRace(locCacheDir)
    locPath = join(locCacheDir, basename(fname))
    logging.debug("Getting a local cache path for %s" % fname)
    if isfile(locPath):
        return locPath

    # it doesn't exist yet.
    # wait a random 1-4 seconds so that jobs started together do not all begin copying at once
    time.sleep(random.randint(1, 3) + random.random())  # let's add some randomness
    if isfile(locPath):
        # another process filled the cache while we were sleeping
        return locPath

    # mkstemp creates the file atomically, unlike mktemp which only picks a name
    # that another process could take before we use it. Keeping the temp file in
    # the cache directory means the final move stays on one filesystem, so other
    # processes never see a half-written file under the final name.
    fd, locTmpName = tempfile.mkstemp(dir=locCacheDir, prefix=basename(fname), suffix=".tmp")
    os.close(fd)
    try:
        # copy over to local temp name, takes a while
        logging.debug("Copying %s to %s" % (fname, locTmpName))
        shutil.copy(fname, locTmpName)
        # if another process copied it over by now, keep its copy and drop ours
        if not isfile(locPath):
            # move on local node to final name
            logging.debug("Moving %s to %s" % (locTmpName, locPath))
            shutil.move(locTmpName, locPath)
    finally:
        # still present after a failed copy or when another process won the race;
        # already gone after a successful move
        if isfile(locTmpName):
            os.remove(locTmpName)
    return locPath
Example #4
0
def getFromCache(fname):
    """ Given a network path, try to find a copy of this file on the local temp disk of a cluster node.
    Return the path on the local disk. If there is no copy yet, copy it over and return the path.

    fname: path of the file to cache. The cached copy is stored under the
           basename of fname in "<pubConf temp dir>/fileCache", so two files
           with the same basename end up as the same cache entry.

    Exceptions raised by the copy (e.g. fname is not readable) are passed on to
    the caller after the partial temporary file has been deleted.
    """
    locCacheDir = join(pubConf.getTempDir(), "fileCache")
    createDirRace(locCacheDir)
    locPath = join(locCacheDir, basename(fname))
    logging.debug("Getting a local cache path for %s" % fname)
    if isfile(locPath):
        return locPath

    # not cached yet: random delay of 1-4 seconds, then look again, so that
    # concurrently started jobs are less likely to all copy the same file
    time.sleep(random.randint(1,3)+random.random()) # let's add some randomness
    if isfile(locPath):
        return locPath

    # Reserve a unique temp file with mkstemp (mktemp is deprecated: it only
    # returns a name, which is open to races). It is placed in the cache
    # directory itself so the final move does not cross filesystems and a
    # partially written file is never visible under locPath.
    fd, locTmpName = tempfile.mkstemp(dir=locCacheDir, prefix=basename(fname), suffix=".tmp")
    os.close(fd)
    try:
        # copy over to local temp name, takes a while
        logging.debug("Copying %s to %s" % (fname, locTmpName))
        shutil.copy(fname, locTmpName)
        if not isfile(locPath):
            # nobody else got there first: move on local node to final name
            logging.debug("Moving %s to %s" % (locTmpName, locPath))
            shutil.move(locTmpName, locPath)
    finally:
        # roll back: the temp file only still exists if the copy failed or if
        # another process created locPath in the meantime
        if isfile(locTmpName):
            os.remove(locTmpName)
    return locPath
Example #5
0
    def __init__(self, fileDataFilename):
        """ Open the temporary file and article tables for one output chunk
        and record the names needed for the reference table.

        fileDataFilename: path of the file-data output file. Its directory is
        used for the final output names, and the part of its basename before
        the first dot is used as the chunk ID.

        The file and article tables are opened for writing as utf8 in the
        pubConf temp directory, get a "#"-prefixed, tab-separated header line
        and are registered with maxCommon.delOnExit. The reference table is
        not opened here: refFh starts out as None.
        """
        self.articlesWritten = 0
        self.filesWritten = 0
        self.tempDir = pubConf.getTempDir()

        # roundabout way of deriving all output names from the one filename
        # we are handed; needed because of parasol
        targetDir, dataBasename = os.path.split(fileDataFilename)
        chunk = dataBasename.partition(".")[0]
        self.fileBaseName = chunk + ".files"
        articlesTable = chunk + ".articles"
        refsTable = chunk + ".refs"

        # final, compressed outputs next to fileDataFilename
        self.finalArticleName = join(targetDir, articlesTable + ".gz")
        self.finalFileDataName = join(targetDir, self.fileBaseName + ".gz")

        # reference table: names only, the handle is filled in elsewhere
        self.refDir = join(targetDir, "refs")
        self.finalRefFname = join(self.refDir, refsTable + ".gz")
        self.tempRefName = join(self.tempDir, refsTable)
        self.refFh = None

        # uncompressed working copy of the files table
        filesTmpPath = os.path.join(self.tempDir, self.fileBaseName)
        self.fileFh = codecs.open(filesTmpPath, "w", encoding="utf8")
        filesHeader = "\t".join(fileDataFields)
        self.fileFh.write("#" + filesHeader + "\n")
        maxCommon.delOnExit(filesTmpPath)

        # uncompressed working copy of the articles table
        articlesTmpPath = os.path.join(self.tempDir, articlesTable)
        self.articleFh = codecs.open(articlesTmpPath, "w", encoding="utf8")
        articlesHeader = "\t".join(articleFields)
        self.articleFh.write("#" + articlesHeader + "\n")
        maxCommon.delOnExit(articlesTmpPath)

        self.outFilename = os.path.join(targetDir, dataBasename)
Example #6
0
def makeTempDir(prefix, tmpDir=None):
    """ create unique local temp subdir in pubtools temp dir.
        the pubtools temp dir is usually located on a local disk or ramdisk.

    prefix: the new directory is named prefix + "." + a random suffix.
    tmpDir: parent directory; defaults to pubConf.getTempDir(). It is created
            if it does not exist yet.
    Returns the path of the newly created directory.
    """
    if tmpDir is None:
        tmpDir = pubConf.getTempDir()
    # keep creating a missing parent directory, as the previous os.makedirs-based
    # version did
    if tmpDir and not isdir(tmpDir):
        try:
            os.makedirs(tmpDir)
        except OSError:
            # fine if another process created it in the meantime
            if not isdir(tmpDir):
                raise
    # mkdtemp picks the name and creates the directory in one atomic step, so
    # two processes can never end up with the same directory. The deprecated
    # mktemp only returns a name that someone else may create first.
    # NOTE: mkdtemp makes the directory accessible to the creating user only.
    dirName = tempfile.mkdtemp(dir=tmpDir, prefix=prefix+".")
    logging.debug("Created temporary dir %s" % dirName)
    return dirName
Example #7
0
def getImages(pdfName):
    """ returns a list of tuples
    (imgId (int), isThumbnail (int), width, height, md5sum, PNGBinarydataBlob) extracted from pdfName.
    returns two tuples per image, one is the original, one is the thumbnail.

    The images are extracted with the external program "pdfimages" into a
    temporary directory inside the pubConf temp dir and converted with
    ImageMagick's "convert". Images rejected by looksInteresting() are skipped.
    The temporary directory is removed before returning.
    """
    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    # safety net in case we fail before the rmtree at the end
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    # NOTE(review): pdfName is interpolated unquoted into the command string;
    # confirm how maxCommon.runCommand handles names with spaces or shell characters
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        # close the handle right away instead of leaving it to the garbage collector
        with open(fname) as ppmFh:
            x, y = pbmSize(ppmFh)
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)

        # PNG is binary data: read it in binary mode
        with open(outFname, "rb") as pngFh:
            pngBlob = pngFh.read()
        md5Str = makeMd5(pngBlob)

        data.append( (imgId, 0, x, y, md5Str, pngBlob) )

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)

        # the thumbnail tuple carries the thumbnail's own dimensions and md5
        x, y = pngDimensions(thumbFName)
        with open(thumbFName, "rb") as thumbFh:
            pngBlob = thumbFh.read()
        md5Str = makeMd5(pngBlob)

        data.append( (imgId, 1, x, y, md5Str, pngBlob) )

        # original and thumbnail share one imgId
        imgId += 1

    shutil.rmtree(tempDir)
    # the directory is gone already, so take it off the delete-on-exit list again
    maxCommon.ignoreOnExit(tempDir)
    return data
Example #8
0
def makeTempDir(prefix, tmpDir=None):
    """ create unique local temp subdir in pubtools temp dir.
        the pubtools temp dir is usually located on a local disk or ramdisk.

    prefix: start of the directory name; a "." and a random suffix are appended.
    tmpDir: directory in which to create the subdir, pubConf.getTempDir() if
            None. A missing tmpDir is created first.
    Returns the path of the new directory.
    """
    if tmpDir is None:
        tmpDir = pubConf.getTempDir()
    # the earlier os.makedirs-based code created a missing parent as a side
    # effect; keep doing that
    if tmpDir and not isdir(tmpDir):
        try:
            os.makedirs(tmpDir)
        except OSError:
            # only an error if the directory is still missing, i.e. it was not
            # created by a concurrent process
            if not isdir(tmpDir):
                raise
    # tempfile.mktemp is deprecated because the name it returns can be taken by
    # another process before we create it; mkdtemp chooses the name and creates
    # the directory atomically.
    # NOTE: directories made by mkdtemp are only accessible to the creating user.
    dirName = tempfile.mkdtemp(dir=tmpDir, prefix=prefix + ".")
    logging.debug("Created temporary dir %s" % dirName)
    return dirName
Example #9
0
def makeTempFile(prefix, suffix=".psl"):
    """ create tempfile in pubtools tempdir dir with given prefix.
    Return tuple (file object , name).
    Tempfile will auto-delete when file object is destructed, unless debug mode is set.

    prefix: start of the file name; a "." and a random part are appended.
    suffix: end of the file name.
    The file object is opened for writing in text mode.
    """
    tmpDir = pubConf.getTempDir()
    if pubConf.debug:
        import os
        # mkstemp creates the file atomically; the deprecated mktemp only picks
        # a name, which another process could take before we open it
        fd, tfname = tempfile.mkstemp(dir=tmpDir, prefix=prefix+".", suffix=suffix)
        os.close(fd)
        # reopen by name so that tf.name is the real path, as it is for the
        # NamedTemporaryFile returned in non-debug mode
        tf = open(tfname, "w")
        logging.debug("Created tempfile %s, debug-mode: no auto-deletion" % tfname)
    else:
        tf = tempfile.NamedTemporaryFile(dir=tmpDir, prefix=prefix+".", mode="w", suffix=suffix)
    return tf, tf.name
Example #10
0
def makeTempFile(prefix, suffix=".psl"):
    """ create tempfile in pubtools tempdir dir with given prefix.
    Return tuple (file object , name).
    Tempfile will auto-delete when file object is destructed, unless debug mode is set.

    prefix: the file name starts with prefix + ".".
    suffix: the file name ends with suffix.
    In both modes the returned file object is open for writing (text mode) and
    the returned name is the path of the file.
    """
    tmpDir = pubConf.getTempDir()
    if pubConf.debug:
        import os
        # debug mode: keep the file around after the run. Use mkstemp rather
        # than the deprecated, race-prone mktemp to reserve the unique name.
        fd, tfname = tempfile.mkstemp(dir=tmpDir, prefix=prefix+".", suffix=suffix)
        # we want a regular file object whose .name is the path, so drop the
        # raw descriptor and open the (now existing) file by name
        os.close(fd)
        tf = open(tfname, "w")
        logging.debug("Created tempfile %s, debug-mode: no auto-deletion" % tfname)
    else:
        tf = tempfile.NamedTemporaryFile(dir=tmpDir, prefix=prefix+".", mode="w", suffix=suffix)
    return tf, tf.name
Example #11
0
def toAscii(
    fileData,
    mimeType=None,
    maxBinFileSize=pubConf.maxBinFileSize,
    maxTxtFileSize=pubConf.maxTxtFileSize,
    minTxtFileSize=pubConf.minTxtFileSize,
):
    """ pick out the content from the fileData dictionary,
    convert it to ASCII format with the converter configured for its
    file type and put the output back into the content field.

    fileData: dictionary with at least the keys "content", "url", "desc",
        "fileId" and "articleId"; "mimeType" is optional.
    mimeType will be used if specified, otherwise fileData["mimeType"] is
    used if present. If the mime type is unknown or not in pubConf.MIMEMAP,
    the converter is chosen based on the url file extension.
    maxBinFileSize: content longer than this is not converted.
    maxTxtFileSize, minTxtFileSize: converted text longer/shorter than this
        is rejected.

    returns fileData if successful, otherwise None
    returns only unicode strings (despite the name)
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)
    if fileSize > maxBinFileSize:
        # separate the fields with commas so that the log line stays readable
        fileDebugDesc = ",".join(
            [fileData["url"], fileData["desc"], fileData["fileId"], fileData["articleId"]]
        )
        logging.warning(
            "binary file size before conversion %d > %d, skipping file %s"
            % (fileSize, maxBinFileSize, fileDebugDesc)
        )
        return None

    url = fileData["url"]

    # an explicit mimeType argument wins over the one stored with the file
    if mimeType is None:
        mimeType = fileData.get("mimeType")

    fileExt = None
    if mimeType:
        fileExt = pubConf.MIMEMAP.get(mimeType, None)
        logging.debug("File extension determined as %s" % fileExt)
    if fileExt is None:
        # fall back to the extension of the url, lowercased, without the dot
        fileExt = os.path.splitext(url)[1].lower().strip(".")

    if fileExt not in converters:
        logging.debug("Could not convert file %s, no converter for extension %s" % (url, fileExt))
        return None

    # either one of the special keywords handled below or a command line for runConverter
    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # content is used as it is
        pass

    elif cmdLine == "XMLTEXT" or cmdLine == "NXMLTEXT":
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            logging.warning("Could not convert xml to ascii")
            return None
        fileData["content"] = asciiData
    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)
        # PDFs get a second chance with the converter configured as "pdf2"
        if fileExt == "pdf" and (asciiData is None or countBadChars(asciiData) >= 10):
            logging.debug("No data or too many non printable characters in PDF, trying alternative program")
            cmdLine = converters["pdf2"]
            asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        if asciiData is None:
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    if len(fileData["content"]) > maxTxtFileSize:
        logging.warning("ascii file size after conversion too big, ignoring file")
        return None

    if len(fileData["content"]) < minTxtFileSize:
        logging.warning("ascii file size after conversion too small, ignoring file")
        return None

    return fileData
Example #12
0
def toAscii(fileData, mimeType=None,
        maxBinFileSize=pubConf.maxBinFileSize, maxTxtFileSize=pubConf.maxTxtFileSize,
        minTxtFileSize=pubConf.minTxtFileSize):
    """ pick out the content from the fileData dictionary,
    convert it to ASCII format with the converter configured for its
    file type and put the output back into the content field.

    fileData: dictionary with the key "content" and either "locFname" and
        "externalId" or "url", "desc", "fileId" and "articleId". The latter
        are only used to describe the file in log messages here.
    mimeType will be used if specified, otherwise try to guess
    converter based on url file extension (see getFileExt)
    maxBinFileSize: content longer than this is not converted.
    maxTxtFileSize, minTxtFileSize: converted text longer/shorter than this
        is rejected.

    returns fileData if successful, otherwise None
    returns only unicode strings (despite the name)
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)

    # fileDebugDesc identifies the file in all log messages below
    if "locFname" in fileData:
        locFname = fileData["locFname"]
        fileDebugDesc = fileData["externalId"]+":"+locFname
    else:
        locFname = None
        fileDebugDesc = ",".join([fileData["url"],fileData["desc"],
            fileData["fileId"],fileData["articleId"]])

    if fileSize > maxBinFileSize:
        logging.warning("binary file size before conversion %d > %d, skipping file %s" %
            (fileSize, maxBinFileSize, fileDebugDesc))
        return None

    fileExt = getFileExt(fileData, locFname, mimeType)

    if fileExt not in converters:
        logging.debug("Could not convert file %s, no converter for extension %s" %
            (fileDebugDesc, fileExt))
        return None
    # either one of the special keywords handled below or a command line for runConverter
    cmdLine = converters[fileExt]

    if cmdLine=="COPY":
        # fileData["content"] already contains ASCII text
        pass

    elif cmdLine=="XMLTEXT" or cmdLine=="NXMLTEXT":
        logging.debug("stripping XML tags")
        if cmdLine=="NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            # use fileDebugDesc like every other message here; the "url" key is
            # only read above when there is no "locFname" -- presumably it can
            # be missing for local files
            logging.debug("Could not convert xml to ascii, file %s" % fileDebugDesc)
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted pdf2text output and run second converter
        if fileExt=="pdf" and \
            (asciiData is None or len(asciiData)<minTxtFileSize or countBadChars(asciiData)>=10):
            logging.debug("No data or too many non printable characters in PDF, trying alternative program")
            cmdLine = converters["pdf2"]
            asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        if asciiData is None:
            logging.info("conversion failed for %s" % fileDebugDesc)
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    if len(fileData["content"]) > maxTxtFileSize:
        logging.info("ascii file size after conversion too big, ignoring file %s" % fileDebugDesc)
        return None

    if len(fileData["content"]) < minTxtFileSize:
        logging.debug("ascii file size only %d bytes < %d, ignoring %s" %
            (len(fileData["content"]), minTxtFileSize, fileDebugDesc))
        return None

    return fileData
Example #13
0
def toAscii(fileData, mimeType=None,
        maxBinFileSize=pubConf.maxBinFileSize, maxTxtFileSize=pubConf.maxTxtFileSize,
        minTxtFileSize=pubConf.minTxtFileSize):
    """ pick out the content from the fileData dictionary,
    convert it to ASCII format with the converter configured for its
    file type and put the output back into the content field.

    fileData: dictionary holding "content" plus either "locFname" and
        "externalId" or "url", "desc", "fileId" and "articleId"; in this
        function the latter only serve to name the file in log messages.
    mimeType will be used if specified, otherwise try to guess
    converter based on url file extension (see getFileExt)
    maxBinFileSize: upper limit for the length of the unconverted content.
    maxTxtFileSize, minTxtFileSize: limits for the length of the converted text.

    returns fileData if successful, otherwise None
    returns only unicode strings (despite the name)
    """
    converters = pubConf.getConverters()
    tempDir = pubConf.getTempDir()

    fileContent = fileData["content"]
    fileSize = len(fileContent)

    # build a description of the file for the log messages below
    if "locFname" in fileData:
        locFname = fileData["locFname"]
        fileDebugDesc = fileData["externalId"] + ":" + locFname
    else:
        locFname = None
        fileDebugDesc = ",".join([
            fileData["url"], fileData["desc"], fileData["fileId"],
            fileData["articleId"]
        ])

    if fileSize > maxBinFileSize:
        logging.warning(
            "binary file size before conversion %d > %d, skipping file %s" %
            (fileSize, maxBinFileSize, fileDebugDesc))
        return None

    fileExt = getFileExt(fileData, locFname, mimeType)

    if fileExt not in converters:
        logging.debug(
            "Could not convert file %s, no converter for extension %s" %
            (fileDebugDesc, fileExt))
        return None
    # a special keyword (COPY, XMLTEXT, NXMLTEXT) or a command line for runConverter
    cmdLine = converters[fileExt]

    if cmdLine == "COPY":
        # fileData["content"] already contains ASCII text
        pass

    elif cmdLine == "XMLTEXT" or cmdLine == "NXMLTEXT":
        logging.debug("stripping XML tags")
        if cmdLine == "NXMLTEXT":
            asciiData = pubXml.stripXmlTags(fileContent, isNxmlFormat=True)
        else:
            asciiData = pubXml.stripXmlTags(fileContent)

        if asciiData is None:
            # log fileDebugDesc rather than fileData["url"]: "url" is only read
            # above when "locFname" is absent, so it presumably may be missing
            # for local files
            logging.debug("Could not convert xml to ascii, file %s" %
                          fileDebugDesc)
            return None
        fileData["content"] = asciiData

    else:
        asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        # try to detect corrupted pdf2text output and run second converter
        pdfLooksBroken = (asciiData is None
                          or len(asciiData) < minTxtFileSize
                          or countBadChars(asciiData) >= 10)
        if fileExt == "pdf" and pdfLooksBroken:
            logging.debug(
                "No data or too many non printable characters in PDF, trying alternative program"
            )
            cmdLine = converters["pdf2"]
            asciiData = runConverter(cmdLine, fileContent, fileExt, tempDir)

        if asciiData is None:
            logging.info("conversion failed for %s" % fileDebugDesc)
            return None
        fileData["content"] = removeBadChars(asciiData)

    fileData = dictToUnicode(fileData)

    if len(fileData["content"]) > maxTxtFileSize:
        logging.info(
            "ascii file size after conversion too big, ignoring file %s" %
            fileDebugDesc)
        return None

    if len(fileData["content"]) < minTxtFileSize:
        logging.debug(
            "ascii file size only %d bytes < %d, ignoring %s" %
            (len(fileData["content"]), minTxtFileSize, fileDebugDesc))
        return None

    return fileData
Example #14
0
def getImages(pdfName):
    """ returns a list of tuples
    (imgId (int), isThumbnail (int), width, height, md5sum, PNGBinarydataBlob) extracted from pdfName.
    returns two tuples per image, one is the original, one is the thumbnail.

    Returns None if the first 30 characters of the file contain an <html tag,
    i.e. the "PDF" is really an HTML page.

    The images are extracted with the external program "pdfimages" into a
    temporary directory inside the pubConf temp dir and converted with
    ImageMagick's "convert". Images rejected by looksInteresting() or whose
    md5 is in md5Blacklist are skipped. The temporary directory is removed
    before returning.
    """
    loadBlacklist()

    # close the handle right away instead of leaving it to the garbage collector
    with open(pdfName) as pdfFh:
        head = pdfFh.read(30)
    if "<html" in head or "<HTML" in head:
        logging.info("PDF %s is an HTML file, skipping" % pdfName)
        return None

    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    # safety net in case we fail before the rmtree at the end
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    # NOTE(review): pdfName is interpolated unquoted into the command string;
    # confirm how maxCommon.runCommand handles names with spaces or shell characters
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        with open(fname) as ppmFh:
            x, y = pbmSize(ppmFh)
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)

        # PNG is binary data: read it in binary mode
        with open(outFname, "rb") as pngFh:
            pngBlob = pngFh.read()
        md5Str = makeMd5(pngBlob)

        # goes to the debug log instead of being printed to stdout for every image
        logging.debug("Image MD5 is %s" % md5Str)
        if md5Str in md5Blacklist:
            logging.debug("Image MD5 is blacklisted")
            continue

        data.append((imgId, 0, x, y, md5Str, pngBlob))

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)

        # the thumbnail tuple carries the thumbnail's own dimensions and md5
        x, y = pngDimensions(thumbFName)
        with open(thumbFName, "rb") as thumbFh:
            pngBlob = thumbFh.read()
        md5Str = makeMd5(pngBlob)

        data.append((imgId, 1, x, y, md5Str, pngBlob))

        # original and thumbnail share one imgId
        imgId += 1

    shutil.rmtree(tempDir)
    # the directory is gone already, so take it off the delete-on-exit list again
    maxCommon.ignoreOnExit(tempDir)
    return data