Example #1
def makeLocalTempFile():
    " create tmp file on local harddisk, delete on program exit "
    fd, tmpOutFname = tempfile.mkstemp(dir=pubConf.getTempDir(), prefix="pubRun", suffix=".tab")
    os.close(fd)
    logging.debug("local temporary file is %s" % tmpOutFname)
    maxCommon.delOnExit(tmpOutFname)
    return tmpOutFname
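
Every example on this page registers temp files with maxCommon.delOnExit, but the maxCommon module itself is not shown. A minimal sketch of how such a helper can be built on atexit; the names and behavior are assumptions, not the real maxCommon code:

# hypothetical sketch of an atexit-based cleanup registry, similar in
# spirit to maxCommon.delOnExit/ignoreOnExit; not the real maxCommon code
import atexit, os, shutil

_delPaths = set()

def delOnExit(path):
    " remember a file or directory to remove when the program exits "
    _delPaths.add(path)

def ignoreOnExit(path):
    " forget a previously registered path, e.g. one already removed "
    _delPaths.discard(path)

def _cleanup():
    for path in _delPaths:
        if os.path.isdir(path):
            shutil.rmtree(path, ignore_errors=True)
        elif os.path.exists(path):
            os.remove(path)

atexit.register(_cleanup)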
Example #2
    def __init__(self, fileDataFilename):
        self.articlesWritten = 0
        self.filesWritten = 0
        self.tempDir = pubConf.getTempDir()

        # derive the output filenames from the input filename;
        # this indirection is needed because of parasol
        outDir = os.path.dirname(fileDataFilename)
        fileDataBasename = os.path.basename(fileDataFilename)
        chunkId = fileDataBasename.split(".")[0]
        self.fileBaseName = chunkId+".files"
        articleBaseName = chunkId+".articles"
        refBaseName = chunkId+".refs"

        self.finalArticleName = join(outDir, articleBaseName+".gz")
        self.finalFileDataName    = join(outDir, self.fileBaseName+".gz")

        # setup reference table handle
        self.refDir = join(outDir, "refs")
        self.finalRefFname = join(self.refDir, refBaseName+".gz")
        self.tempRefName = join(self.tempDir, refBaseName)
        self.refFh = None

        # setup file and article table handles
        fileFname = os.path.join(self.tempDir, self.fileBaseName)
        self.fileFh = codecs.open(fileFname, "w", encoding="utf8")
        self.fileFh.write("#"+"\t".join(fileDataFields)+"\n")
        maxCommon.delOnExit(fileFname)

        articleFname = os.path.join(self.tempDir, articleBaseName)
        self.articleFh = codecs.open(articleFname, "w", encoding="utf8") 
        self.articleFh.write("#"+"\t".join(articleFields)+"\n")
        maxCommon.delOnExit(articleFname)

        self.outFilename = os.path.join(outDir, fileDataBasename)
Example #3
def runConverter(cmdLine, fileContent, fileExt, tempDir):
    """ create local temp for in and output files, write data to infile, run
    command. file can be supplied as a str in fileContent["content"] or
    alternatively as a pathname via 'locFname' """
    # create temp file
    fd, inFname = tempfile.mkstemp(suffix="."+fileExt, dir=tempDir, prefix="pubConvPmc.in.")
    logging.debug("Created %s" % inFname)
    maxCommon.delOnExit(inFname)

    inFile = os.fdopen(fd, "wb")
    inFile.write(fileContent)
    inFile.close()

    logging.debug("Created in temp file %s" % (inFname))

    fd, outFname = tempfile.mkstemp(suffix=".txt", dir=tempDir, prefix="pubConvPmc.out.")
    maxCommon.delOnExit(outFname)
    os.close(fd)
    logging.debug("Created out temp file %s" % (outFname))

    # allow %(name)s syntax in cmdLine string to use variables from pubConf
    cmdLine = cmdLine % pubConf.__dict__
    # build cmd line and run
    cmdLine = cmdLine.replace("$in", inFname)
    cmdLine = cmdLine.replace("$out", outFname)
    logging.debug("running "+cmdLine)
    skipFile=False
    stdout, stderr, ret = runCommandTimeout(cmdLine, bufSize=10000000, timeout=30)

    asciiData = None

    if ret==2 and "docx2txt" not in cmdLine: # docx2txt returns exit code 2 in some cases
        logging.error("stopped on errno 2: looks like you pressed ctrl-c")
        sys.exit(2)

    if ret!=0:
        logging.error("error %d occured while executing %s" % (ret, cmdLine))
        printOut(stdout, stderr)
        skipFile=True
    if os.path.getsize(outFname)==0:
        logging.error("zero file size of output file after command %s" % (cmdLine))
        printOut(stdout, stderr)
        skipFile=True

    if not skipFile:
        asciiData = open(outFname).read()

    logging.debug("Removing %s" % inFname)
    os.remove(inFname)
    os.remove(outFname)

    asciiData = forceToUnicode(asciiData)

    if skipFile:
        return None
    else:
        return asciiData
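
runCommandTimeout is not defined in this snippet; it apparently returns (stdout, stderr, returncode) and kills the command once the timeout expires. A rough sketch under those assumptions:

# hypothetical sketch of a command runner with a watchdog timer, matching
# the (stdout, stderr, ret) interface used above; not the real function
import subprocess, threading

def runCommandTimeout(cmdLine, bufSize=10000000, timeout=30):
    proc = subprocess.Popen(cmdLine, shell=True, bufsize=bufSize,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    timer = threading.Timer(timeout, proc.kill)  # kill if it runs too long
    try:
        timer.start()
        stdout, stderr = proc.communicate()
    finally:
        timer.cancel()
    return stdout, stderr, proc.returncode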
Example #4
def getImages(pdfName):
    """ returns a list of tuples 
    (imgId (int), isThumbnail (int), width, height, md5sum, PNGBinarydataBlob) extracted from pdfName.
    returns two tuples per image, one is the original, one is the thumbnail.
    """
    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        x, y = pbmSize(open(fname))
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)
        
        pngBlob = open(outFname).read()
        md5Str = makeMd5(pngBlob)

        data.append( (imgId, 0, x, y, md5Str, pngBlob) )

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)

        x, y = pngDimensions(thumbFName)
        pngBlob = open(thumbFName).read()
        md5Str = makeMd5(pngBlob)

        data.append( (imgId, 1, x, y, md5Str, pngBlob) )

        imgId += 1
            
    shutil.rmtree(tempDir)
    maxCommon.ignoreOnExit(tempDir)
    return data
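
pbmSize is assumed to read the image dimensions from the PPM/PBM header that pdfimages writes. A minimal sketch of such a parser; the real helper is not shown:

# hypothetical sketch: parse a PNM (PBM/PGM/PPM) header, return (width, height)
def pbmSize(fh):
    tokens = []
    while len(tokens) < 3:  # need magic number, width, height
        line = fh.readline()
        if not line:
            raise ValueError("truncated PNM header")
        tokens.extend(line.split("#")[0].split())  # '#' starts a comment
    return int(tokens[1]), int(tokens[2])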
Example #5
def startRedis(dbFname):
    """ starts redis on current server as daemon.
    Creates status files dbFname+".pid" and dbFname+".host". Returns hostname and port.

    >>> import pubGeneric
    >>> pubGeneric.setupLogging(__file__, None)
    >>> h, p = startRedis("/tmp/test.tab.gz")
    >>> r = redis.Redis(port=p)
    >>> r.set("hello", "world")
    True
    >>> r.get("hello")
    'world'
    >>> r.get("world")
    >>> r.shutdown()
    """
    dbFname = abspath(dbFname)
    pidFname = dbFname + ".pid"
    port = findFreePort()
    dirName = dirname(dbFname)
    baseName = basename(dbFname) + ".rdb"

    hostFname = dbFname + ".host"
    hostname = socket.gethostbyname("localhost")
    hostDesc = hostname + ":" + str(port)
    open(hostFname, "w").write(hostDesc)
    logging.info("Wrote redis host info %s to %s" % (hostDesc, hostFname))
    maxCommon.delOnExit(hostFname)
    maxCommon.delOnExit(pidFname)
    atexit.register(shutdownRedisServers)
    global redisPorts
    redisPorts.append(port)

    cmd = ["redis-server", "--daemonize", "yes", "--pidfile", pidFname, \
        "--port", str(port), "--rdbchecksum", "no", "--dir", dirName,
        "--dbfilename", baseName, "--maxmemory", "200gb"]
    logging.info("Starting up redis server on localhost")
    maxCommon.runCommand(cmd)

    # wait until startup is complete
    redisStart = True
    while redisStart:
        try:
            r = redis.Redis(port=port)
            dbSize = r.dbsize()
            redisStart = False
        except redis.ConnectionError:
            logging.info("Waiting for 1 sec for redis startup completion")
            time.sleep(1)
            pass
    logging.info("Redis startup completed, dbSize=%d" % dbSize)

    return "localhost", port
Example #6
    def __init__(self, fname, singleProcess=False, newDb=False, tmpDir=None, onlyKey=False, compress=False, keyIsInt=False, eightBit=False, onlyUnique=False):
        self.onlyUnique = onlyUnique
        self.compress = compress
        self.batchMaxSize = 100000
        self.batch = []
        self.finalDbName = None
        self.onlyKey = onlyKey
        self.dbName = "%s.sqlite" % fname
        if newDb and isfile(self.dbName):
            os.remove(self.dbName)
        isolLevel = None  # note: computed here but never passed to sqlite3.connect below
        self.singleProcess = singleProcess
        if singleProcess:
            isolLevel = "exclusive"
        self.con = None
        if not os.path.isfile(self.dbName) and tmpDir!=None:
            # create a new temp db on ramdisk
            self.finalDbName = self.dbName
            #self.dbName = join(pubConf.getFastTempDir(), basename(self.dbName))
            self.dbName = join(tmpDir, basename(self.dbName))
            logging.debug("Creating new temp db on ramdisk %s" % self.dbName)
            if isfile(self.dbName):
                os.remove(self.dbName)
            maxCommon.delOnExit(self.dbName) # make sure this is deleted on exit
        try:
            self.con = sqlite3.connect(self.dbName)
        except sqlite3.OperationalError:
            logging.error("Could not open %s" % self.dbName)
            raise

        logging.debug("Opening sqlite DB %s" % self.dbName)

        keyType = "TEXT"
        if keyIsInt:
            keyType = "INT"
        if onlyKey:
            self.con.execute("CREATE TABLE IF NOT EXISTS data (key %s PRIMARY KEY)" % keyType)
        else:
            self.con.execute("CREATE TABLE IF NOT EXISTS data (key %s PRIMARY KEY,value BLOB)" % keyType)
        self.con.commit()

        self.cur = self.con
        if singleProcess:
            self.cur.execute("PRAGMA synchronous=OFF") # recommended by
            self.cur.execute("PRAGMA count_changes=OFF") # http://blog.quibb.org/2010/08/fast-bulk-inserts-into-sqlite/
            self.cur.execute("PRAGMA cache_size=8000000") # http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html
            self.cur.execute("PRAGMA journal_mode=OFF") # http://www.sqlite.org/pragma.html#pragma_journal_mode
            self.cur.execute("PRAGMA temp_store=memory")
            self.con.commit()

        if eightBit:
            self.con.text_factory = str
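
The batch list and batchMaxSize suggest buffered bulk inserts, which is what the PRAGMA settings above are tuned for. A hypothetical sketch of how such a flush could look with executemany; the method names are assumed and apply only to the two-column (key, value) table:

    # hypothetical companion methods for the store above; names are assumed
    def put(self, key, value):
        self.batch.append((key, value))
        if len(self.batch) >= self.batchMaxSize:
            self._flushBatch()

    def _flushBatch(self):
        if self.onlyUnique:
            sql = "INSERT OR IGNORE INTO data (key, value) VALUES (?,?)"
        else:
            sql = "INSERT INTO data (key, value) VALUES (?,?)"
        self.con.executemany(sql, self.batch)
        self.con.commit()
        self.batch = []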
Example #7
def indexTsv(zipFname, tsvName, outFname):
    """ unzip a zipfile, recompress all the tsvs inside
    with gzip and create an .index.gz for them"""

    #def indexTsv(zipFname, tsvName, outFname, bgzipPath):

    # extract to local disk
    tmpDir = pubGeneric.makeTempDir("bingData")
    maxCommon.delOnExit(tmpDir)
    logging.info("Extracting to %s" % tmpDir)
    cmd =["unzip", "-d",tmpDir, zipFname]
    maxCommon.runCommand(cmd)

    tempFname = join(tmpDir, tsvName)
    logging.info("Indexing %s to %s" % (tempFname, outFname))
    # index lines
    ofh = gzip.open(outFname, "w")
    ifh = open(tempFname, "rb")
    offset = 0
    # cannot mix the file iterator with tell(): iteration uses an internal
    # read-ahead buffer, so tell() would return wrong offsets
    #for line in ifh:
    while True:
        line = ifh.readline()
        if line=="":
            break
        url = line[0:line.find("\t")]
        ofh.write("%s\t%d\n" % (url, offset))
        #logging.debug("url %s, offset %d" % (url, offset))
        offset = ifh.tell()
    ofh.close()

    # re-compress with gzip
    tmpFnames = glob.glob(join(tmpDir, "*.tsv"))
    assert(len(tmpFnames)==1)
    tmpFname = tmpFnames[0]
    zipDir = dirname(zipFname)
    finalFname = join(zipDir, tsvName+".gz")
    logging.info("Compressing to %s" % finalFname)
    #cmd = "%s %s -c > %s" % (bgzipPath, tmpFname, finalFname)
    cmd = "gzip %s -c > %s" % (tmpFname, finalFname)
    maxCommon.runCommand(cmd)
    shutil.rmtree(tmpDir)
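
The index maps each URL to the byte offset of its line in the uncompressed tsv; plain gzip output does not allow seeking, which is presumably why the bgzip variant is commented out. A hedged sketch of a lookup against the uncompressed file:

# hypothetical lookup using the .index.gz written above
import gzip

def findLine(indexFname, tsvFname, url):
    for line in gzip.open(indexFname):
        idxUrl, offset = line.rstrip("\n").split("\t")
        if idxUrl == url:
            ifh = open(tsvFname, "rb")
            ifh.seek(int(offset))  # jump straight to the line
            return ifh.readline()
    return None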
Example #8
    def writeRefs(self, artDict, refRows):
        " write references to table in refs/ subdir"
        if self.refFh==None:
            # lazily open ref file, add headers
            if not os.path.isdir(self.refDir):
                try:
                    os.makedirs(self.refDir)
                except OSError:
                    logging.info("makedir %s failed, probably just race condition" % self.refDir)
                    pass
            #self.refFh = tempfile.NamedTemporaryFile(dir=self.tempDir, suffix=".gz", prefix=self.fileBaseName+".")
            self.refFh = open(self.tempRefName, "w")
            logging.info("Created tempfile for refs %s" % self.refFh.name)
            maxCommon.delOnExit(self.tempRefName)

            refHeaders = copy.copy(refArtFields)
            refHeaders.extend(refFields)
            self.refFh.write("#"+"\t".join(refHeaders)+"\n")

        # prepare a list of article IDs of the source article
        srcArtFields = []
        for artField in refArtFields:
            if artField=="artDoi":
                artField="doi"
            if artField=="artPmid":
                artField="pmid"
            artVal = artDict[artField]
            srcArtFields.append(artVal)
        srcPrefix = "\t".join(srcArtFields)+"\t"

        # output all references
        logging.debug("Writing %d references for article %s" % (len(refRows), artDict["externalId"]))
        for ref in refRows:
            # output the source article IDs
            self.refFh.write(srcPrefix.encode("utf8"))

            # output the reference article fields
            self.refFh.write(u'\t'.join(ref).encode("utf8"))
            self.refFh.write("\n")
Example #9
def getImages(pdfName):
    """ returns a list of tuples 
    (imgId (int), isThumbnail (int), width, height, md5sum, PNGBinarydataBlob) extracted from pdfName.
    returns two tuples per image, one is the original, one is the thumbnail.
    """
    loadBlacklist()

    head = open(pdfName).read(30)
    if "<html" in head or "<HTML" in head:
        logging.info("PDF %s is an HTML file, skipping" % pdfName)
        return None

    logging.debug("Extracting images from %s" % pdfName)
    tempDir = tempfile.mkdtemp(prefix="pdfimages", dir=pubConf.getTempDir())
    maxCommon.delOnExit(tempDir)
    outStem = join(tempDir, "img")
    cmd = "pdfimages %s %s" % (pdfName, outStem)
    maxCommon.runCommand(cmd)

    # convert to png
    data = []
    imgId = 0
    for fname in glob.glob(join(tempDir, "*.ppm")):
        logging.debug("got image %s" % fname)
        x, y = pbmSize(open(fname))
        if not looksInteresting(x, y):
            logging.debug("Image is too small or too long/wide")
            continue

        logging.debug("Loading image into sqlite")
        outFname = "%s.png" % fname
        cmd = "convert %s %s" % (fname, outFname)
        maxCommon.runCommand(cmd)

        pngBlob = open(outFname).read()
        md5Str = makeMd5(pngBlob)

        print "XX", md5Str, list(md5Blacklist)[:10]
        if md5Str in md5Blacklist:
            logging.debug("Image MD5 is blacklisted")
            continue

        data.append((imgId, 0, x, y, md5Str, pngBlob))

        # make the thumbnail
        thumbFName = "%s.thumb.png" % fname
        # see https://www.smashingmagazine.com/2015/06/efficient-image-resizing-with-imagemagick/
        # but can't use -posterize 136 on centos6
        cmd = "convert -filter Triangle -define filter:support=2 -thumbnail %d " \
            "-unsharp 0.25x0.25+8+0.065 -dither None -quality 82 -define png:compression-filter=5 " \
            "-define png:compression-level=9 -define png:compression-strategy=1 " \
            "-define png:exclude-chunk=all -interlace none -colorspace " \
            "sRGB -strip %s %s" % (WIDTH, fname, thumbFName)
        maxCommon.runCommand(cmd)

        x, y = pngDimensions(thumbFName)
        pngBlob = open(thumbFName).read()
        md5Str = makeMd5(pngBlob)

        data.append((imgId, 1, x, y, md5Str, pngBlob))

        imgId += 1

    shutil.rmtree(tempDir)
    maxCommon.ignoreOnExit(tempDir)
    return data
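
makeMd5 is assumed to be a thin wrapper around hashlib; a sketch, since the real helper is not shown:

# hypothetical sketch of makeMd5: hex digest of a binary blob
import hashlib

def makeMd5(blob):
    return hashlib.md5(blob).hexdigest()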
Example #10
def getFastUniqueTempFname():
    " create unique tempdir on ramdisk, delete on exit "
    tempFname = tempfile.mktemp(dir=pubConf.getFastTempDir())
    maxCommon.delOnExit(tempFname)
    return tempFname
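
tempfile.mktemp only generates a name, so there is a race window before the caller creates the file. Where the caller only needs a path that already exists, a safer variant with the same interface (a sketch, reusing pubConf and maxCommon from the examples above):

# safer variant: mkstemp creates the file atomically, then the descriptor
# is closed so that only the path is handed around
import os, tempfile

def getFastUniqueTempFnameSafe():
    fd, tempFname = tempfile.mkstemp(dir=pubConf.getFastTempDir())
    os.close(fd)
    maxCommon.delOnExit(tempFname)
    return tempFname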
Example #11
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i+=1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u'\xbf' in filename:
            logging.info("Found weird character, skipping file")
            continue
        
        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename=="":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename+":"+filename

        if pdfString==None:
            pdfNotFound+=1
            logging.error("Could not open pdf or xml file")
            continue

        articleId=int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree   = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)

        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"]=zipFilename+"/"+filename
        articleData["externalId"]=articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file 
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
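
parseDoneIds is assumed to collect the DOIs already written to earlier .ids.tab files, matching the "#articleId\tdoi\tpmid" header written above. A hypothetical sketch consistent with how doneIds is used:

# hypothetical sketch of parseDoneIds: read the doi column into a set
def parseDoneIds(fname):
    doneIds = set()
    for line in open(fname):
        if line.startswith("#"):  # skip the header line
            continue
        fields = line.rstrip("\n").split("\t")
        if len(fields) >= 2:
            doneIds.add(fields[1])  # doi is the second column
    return doneIds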