Example #1
0
def convertOneChunk(inIndexFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    store = pubStore.PubWriterFile(outFile)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    doi2pmid = None
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    for row in inRows:
        # read line
        i+=1
        articleId, baseDir = row.articleId, row.baseDir
        zipFilename, filename = row.zipFilename, row.filename
        articleId=int(articleId)

        # open file from zipfile
        fullZipPath = join(baseDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows)-i))
        if doi2pmid==None:
            doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        xmlTree   = pubXml.etreeFromXml(xmlString)

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"]="consyn://"+zipFilename+"/"+filename
        if articleData["doi"] in doi2pmid:
           articleData["pmid"] = doi2pmid[articleData["doi"]]

        pii = splitext(basename(filename))[0]
        articleData["externalId"]="PII"+pii
        articleData["fulltextUrl"]="http://www.sciencedirect.com/science/svapps/pii/"+pii

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString==None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files" % convCount)
    store.close()
Example #2
0
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i+=1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId=int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows)-i))
        #if doi2pmid==None:
            #doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree   = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"]=zipFilename+":"+filename
        #if articleData["doi"] in doi2pmid:
           #articleData["pmid"] = doi2pmid[articleData["doi"]]

        articleData["externalId"]=pii
        articleData["fulltextUrl"]="http://www.sciencedirect.com/science/svapps/pii/"+pii
        #articleData["pmid"]  = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString==None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file 
        idRow = [str(articleData["articleId"]), articleData["doi"], articleData["externalId"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
Example #3
0
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u"\xbf" in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename == "":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename + ":" + filename

        if pdfString == None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue

        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)

        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"] = zipFilename + "/" + filename
        articleData["externalId"] = articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000 * (articleId)) + 1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
Example #4
0
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """
    store = pubStore.PubWriterFile(outFile)
    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile),
                   basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId = int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" %
                      (fullZipPath, filename, len(inRows) - i))
        #if doi2pmid==None:
        #doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" %
                          (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData == None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"] = zipFilename + ":" + filename
        #if articleData["doi"] in doi2pmid:
        #articleData["pmid"] = doi2pmid[articleData["doi"]]

        articleData["externalId"] = pii
        articleData[
            "fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii
        #articleData["pmid"]  = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString == None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file
        idRow = [
            str(articleData["articleId"]), articleData["doi"],
            articleData["externalId"],
            str(articleData["pmid"])
        ]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * (articleId)) + 1,
                        fileData,
                        externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
Example #5
0
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ 
    get files from inIndexFile, parse Xml, 
    write everything to outfile in ascii format
    """ 
    diskDir = abspath(join(zipDir, "..", "disk"))

    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    doneIds = parseDoneIds(inIdFile)

    # open output id files
    idFname = join(dirname(outFile), basename(outFile).split(".")[0]+".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()

    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read line
        i+=1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename

        if u'\xbf' in filename:
            logging.info("Found weird character, skipping file")
            continue
        
        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename=="":
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename+":"+filename

        if pdfString==None:
            pdfNotFound+=1
            logging.error("Could not open pdf or xml file")
            continue

        articleId=int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree   = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        articleData = parseXml(xmlTree, articleData)

        if articleData==None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue

        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["origFile"]=zipFilename+"/"+filename
        articleData["externalId"]=articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000*(articleId))+1, fileData, externalId=articleData["externalId"])

        # write IDs to separate file 
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()