import logging
import zipfile
from os.path import abspath, basename, dirname, join, splitext

import lxml.etree

# pubMunch framework modules; the helpers parseDoi2Pmid, parseElsevier,
# parseXml, treeToAscii_Elsevier, createFileData, getDiskData and
# getUpdateData are module-local functions defined elsewhere in the
# respective converter modules.
import maxCommon
import pubCompare
import pubConf
import pubGeneric
import pubStore
import pubXml


def convertOneChunk(inIndexFile, outFile):
    """get files from inIndexFile, parse XML, write everything to outFile in ASCII format"""
    store = pubStore.PubWriterFile(outFile)
    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    doi2pmid = None
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    for row in inRows:
        # read one index line
        i += 1
        articleId, baseDir = row.articleId, row.baseDir
        zipFilename, filename = row.zipFilename, row.filename
        articleId = int(articleId)

        # open file from zipfile
        fullZipPath = join(baseDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" %
                      (fullZipPath, filename, len(inRows) - i))
        if doi2pmid is None:
            doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        xmlTree = pubXml.etreeFromXml(xmlString)

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData is None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"] = "consyn://" + zipFilename + "/" + filename
        if articleData["doi"] in doi2pmid:
            articleData["pmid"] = doi2pmid[articleData["doi"]]

        pii = splitext(basename(filename))[0]
        articleData["externalId"] = "PII" + pii
        articleData["fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString is None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * articleId) + 1, fileData,
                        externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files" % convCount)
    store.close()
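# parseDoi2Pmid is called above but is not part of this excerpt. A minimal
# sketch of what it might look like, assuming baseDir holds a two-column,
# tab-separated DOI-to-PMID mapping file; the filename "doi2pmid.tab" and
# the column names are hypothetical, not confirmed by the source.
def parseDoi2PmidSketch(baseDir):
    doi2pmid = {}
    mapFname = join(baseDir, "doi2pmid.tab")  # hypothetical filename
    for row in maxCommon.iterTsvRows(mapFname):
        doi2pmid[row.doi] = int(row.pmid)  # assumed column names: doi, pmid
    return doi2pmid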
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """get files from inIndexFile, parse XML, write everything to outFile in ASCII format"""
    store = pubStore.PubWriterFile(outFile)
    # read all IDs that have already been converted
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id file
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read one index line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId = int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" %
                      (fullZipPath, filename, len(inRows) - i))
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData is None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"] = zipFilename + ":" + filename
        articleData["externalId"] = pii
        articleData["fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii
        # PMID lookup is currently disabled:
        # articleData["pmid"] = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString is None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"],
                 articleData["externalId"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * articleId) + 1, fileData,
                        externalId=articleData["externalId"])
        convCount += 1
    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
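# pubGeneric.parseDoneIds (and the Springer module's local parseDoneIds below)
# are not shown in this excerpt. A minimal stand-in, assuming the input is the
# tab-separated .ids.tab file written by earlier runs: a header line starting
# with '#', then one ID per row. The ID column differs per converter
# (externalId is column 2 for Elsevier, doi is column 1 for Springer); the
# real pubMunch helper may behave differently.
def parseDoneIdsSketch(inIdFile, idColumn=2):
    doneIds = set()
    for line in open(inIdFile):
        if line.startswith("#"):
            continue  # skip the header row
        fields = line.rstrip("\n").split("\t")
        doneIds.add(fields[idColumn])
    return doneIds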
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """get files from inIndexFile, parse XML, write everything to outFile in ASCII format"""
    diskDir = abspath(join(zipDir, "..", "disk"))
    store = pubStore.PubWriterFile(outFile)
    # read all IDs that have already been converted
    doneIds = parseDoneIds(inIdFile)

    # open output id file
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\tpmid\n")

    pmidFinder = pubCompare.PmidFinder()
    unzipTmp = pubGeneric.makeTempDir(prefix="pubConvSpringerUnzip", tmpDir=pubConf.getFastTempDir())
    maxCommon.delOnExit(unzipTmp)

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    logging.info("Converting %d files" % len(inRows))
    convCount = 0
    pdfNotFound = 0
    for row in inRows:
        # read one index line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        if u"\xbf" in filename:
            logging.info("Found weird character, skipping file")
            continue

        articleData = pubStore.createEmptyArticleDict(publisher="springer")
        if zipFilename == "":
            # articles from the bulk disk dump are read straight from the directory tree
            xmlString, pdfString = getDiskData(diskDir, filename)
            articleData["origFile"] = filename
        else:
            # update articles are unpacked from their zip archive
            xmlString, pdfString = getUpdateData(unzipTmp, zipDir, zipFilename, filename)
            articleData["origFile"] = zipFilename + ":" + filename
        if pdfString is None:
            pdfNotFound += 1
            logging.error("Could not open pdf or xml file")
            continue
        articleId = int(articleId)

        # parse xml
        logging.debug("Parsing XML")
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue
        articleData = parseXml(xmlTree, articleData)
        if articleData is None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        if articleData["doi"] in doneIds:
            logging.error("article %s has already been converted, skipping" % articleData["doi"])
            continue
        articleData["pmid"] = pmidFinder.lookupPmid(articleData)
        articleData["externalId"] = articleData["doi"]

        # convert pdf to ascii
        fileData = createFileData(articleData, "application/pdf", pdfString)
        logging.debug("converting pdf to ascii")
        pubGeneric.toAscii(fileData, "application/pdf")

        # write to output
        store.writeArticle(articleId, articleData)
        store.writeFile(articleId, (1000 * articleId) + 1, fileData,
                        externalId=articleData["externalId"])

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")
        doneIds.add(articleData["doi"])

        convCount += 1
    logging.info("Converted %d files, pdfNotFound=%d" % (convCount, pdfNotFound))
    store.close()
    idFh.close()
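# getUpdateData is called above but is not part of this excerpt. A hedged
# sketch of its assumed behavior: pull the XML member and a same-named PDF
# sibling out of a Springer update zip. The real helper presumably unpacks
# into unzipTmp (unused in this simplified sketch, kept to mirror the call
# site); the ".pdf" naming convention is an assumption.
def getUpdateDataSketch(unzipTmp, zipDir, zipFilename, filename):
    zipFile = zipfile.ZipFile(join(zipDir, zipFilename))
    xmlString = zipFile.open(filename).read()
    pdfName = splitext(filename)[0] + ".pdf"  # assumed naming convention
    try:
        pdfString = zipFile.open(pdfName).read()
    except KeyError:
        pdfString = None  # caller counts this as pdfNotFound
    return xmlString, pdfString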