# Shared imports for the two chunk converters below. pubStore, pubGeneric,
# pubCompare, pubXml and maxCommon are the project's own modules; helpers such
# as convertMicrosoft, convertHtmlToDicts, minimalHtmlToDicts, parseElsevier,
# treeToAscii_Elsevier and createFileData are defined elsewhere in their
# respective converter modules.
import gzip
import logging
import zipfile
from os.path import join, dirname, basename, splitext

import lxml.etree

import maxCommon
import pubCompare
import pubGeneric
import pubStore
import pubXml


def convertOneChunk(gzDir, idFname, inIndexFile, outFile):
    # for each row in index: open the crawled TSV, seek to the stored offset,
    # convert the HTML to text and write article + file records to the store
    store = pubStore.PubWriterFile(outFile)
    donePiis = pubGeneric.parseDoneIds(idFname)

    # log to file
    outBase = join(dirname(outFile), basename(outFile).split(".")[0])
    logFname = outBase + ".log"
    pubGeneric.setupLogging(__file__, None, logFileName=logFname)

    idFname = outBase + "_ids.tab"
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\texternalId\n")

    lastTsvFname = None
    tsvFile = None
    pmidFinder = pubCompare.PmidFinder()
    for row in maxCommon.iterTsvRows(inIndexFile, encoding=None):
        # open file and seek, if necessary
        if tsvFile is None or lastTsvFname != row.tsvFile:
            logging.debug("Seeking to %s in tsvfile %s" % (row.offset, row.tsvFile))
            tsvFile = gzip.open(join(gzDir, row.tsvFile))
            tsvFile.seek(int(row.offset))
            lastTsvFname = row.tsvFile
        line = tsvFile.readline()

        if row.url.startswith("!"):
            logging.info("Ignoring %s, marked as duplicated" % row.url)
            continue

        # fields are: ["articleId", "tsvFile", "url", "offset"]
        fields = line.split("\t")
        url = fields[0]
        logging.debug("Replacing weird bing chars")
        content = fields[-1]
        assert url == row.url
        assert len(content) != 0
        url = url.decode("utf8")

        logging.debug("Converting to text")
        content = convertMicrosoft(content)
        artDict, fileDict = convertHtmlToDicts(url, content)
        if artDict is None:
            artDict, fileDict = minimalHtmlToDicts(url, content)
        if artDict is None:
            continue
        artDict["pmid"] = pmidFinder.lookupPmid(artDict)

        # write file
        articleId = int(row.articleId)
        fileId = articleId * 1000
        store.writeFile(articleId, fileId, fileDict)
        store.writeArticle(articleId, artDict)
    store.close()
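# Usage sketch (an assumption, not part of the original module): the paths
# below are made up; the real pipeline derives chunk file names from its own
# job splitting. It only illustrates the calling convention: crawl directory
# with the gzipped TSVs, done-IDs file, per-chunk index TSV (columns
# articleId, tsvFile, url, offset, read via maxCommon.iterTsvRows above), and
# the output file written through pubStore.PubWriterFile.
#
#   convertOneChunk("crawl/bing",                 # gzDir
#                   "crawl/bing/done.ids.tab",    # idFname
#                   "crawl/bing/index_00.tab",    # inIndexFile
#                   "parts/bing_00.articles.gz")  # outFile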
def convertOneChunk(zipDir, inIndexFile, inIdFile, outFile):
    """ get the files listed in inIndexFile from their ZIP archives, parse the
    XML and write everything to outFile in ASCII format
    """
    store = pubStore.PubWriterFile(outFile)

    # read all already done IDs
    donePiis = pubGeneric.parseDoneIds(inIdFile)

    # open output id file
    idFname = join(dirname(outFile), basename(outFile).split(".")[0] + ".ids.tab")
    logging.debug("Writing ids to %s" % idFname)
    idFh = open(idFname, "w")
    idFh.write("#articleId\tdoi\texternalId\tpmid\n")

    i = 0
    inRows = list(maxCommon.iterTsvRows(inIndexFile))
    #doi2pmid = None
    convCount = 0
    skipCount = 0
    pmidFinder = pubCompare.PmidFinder()
    logging.info("Converting %d files" % len(inRows))
    for row in inRows:
        # read line
        i += 1
        articleId = row.articleId
        zipFilename, filename = row.zipFilename, row.filename
        articleId = int(articleId)

        pii = splitext(basename(filename))[0]
        if pii in donePiis:
            logging.debug("PII %s has already been converted, skipping" % pii)
            skipCount += 1
            continue
        donePiis.add(pii)

        # open file from zipfile
        fullZipPath = join(zipDir, zipFilename)
        zipFile = zipfile.ZipFile(fullZipPath)
        logging.debug("Parsing %s, file %s, %d files left" % (fullZipPath, filename, len(inRows) - i))
        #if doi2pmid == None:
        #    doi2pmid = parseDoi2Pmid(baseDir)
        xmlString = zipFile.open(filename).read()
        try:
            xmlTree = pubXml.etreeFromXml(xmlString)
        except lxml.etree.XMLSyntaxError:
            logging.error("XML parse error, skipping file %s, %s" % (zipFilename, filename))
            continue

        # parse xml
        articleData = pubStore.createEmptyArticleDict(publisher="elsevier")
        articleData = parseElsevier(xmlTree, articleData)
        if articleData is None:
            logging.warn("Parser got no data for %s" % filename)
            continue
        articleData["origFile"] = zipFilename + ":" + filename
        #if articleData["doi"] in doi2pmid:
        #    articleData["pmid"] = doi2pmid[articleData["doi"]]

        articleData["externalId"] = pii
        articleData["fulltextUrl"] = "http://www.sciencedirect.com/science/svapps/pii/" + pii
        #articleData["pmid"] = pmidFinder.lookupPmid(articleData)

        # convert to ascii
        asciiString, mimeType = treeToAscii_Elsevier(xmlTree)
        if asciiString is None:
            logging.warn("No ASCII for %s / %s" % (zipFilename, filename))
            continue
        store.writeArticle(articleId, articleData)

        # write IDs to separate file
        idRow = [str(articleData["articleId"]), articleData["doi"],
                 articleData["externalId"], str(articleData["pmid"])]
        idFh.write("\t".join(idRow))
        idFh.write("\n")

        # write to output
        fileData = createFileData(articleData, mimeType, asciiString)
        store.writeFile(articleId, (1000 * articleId) + 1, fileData,
                        externalId=articleData["externalId"])
        convCount += 1

    logging.info("Converted %d files, skipped %d" % (convCount, skipCount))
    store.close()
    idFh.close()
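# Usage sketch (an assumption, not part of the original code): this Elsevier
# variant presumably lives in a different converter module from the Bing
# variant above, which is why both functions can share the name
# convertOneChunk; note the different argument order. The paths are made up,
# the real pipeline supplies them per chunk.
#
#   convertOneChunk("elsevier/zips",                  # zipDir: ZIPs with article XML
#                   "elsevier/index_00.tab",          # inIndexFile: articleId, zipFilename, filename
#                   "elsevier/done.ids.tab",          # inIdFile: PIIs already converted
#                   "parts/elsevier_00.articles.gz")  # outFile: pubStore output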