import glob
import logging
import os
import shutil
from os.path import isdir, join

import pubConf
import pubGeneric
import pubStore

# updateId, maxArticleId, processFiles and indexSplitDir are assumed to be
# module-level globals, set by the batch steps that run before finishUp()

def finishUp(finalOutDir):
    " do the final post-batch processing "
    buildDir = pubGeneric.makeBuildDir(finalOutDir, mustExist=True)

    minId = pubConf.identifierStart["springer"]  # XXX assigned but never used
    pubGeneric.concatDelIdFiles(buildDir, finalOutDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(buildDir, finalOutDir, "%d.log" % updateId)

    # cleanup: remove the index splitting directory, move results over,
    # then remove the whole temp build dir
    if isdir(indexSplitDir):  # may already be gone if a previous run cleaned up
        logging.info("Deleting directory %s" % indexSplitDir)
        shutil.rmtree(indexSplitDir)  # XXX this has occasionally raised an exception
    pubStore.moveFiles(buildDir, finalOutDir)
    shutil.rmtree(buildDir)

    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)
    pubStore.updateSqlite(finalOutDir)
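# The two concat-and-delete calls above collapse the per-job id and log files
# from the build directory into a single file each in finalOutDir. A minimal
# sketch of that pattern, assuming plain-text inputs; concatAndDelete() is a
# hypothetical helper shown for illustration, not the actual pubGeneric API:

def concatAndDelete(inFnames, outFname):
    " concatenate text files into outFname and remove each input afterwards "
    with open(outFname, "w") as ofh:
        for inFname in inFnames:
            with open(inFname) as ifh:
                shutil.copyfileobj(ifh, ofh)
            os.remove(inFname)

# usage, with hypothetical paths:
#   concatAndDelete(glob.glob(join(buildDir, "*_ids.tab")), join(finalOutDir, "0_ids.tab"))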
def rechunk(inDir, outDir):
    """ read everything in inDir and rewrite it to outDir,
    potentially merging small chunks into bigger chunks """
    existOutFnames = glob.glob(join(outDir, "*"))
    assert len(existOutFnames) <= 1  # only one "parts" directory allowed

    artCount = 0
    chunkCount = 0
    store = None
    outFnames = []
    for reader in pubStore.iterPubReaders(inDir):
        for article, files in reader.iterArticlesFileList(None):
            # open a new output chunk if the previous one was closed
            if store is None:
                outFname = join(outDir, "0_%05d.articles.gz" % chunkCount)
                store = pubStore.PubWriterFile(outFname)
                logging.debug("Writing to %s" % outFname)
            artDict = article._asdict()
            fileDicts = [fileRow._asdict() for fileRow in files]
            store.writeDocs(artDict, fileDicts)
            artCount += 1
            # close the chunk once it holds chunkArticleCount articles
            if artCount % pubConf.chunkArticleCount == 0:
                store.close()
                store = None
                chunkCount += 1

    if artCount % pubConf.chunkArticleCount != 0:
        # the last chunk is only partially filled and still open
        outFnames.append(outFname)
    if store is not None:
        store.close()
        chunkCount += 1
    logging.info("Created %d chunks with %d articles" % (chunkCount, artCount))
    pubStore.updateSqlite(outDir)
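# Example of how the chunking arithmetic in rechunk() plays out, assuming a
# chunk size of 3 (picked purely for illustration; the real value comes from
# pubConf.chunkArticleCount): 7 articles yield
#   0_00000.articles.gz (3 articles)
#   0_00001.articles.gz (3 articles)
#   0_00002.articles.gz (1 article, the partial chunk closed after the loop)
#
# minimal usage sketch, with hypothetical paths:
#   rechunk("/data/text/springer/parts", "/data/text/springer")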