Example #1
import logging
import shutil
from os.path import isdir

import pubConf
import pubGeneric
import pubStore


def finishUp(finalOutDir):
    """ do the final post-batch processing """
    buildDir = pubGeneric.makeBuildDir(finalOutDir, mustExist=True)

    minId = pubConf.identifierStart["springer"]  # start of the Springer identifier range; unused in this function

    # updateId, indexSplitDir, maxArticleId and processFiles are
    # module-level globals set earlier in the batch run
    pubGeneric.concatDelIdFiles(buildDir, finalOutDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(buildDir, finalOutDir, "%d.log" % updateId)

    # cleanup: remove the index-split directory, move results over, drop the temp dir
    if isdir(indexSplitDir):  # defensive: the directory may already have been removed
        logging.info("Deleting directory %s" % indexSplitDir)
        shutil.rmtree(indexSplitDir)  # has occasionally raised an exception here
    pubStore.moveFiles(buildDir, finalOutDir)
    shutil.rmtree(buildDir)

    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)
    pubStore.updateSqlite(finalOutDir)
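
finishUp reads several module-level globals rather than taking them as parameters, so a driver has to set them before calling it. A minimal sketch, assuming the function lives in a module named pubConvSpringer; the module name, paths and values below are hypothetical:

import pubConvSpringer

# wire up the globals that finishUp expects (hypothetical values)
pubConvSpringer.updateId = 3
pubConvSpringer.indexSplitDir = "/data/build/indexSplit"
pubConvSpringer.maxArticleId = 5001234567
pubConvSpringer.processFiles = ["0_00000.articles.gz"]

pubConvSpringer.finishUp("/data/springer")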
Example #2
import glob
import logging
from os.path import join

import pubConf
import pubStore


def rechunk(inDir, outDir):
    """ Read all articles in inDir and write them to outDir, potentially
    merging small chunks into bigger chunks """
    existOutFnames = glob.glob(join(outDir, "*"))
    assert len(existOutFnames) <= 1  # only one "parts" directory allowed
    artCount = 0
    chunkCount = 0
    store = None
    outFnames = []  # names of all chunk files written
    for reader in pubStore.iterPubReaders(inDir):
        for article, files in reader.iterArticlesFileList(None):
            if store is None:
                # start a new chunk file
                outFname = join(outDir, "0_%05d.articles.gz" % chunkCount)
                store = pubStore.PubWriterFile(outFname)
                outFnames.append(outFname)
                logging.debug("Writing to %s" % outFname)

            artDict = article._asdict()
            fileDicts = [fileRow._asdict() for fileRow in files]
            store.writeDocs(artDict, fileDicts)

            artCount += 1
            # close the chunk once it holds chunkArticleCount articles
            if artCount % pubConf.chunkArticleCount == 0:
                store.close()
                store = None
                chunkCount += 1

    # close a trailing, partially filled chunk
    if store is not None:
        store.close()
        chunkCount += 1

    logging.info("Created %d chunks with %d articles" % (chunkCount, artCount))
    pubStore.updateSqlite(outDir)
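
A minimal usage sketch, assuming inDir holds previously written chunk files and outDir already exists and is empty; both paths are hypothetical:

import logging

logging.basicConfig(level=logging.INFO)

# merge the many small chunks produced by a crawl into evenly sized ones;
# pubConf.chunkArticleCount controls how many articles go into each chunk
rechunk("/data/springer/parts", "/data/springer/rechunked")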