Python makeBuildDir Exemples, pubGeneric.makeBuildDir Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : pubConvSpringer.py Projet : maximilianh/pubMunch

def createChunksSubmitJobs(inDir, finalOutDir, runner, chunkSize):
    """ submit jobs to convert zip and disk files from inDir to outDir
        split files into chunks and submit chunks to cluster system
        write first to temporary dir, and copy over at end of all jobs
        This is based on pubConvElsevier.py

        inDir       -- input directory; its "updates" subdirectory is listed
                       for .zip files and its "disk" subdirectory, if present,
                       is parsed on the first update only
        finalOutDir -- existing output directory (checked with mustExistDir)
        runner      -- passed through unchanged to submitJobs
        chunkSize   -- passed through to createIndexFile, must not be None

        Always returns None. If there is nothing to convert, the build
        directory is removed again and no jobs are submitted.
    """
    maxCommon.mustExistDir(finalOutDir)
    minId = pubConf.identifierStart["springer"]

    buildDir = pubGeneric.makeBuildDir(finalOutDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(finalOutDir, minId)
    assert chunkSize is not None

    # getting filenames from the disk
    # Start with an empty list so that the concatenation further down also
    # works when the disk directory is skipped (previously this left
    # inDiskFiles unbound and raised UnboundLocalError on a first run
    # without a "disk" directory).
    inDiskFiles = []
    diskDir = join(inDir, "disk")
    if int(updateId) == 0 and isdir(diskDir):
        inDiskFiles = parseDiskFnames(diskDir)
    else:
        logging.info("Not first update or no directory %s, not parsing files from springer disk" % diskDir)

    # getting filenames from the updates
    zipDir = join(inDir, "updates")
    inZipFiles = os.listdir(zipDir)
    inZipFiles = [x for x in inZipFiles if x.endswith(".zip")]
    logging.info("Found %d update zip files" % len(inZipFiles))

    # keep order of input files for first run
    if len(alreadyDoneFiles) == 0:
        processFiles = inDiskFiles + inZipFiles
    else:
        # later runs: only the zip files that were not converted yet
        processFiles = set(inZipFiles).difference(alreadyDoneFiles)

    if len(processFiles) == 0:
        logging.info("All updates done, not converting anything")
        # os.rmdir only removes an empty directory; nothing was written yet
        os.rmdir(buildDir)
        return None

    logging.info("Total number of files to convert: %d" % (len(processFiles)))

    indexFilename = join(buildDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(zipDir, processFiles, indexFilename, updateId, minId, chunkSize)

    indexSplitDir = join(buildDir, "indexFiles")
    pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)

    idFname = concatDois(finalOutDir, buildDir, "doneArticles.tab")
    submitJobs(runner, zipDir, indexSplitDir, idFname, buildDir)

    # NOTE(review): confirm this call matches the finishUp signature in use
    finishUp(buildDir, finalOutDir)

Exemple #2

0

Afficher le fichier

def createChunksSubmitJobs(inDir, finalOutDir, runner, chunkSize):
    """ submit jobs to convert zip and disk files from inDir to outDir
        split files into chunks and submit chunks to cluster system
        write first to temporary dir, and copy over at end of all jobs
        This is based on pubConvElsevier.py

        Arguments:
          inDir       - directory with an "updates" subdirectory of .zip files
                        and an optional "disk" subdirectory
          finalOutDir - output directory, has to exist already
          runner      - handed on to submitJobs
          chunkSize   - handed on to createIndexFile, may not be None

        Returns None in all cases. When no file is left to convert, the
        freshly created build directory is removed and nothing is submitted.
    """
    maxCommon.mustExistDir(finalOutDir)
    minId = pubConf.identifierStart["springer"]

    buildDir = pubGeneric.makeBuildDir(finalOutDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(finalOutDir, minId)
    assert chunkSize is not None

    # getting filenames from the disk
    diskDir = join(inDir, "disk")
    isFirstUpdate = int(updateId) == 0
    if isFirstUpdate and isdir(diskDir):
        inDiskFiles = parseDiskFnames(diskDir)
    else:
        # Bind the name in this branch as well: the first-run branch below
        # concatenates inDiskFiles and used to fail with UnboundLocalError
        # when the disk directory was missing.
        inDiskFiles = []
        logging.info("Not first update or no directory %s, not parsing files from springer disk" % diskDir)

    # getting filenames from the updates
    zipDir = join(inDir, "updates")
    inZipFiles = [x for x in os.listdir(zipDir) if x.endswith(".zip")]
    logging.info("Found %d update zip files" % len(inZipFiles))

    # keep order of input files for first run
    if len(alreadyDoneFiles) == 0:
        processFiles = inDiskFiles + inZipFiles
    else:
        # subsequent runs only look at zip files not seen before
        processFiles = set(inZipFiles).difference(alreadyDoneFiles)

    if len(processFiles) == 0:
        logging.info("All updates done, not converting anything")
        os.rmdir(buildDir)
        return None
    else:
        logging.info("Total number of files to convert: %d" % (len(processFiles)))

    indexFilename = join(buildDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(zipDir, processFiles, indexFilename, updateId, minId, chunkSize)

    indexSplitDir = join(buildDir, "indexFiles")
    pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)

    idFname = concatDois(finalOutDir, buildDir, "doneArticles.tab")
    submitJobs(runner, zipDir, indexSplitDir, idFname, buildDir)

    # NOTE(review): verify the argument list against the finishUp definition
    finishUp(buildDir, finalOutDir)

Exemple #3

0

Afficher le fichier

Fichier : pubConvSpringer.py Projet : maximilianh/pubMunch

def finishUp(finalOutDir):
    """ do the final post-batch processing

    Looks up the existing build directory for finalOutDir, calls
    pubGeneric.concatDelIdFiles and pubGeneric.concatDelLogs for the
    current update, deletes the index split directory if it exists, moves
    the build directory contents over with pubStore.moveFiles, removes the
    build directory and finally calls pubStore.appendToUpdatesTxt and
    pubStore.updateSqlite on finalOutDir.

    NOTE(review): updateId, indexSplitDir, maxArticleId and processFiles
    are not assigned in this function; they must be provided by an
    enclosing or module scope -- confirm before relying on this function.
    """
    workDir = pubGeneric.makeBuildDir(finalOutDir, mustExist=True)

    # NOTE(review): minId is looked up but not used further down
    minId = pubConf.identifierStart["springer"]

    idsName = "%d_ids.tab" % updateId
    pubGeneric.concatDelIdFiles(workDir, finalOutDir, idsName)
    logName = "%d.log" % updateId
    pubGeneric.concatDelLogs(workDir, finalOutDir, logName)

    # cleanup, move over, remove whole temp dir
    splitDirExists = isdir(indexSplitDir)  # necessary? how could it not be there?
    if splitDirExists:
        logging.info("Deleting directory %s" % indexSplitDir)
        # got sometimes exception here...
        shutil.rmtree(indexSplitDir)
    pubStore.moveFiles(workDir, finalOutDir)
    shutil.rmtree(workDir)

    # record the finished update
    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)
    pubStore.updateSqlite(finalOutDir)

Exemple #4

0

Afficher le fichier

def finishUp(finalOutDir):
    """ do the final post-batch processing

    Steps, in order: get the existing build dir of finalOutDir, run
    concatDelIdFiles and concatDelLogs from pubGeneric on it, drop the
    index split directory when present, move the build dir files to
    finalOutDir, delete the build dir, then append to the updates file and
    update sqlite through pubStore.

    NOTE(review): this function reads updateId, indexSplitDir,
    maxArticleId and processFiles without assigning them -- presumably
    they live in module scope; verify against the rest of the file.
    """
    tmpBuildDir = pubGeneric.makeBuildDir(finalOutDir, mustExist=True)

    # NOTE(review): the value of minId is never read in this function
    minId = pubConf.identifierStart["springer"]

    pubGeneric.concatDelIdFiles(
        tmpBuildDir,
        finalOutDir,
        "%d_ids.tab" % updateId,
    )
    pubGeneric.concatDelLogs(
        tmpBuildDir,
        finalOutDir,
        "%d.log" % updateId,
    )

    # cleanup, move over, remove whole temp dir
    # necessary? how could it not be there?
    if isdir(indexSplitDir):
        deleteMsg = "Deleting directory %s" % indexSplitDir
        logging.info(deleteMsg)
        shutil.rmtree(indexSplitDir)  # got sometimes exception here...

    pubStore.moveFiles(tmpBuildDir, finalOutDir)
    shutil.rmtree(tmpBuildDir)

    # bookkeeping for the finished update
    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)
    pubStore.updateSqlite(finalOutDir)