Example #1
0
def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """Index the new "*.index.gz" files of inDir and submit one conversion job per chunk.

    Steps, in order:
      1. collect the "*.index.gz" files in inDir whose basename is not among
         the already-done files returned by pubStore.parseUpdatesTab
      2. write "<updateId>_index.tab" into outDir with one row per new url
         (columns: articleId, tsvFile, url, offset)
      3. split that index into chunks in a temp dir, submit the jobs via
         submitConvertJobs, then concatenate id files and logs, move the
         results to outDir and record the update

    Args:
        inDir: directory that is searched for "*.index.gz" files.
        outDir: output directory; receives the index, ids, log and moved files.
        minId: passed to pubStore.parseUpdatesTab, which returns the first
            article id that is used here.
        runner: passed through to submitConvertJobs.
        chunkSize: passed to pubStore.splitTabFileOnChunkId.

    Returns:
        None. Returns early if there is no new input file.
    """
    tmpDir = pubGeneric.makeTempDir("bingData", tmpDir=outDir)

    maxCommon.mustExistDir(outDir)
    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    # get all .index.gz files, remove the already done files
    inFnames = glob.glob(join(inDir, "*.index.gz"))
    inBaseNames = set(basename(x) for x in inFnames)
    todoBasenames = inBaseNames - set(alreadyDoneFiles)
    todoFnames = [join(inDir, x) for x in todoBasenames]
    if not todoFnames:
        logging.info("All input files already converted")
        # remove the temp dir here as well, like the normal path does at its
        # end, so the directory made above is not left behind
        shutil.rmtree(tmpDir)
        return

    # read the input files and create one big index file
    # with article id, tsv name, url, offset
    indexFilename = join(outDir, "%d_index.tab" % updateId)
    headers = ["articleId", "tsvFile", "url", "offset"]
    numId = minId
    doneUrls = set()
    # "with" closes the index file even if a malformed input line raises
    with open(indexFilename, "w") as indexFile:
        indexFile.write("\t".join(headers))
        indexFile.write("\n")

        for fname in todoFnames:
            # the same for every line of this file, so computed only once
            tsvName = basename(fname).replace(".index.gz", ".gz")
            with gzip.open(fname) as gzFile:
                for line in gzFile:
                    # split on the LAST tab only: a url that itself contains a
                    # tab then reaches the skip-check below instead of raising
                    # on the unpacking
                    url, offset = line.rstrip("\n").rsplit("\t", 1)
                    assert offset.isdigit(), \
                        "offset %r in %s is not a number" % (offset, fname)
                    if "\t" in url or "\n" in url:
                        logging.info("tab or NL in url %s, skipping" % url)
                        continue
                    if url in doneUrls:
                        logging.info("Already did %s" % url)
                        continue
                    # remember the url, otherwise the duplicate check above
                    # can never trigger
                    doneUrls.add(url)
                    row = [str(numId), tsvName, url, offset]
                    indexFile.write("\t".join(row))
                    indexFile.write("\n")
                    numId += 1

    # split the index file into chunks, one per job
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename,
                                              tmpDir,
                                              chunkSize=chunkSize)
    idFname = pubGeneric.concatIdentifiers(outDir, tmpDir, "doneArticles.tab")
    # submit one conversion job per chunk
    submitConvertJobs(runner, inDir, updateId, chunkIds, tmpDir, idFname,
                      tmpDir)
    pubGeneric.concatDelIdFiles(tmpDir, outDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(tmpDir, outDir, "%d.log" % updateId)
    pubStore.moveFiles(tmpDir, outDir)
    shutil.rmtree(tmpDir)
    pubStore.appendToUpdatesTxt(outDir, updateId, numId, todoBasenames)
Example #2
0
def _writeUrlIndex(indexFilename, todoFnames, minId):
    """Write the url index of the gzipped files todoFnames to indexFilename.

    Every input line is expected to be "<url>TAB<offset>". Each url is written
    at most once, as a row "articleId, tsvFile, url, offset", with article ids
    counting up from minId.

    Returns:
        The next unused article id (minId plus the number of rows written).
    """
    headers = ["articleId", "tsvFile", "url", "offset"]
    numId = minId
    doneUrls = set()
    # "with" closes the index file even if a malformed input line raises
    with open(indexFilename, "w") as indexFile:
        indexFile.write("\t".join(headers))
        indexFile.write("\n")

        for fname in todoFnames:
            # the tsv name only depends on the file, not on the line
            tsvName = basename(fname).replace(".index.gz", ".gz")
            with gzip.open(fname) as gzFile:
                for line in gzFile:
                    # split at the last tab, so that a url with an embedded tab
                    # is skipped by the check below instead of breaking the
                    # two-way unpacking
                    url, offset = line.rstrip("\n").rsplit("\t", 1)
                    assert offset.isdigit(), \
                        "offset %r in %s is not a number" % (offset, fname)
                    if "\t" in url or "\n" in url:
                        logging.info("tab or NL in url %s, skipping" % url)
                        continue
                    if url in doneUrls:
                        logging.info("Already did %s" % url)
                        continue
                    # without this the set stays empty and duplicates pass
                    doneUrls.add(url)
                    row = [str(numId), tsvName, url, offset]
                    indexFile.write("\t".join(row))
                    indexFile.write("\n")
                    numId += 1
    return numId


def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """Index the new "*.index.gz" files of inDir and submit one conversion job per chunk.

    New files are those whose basename is not among the already-done files
    returned by pubStore.parseUpdatesTab. Their urls are written to
    "<updateId>_index.tab" in outDir, the index is split into chunks in a
    temp dir, the jobs are submitted via submitConvertJobs and afterwards the
    id files and logs are concatenated, the results moved to outDir and the
    update is recorded.

    Args:
        inDir: directory that is searched for "*.index.gz" files.
        outDir: output directory; receives the index, ids, log and moved files.
        minId: passed to pubStore.parseUpdatesTab, which returns the first
            article id that is used here.
        runner: passed through to submitConvertJobs.
        chunkSize: passed to pubStore.splitTabFileOnChunkId.

    Returns:
        None. Returns early if there is no new input file.
    """
    tmpDir = pubGeneric.makeTempDir("bingData", tmpDir=outDir)

    maxCommon.mustExistDir(outDir)
    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    # get all .index.gz files, remove the already done files
    inFnames = glob.glob(join(inDir, "*.index.gz"))
    inBaseNames = set(basename(x) for x in inFnames)
    todoBasenames = inBaseNames - set(alreadyDoneFiles)
    todoFnames = [join(inDir, x) for x in todoBasenames]
    if not todoFnames:
        logging.info("All input files already converted")
        # clean up like the normal path does at its end, so the temp dir
        # made above is not left behind
        shutil.rmtree(tmpDir)
        return

    # read the input files and create one big index file
    indexFilename = join(outDir, "%d_index.tab" % updateId)
    numId = _writeUrlIndex(indexFilename, todoFnames, minId)

    # split the index file into chunks, one per job
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, tmpDir, chunkSize=chunkSize)
    idFname = pubGeneric.concatIdentifiers(outDir, tmpDir, "doneArticles.tab")
    # submit one conversion job per chunk
    submitConvertJobs(runner, inDir, updateId, chunkIds, tmpDir, idFname, tmpDir)
    pubGeneric.concatDelIdFiles(tmpDir, outDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(tmpDir, outDir, "%d.log" % updateId)
    pubStore.moveFiles(tmpDir, outDir)
    shutil.rmtree(tmpDir)
    pubStore.appendToUpdatesTxt(outDir, updateId, numId, todoBasenames)
Example #3
0
def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """Convert Consyn ZIP files from inDir to outDir.

    The ".ZIP" files of inDir that are not yet listed as done are indexed, the
    index is split into chunks and the chunks are submitted via submitJobs.
    Everything is first written to the directory "build" inside outDir and
    moved to outDir after all jobs are done.

    Args:
        inDir: directory with the input ".ZIP" files.
        outDir: final output directory, must exist.
        minId: passed to pubStore.parseUpdatesTab, which returns the first
            article id that is handed to createIndexFile.
        runner: passed through to submitJobs.
        chunkSize: passed to createIndexFile; if None, the value of
            pubStore.guessChunkSize(outDir) is used.

    Returns:
        None.

    Raises:
        OSError: from os.mkdir if the build directory already exists.
    """
    maxCommon.mustExistDir(outDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    if chunkSize is None:
        chunkSize = pubStore.guessChunkSize(outDir)
    assert chunkSize is not None

    # build into temporary dir, fail if it exists
    # it should not exist, otherwise something is wrong
    finalOutDir = outDir
    buildDir = join(outDir, "build")
    os.mkdir(buildDir)

    # always a list in os.listdir order, on the first run and on later runs,
    # so both helpers below get the same type and a stable file order
    doneFiles = set(alreadyDoneFiles)
    processFiles = [x for x in os.listdir(inDir)
                    if x.endswith(".ZIP") and x not in doneFiles]

    if not processFiles:
        logging.info("All updates done, not converting anything")
        os.rmdir(buildDir)
        return None

    indexFilename = join(buildDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(inDir, processFiles, indexFilename,
                                   updateId, minId, chunkSize)
    indexSplitDir = indexFilename + ".tmp.split"
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)
    idFname = pubGeneric.concatIdentifiers(finalOutDir, indexSplitDir,
                                           "doneArticles.tab")

    submitJobs(runner, inDir, chunkIds, indexSplitDir, idFname, buildDir)

    pubGeneric.concatDelIdFiles(buildDir, finalOutDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(buildDir, finalOutDir, "%d.log" % updateId)

    # the split dir is inside buildDir; delete it before the build results
    # are moved to the final directory
    if isdir(indexSplitDir):  # necessary? how could it not be there?
        logging.info("Deleting directory %s" % indexSplitDir)
        shutil.rmtree(indexSplitDir)  # got sometimes exception here...
    pubStore.moveFiles(buildDir, finalOutDir)
    shutil.rmtree(buildDir)

    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId,
                                processFiles)
Example #4
0
def createChunksSubmitJobs(inDir, outDir, minId, runner, chunkSize):
    """Convert Consyn ZIP files from inDir to outDir.

    Splits the not yet processed ".ZIP" files into chunks and submits the
    chunks via submitJobs. Output is written to the directory "build" inside
    outDir first and moved over to outDir at the end of all jobs.

    Args:
        inDir: directory with the input ".ZIP" files.
        outDir: final output directory, must exist.
        minId: passed to pubStore.parseUpdatesTab, which returns the first
            article id that is handed to createIndexFile.
        runner: passed through to submitJobs.
        chunkSize: passed to createIndexFile; if None, the value of
            pubStore.guessChunkSize(outDir) is used.

    Returns:
        None.

    Raises:
        OSError: from os.mkdir if the build directory already exists.
    """
    maxCommon.mustExistDir(outDir)

    updateId, minId, alreadyDoneFiles = pubStore.parseUpdatesTab(outDir, minId)
    if chunkSize is None:
        chunkSize = pubStore.guessChunkSize(outDir)
    assert chunkSize is not None

    # build into temporary dir, fail if it exists
    # it should not exist, otherwise something is wrong
    finalOutDir = outDir
    buildDir = join(outDir, "build")
    os.mkdir(buildDir)

    # filter instead of a set difference: keeps the os.listdir order and
    # yields a list no matter whether earlier updates exist
    alreadyDone = set(alreadyDoneFiles)
    processFiles = []
    for fname in os.listdir(inDir):
        if fname.endswith(".ZIP") and fname not in alreadyDone:
            processFiles.append(fname)

    if not processFiles:
        logging.info("All updates done, not converting anything")
        os.rmdir(buildDir)
        return None

    indexFilename = join(buildDir, "%d_index.tab" % updateId)
    maxArticleId = createIndexFile(inDir, processFiles, indexFilename, updateId, minId, chunkSize)
    indexSplitDir = indexFilename + ".tmp.split"
    chunkIds = pubStore.splitTabFileOnChunkId(indexFilename, indexSplitDir)
    idFname = pubGeneric.concatIdentifiers(finalOutDir, indexSplitDir, "doneArticles.tab")

    submitJobs(runner, inDir, chunkIds, indexSplitDir, idFname, buildDir)

    pubGeneric.concatDelIdFiles(buildDir, finalOutDir, "%d_ids.tab" % updateId)
    pubGeneric.concatDelLogs(buildDir, finalOutDir, "%d.log" % updateId)

    # the split dir is inside buildDir; remove it first so that only build
    # results get moved to the final directory
    if isdir(indexSplitDir):  # necessary? how could it not be there?
        logging.info("Deleting directory %s" % indexSplitDir)
        shutil.rmtree(indexSplitDir)  # got sometimes exception here...
    pubStore.moveFiles(buildDir, finalOutDir)
    shutil.rmtree(buildDir)

    pubStore.appendToUpdatesTxt(finalOutDir, updateId, maxArticleId, processFiles)