Code Example #1
File: pubAlg.py Project: maximilianh/pubMunch
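These snippets are excerpts from pubAlg.py and omit the file's module-level imports. As a rough sketch, the functions below rely on something like the following (the value of MAPREDUCEEXT is an assumption, not shown on this page):

import sys, os, gzip, random, marshal, logging
from os.path import join, abspath, basename, dirname, splitext, isfile
import pubGeneric, maxCommon  # project-internal helper modules from pubMunch
MAPREDUCEEXT = ".marshal.gz"  # assumed: extension of the intermediate map output files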
def submitProcessRow(runner, algName, inDir, outDir, paramDict):
    " submits one cluster job per input .tab.gz file; each job runs the processRow function of the given algorithm "
    inFnames = pubGeneric.findFiles(inDir, [".tab.gz"])
    paramFname = join(runner.batchDir, "pubAlgParams.marshal.gz")
    writeParamDict(paramDict, paramFname)

    for relDir, fname in inFnames:
        fname = abspath(fname)
        outFname = join(abspath(outDir), basename(fname))
        command = "%s %s %s %s %s {check out exists %s} %s" % \
            (sys.executable, __file__, algName, "processRow", fname, outFname, paramFname)
        runner.submit(command)
    runner.finish()
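The runner object is not defined in this excerpt; the code above only requires a batchDir attribute plus submit() and finish() methods. A minimal sketch of that interface, using a hypothetical DummyRunner that merely collects the commands (a real runner would hand them to a cluster system such as parasol, which interprets the embedded "{check out exists ...}" token):

class DummyRunner(object):
    " hypothetical stand-in for the cluster runner interface used above "
    def __init__(self, batchDir):
        self.batchDir = batchDir  # param and job files get written here
        self.commands = []
    def submit(self, command):
        # a real runner would queue the command on a cluster; we just record it
        self.commands.append(command)
    def finish(self):
        print "%d jobs collected" % len(self.commands)

# hypothetical call:
# runner = DummyRunner("/tmp/batch")
# submitProcessRow(runner, "myAlg", "inDir/", "outDir/", {"minScore" : 1})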
Code Example #2
File: pubAlg.py Project: maximilianh/pubMunch
def submitCombine(runner, algName, mapReduceDir, outExt, paramDict, pieceCount):
    " submits combiner jobs: they get a list of dicts and output a single dict "
    inFnames = pubGeneric.findFiles(mapReduceDir, [MAPREDUCEEXT])
    random.shuffle(inFnames)
    parts = splitList(inFnames, pieceCount)
    partFnames = writeParts(parts, runner.batchDir)

    paramFname = join(runner.batchDir, "mapReduceParams.marshal.gz")
    writeParamDict(paramDict, paramFname)
    for fname in partFnames:
        inBase   = splitext(basename(fname))[0]
        outFullname = join(mapReduceDir, inBase+".combined."+MAPREDUCEEXT.replace(".gz",""))
        command = "%s %s %s %s %s {check out exists %s} %s" % \
                (sys.executable, __file__, algName, "combine", fname, outFullname, paramFname)
        runner.submit(command)
    runner.finish()
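The helpers splitList and writeParts are not shown on this page. A plausible sketch, assuming splitList distributes a list over pieceCount roughly equal chunks and writeParts writes each chunk (one input filename per line) to a text file in batchDir and returns the filenames:

from os.path import join

def splitList(allItems, pieceCount):
    " distribute allItems over pieceCount chunks of roughly equal size "
    chunks = [[] for _ in range(pieceCount)]
    for i, item in enumerate(allItems):
        chunks[i % pieceCount].append(item)
    return [c for c in chunks if len(c)!=0]

def writeParts(parts, batchDir):
    " write one text file per chunk, one input filename per line, return the filenames "
    partFnames = []
    for i, part in enumerate(parts):
        partFname = join(batchDir, "part%05d.txt" % i)
        ofh = open(partFname, "w")
        for relDir, fname in part:
            ofh.write(fname+"\n")
        ofh.close()
        partFnames.append(partFname)
    return partFnames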
Code Example #3
File: pubAlg.py Project: maximilianh/pubMunch
def concatFiles(inDir, outFname):
    " concat all files in outDir and write to outFname. "
    logging.info("Looking for tab.gz files in %s" % inDir)
    inFnames = pubGeneric.findFiles(inDir, [".tab.gz"])
    ofh = open(outFname, "w")
    pm = maxCommon.ProgressMeter(len(inFnames))
    logging.info("Concatting...")
    fno = 0
    for relDir, fn in inFnames:
        lno = 0
        for line in gzip.open(fn):
            if lno==0 and fno==0:
                # write the header line, but only from the very first file
                ofh.write(line)
            if lno!=0:
                # write all non-header lines from every file
                ofh.write(line)
            lno += 1
        pm.taskCompleted()
        fno += 1
    ofh.close()
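Since only the first file's header is kept, all input files must share the same column layout. A small usage sketch with hypothetical paths (assuming pubGeneric and maxCommon are importable):

import gzip, os

os.mkdir("demo")
for i in range(2):
    ofh = gzip.open("demo/part%d.tab.gz" % i, "w")
    ofh.write("id\tvalue\n")           # identical header line in every file
    ofh.write("%d\trow%d\n" % (i, i))  # one data row per file
    ofh.close()

concatFiles("demo", "all.tab")
# all.tab now holds one header line followed by both data rows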
Code Example #4
File: pubAlg.py Project: maximilianh/pubMunch
def runReduce(algName, paramDict, path, outFilename, quiet=False, inFnames=None):
    """ parse pickled dicts from path, run through reduce function of alg and
    write output to one file """

    if outFilename!=None and isfile(outFilename):
        logging.info("deleting existing file %s" % outFilename)
        os.remove(outFilename)

    if isinstance(algName, basestring):
        alg = getAlg(algName, defClass="Map")
    else:
        alg = algName

    if "map" not in dir(alg):
        logging.error("There is not map() function in %s" % algName)
        sys.exit(1)

    if "startup" in dir(alg):
        alg.startup(paramDict, {})

    if inFnames!=None:
        infiles = inFnames
    elif isfile(path):
        logging.debug("Filename specified, running only on a single file (debugging)")
        infiles = [(dirname(path), path)]
    else:
        infiles = pubGeneric.findFiles(path, [MAPREDUCEEXT])

    if len(infiles)==0:
        logging.error("Could not find any %s files in %s" % (MAPREDUCEEXT, path))
        sys.exit(1)

    # read marshal files into the data dict
    data = {}
    fileCount = 0
    logging.info("Reading map output")
    meter = maxCommon.ProgressMeter(len(infiles), quiet=quiet, stepCount=100)
    for relDir, fileName in infiles:
        binData = gzip.open(fileName, "rb").read()
        nodeData = marshal.loads(binData)
        del binData
        for key, values in nodeData.iteritems():
            if not hasattr(values, "__iter__"):
                values = [values]
            # major change: append instead of extend
            # will break existing mr-scripts
            data.setdefault(key, []).append(values)
        fileCount += 1
        logging.debug("Reading "+fileName)
        meter.taskCompleted()

    logging.info("Writing to %s" % outFilename)
    if outFilename==None:
        ofh = None
    elif outFilename=="stdout":
        ofh = sys.stdout
    else:
        ofh = open(outFilename, "w")

    if "headers" in dir(alg) and ofh!=None:
        ofh.write("\t".join(alg.headers))
        ofh.write("\n")

    if "reduceStartup" in dir(alg):
        logging.info("Running reduceStartup")
        alg.reduceStartup(data, paramDict, ofh)

    logging.info("Running data through reducer")
    meter = maxCommon.ProgressMeter(len(data))
    for key, valList in data.iteritems():
        tupleIterator = alg.reduce(key, valList)
        for row in tupleIterator:
            if row==None:
                logging.debug("Got None, not writing anything")
                continue
            if type(row)==bytes: # a single string (bytes==str in Python 2): wrap it into a list
                row = [row]
            if type(row)==int: # a single integer: convert to a one-element string list
                row = [str(row)]
            row = [unicode(x).encode("utf8") for x in row] # convert all fields to utf8
            if ofh!=None:
                ofh.write("\t".join(row))
                ofh.write("\n")
        meter.taskCompleted()
    if ofh!=None:
        ofh.close()

    if "reduceEnd" in dir(alg):
        logging.info("Running reduceEnd")
        alg.reduceEnd(data)
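runReduce accepts either an algorithm name or an already-instantiated object and probes for the optional hooks (startup, headers, reduceStartup, reduceEnd) with dir(). A minimal sketch of that interface, assuming reduce() is a generator that yields one output row (a list of fields) per key; the map() stub exists only to pass the check at the top of runReduce, its real signature is not shown on this page:

class WordCountAlg(object):
    " hypothetical reducer: sums the per-chunk counts produced by the map phase "
    headers = ["word", "count"]

    def map(self, *args):
        # stub: present only to satisfy runReduce's dir() check; the map
        # phase runs elsewhere and its signature is not shown on this page
        pass

    def reduce(self, key, valList):
        # valList is a list of per-chunk value lists (see the append/extend
        # comment in runReduce above); yield one output row per key
        total = 0
        for values in valList:
            total += sum(values)
        yield [key, total]

# hypothetical call, reading map output from mapDir/ and writing a table:
# runReduce(WordCountAlg(), {}, "mapDir/", "wordCounts.tab")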