import sys, os, gzip, logging, random, marshal
from os.path import abspath, basename, dirname, isfile, join, splitext
# (pubGeneric, maxCommon and the module-level helpers getAlg, writeParamDict,
# splitList, writeParts and MAPREDUCEEXT are assumed to be defined elsewhere
# in this module)

def submitProcessRow(runner, algName, inDir, outDir, paramDict):
    " submits one cluster job per input .tab.gz file; each job re-runs this script with the processRow command "
    inFnames = pubGeneric.findFiles(inDir, [".tab.gz"])
    paramFname = join(runner.batchDir, "pubAlgParams.marshal.gz")
    writeParamDict(paramDict, paramFname)

    for relDir, fname in inFnames:
        fname = abspath(fname)
        outFname = join(abspath(outDir), basename(fname))
        command = "%s %s %s %s %s {check out exists %s} %s" % \
            (sys.executable, __file__, algName, "processRow", fname, outFname, paramFname)
        runner.submit(command)
    runner.finish()
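# Illustrative only: a minimal sketch of driving submitProcessRow(), assuming
# a maxRun-style runner object that provides batchDir, submit() and finish();
# the algorithm name, paths and parameters below are hypothetical.
def _exampleSubmitProcessRow(runner):
    " submit one processRow job per .tab.gz chunk under /data/in, writing to /data/out "
    submitProcessRow(runner, "myAlg", "/data/in", "/data/out", {"minScore": "0.5"})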
def submitCombine(runner, algName, mapReduceDir, outExt, paramDict, pieceCount):
    " submits combiner jobs: they get a list of dicts and output a single dict "
    # note: outExt is currently unused
    inFnames = pubGeneric.findFiles(mapReduceDir, [MAPREDUCEEXT])
    random.shuffle(inFnames)
    parts = splitList(inFnames, pieceCount)
    partFnames = writeParts(parts, runner.batchDir)

    paramFname = join(runner.batchDir, "mapReduceParams.marshal.gz")
    writeParamDict(paramDict, paramFname)

    for fname in partFnames:
        inBase = splitext(basename(fname))[0]
        outFullname = join(mapReduceDir, inBase + ".combined." + MAPREDUCEEXT.replace(".gz", ""))
        command = "%s %s %s %s %s {check out exists %s} %s" % \
            (sys.executable, __file__, algName, "combine", fname, outFullname, paramFname)
        runner.submit(command)
    runner.finish()
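# A hypothetical combiner matching the contract described in the docstring
# above ("get a list of dicts and output a single dict"); the method name and
# signature that the combine jobs actually expect are not shown in this
# section, so this is only a sketch:
class _ExampleCombiner:
    def combine(self, dictList):
        " merge a list of per-node count dicts into a single summed dict "
        merged = {}
        for d in dictList:
            for key, count in d.iteritems():
                merged[key] = merged.get(key, 0) + count
        return merged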
def concatFiles(inDir, outFname):
    " concat all .tab.gz files in inDir and write to outFname, keeping only the first file's header line "
    logging.info("Looking for tab.gz files in %s" % inDir)
    inFnames = pubGeneric.findFiles(inDir, [".tab.gz"])
    ofh = open(outFname, "w")
    pm = maxCommon.ProgressMeter(len(inFnames))
    logging.info("Concatenating...")
    fno = 0
    for relDir, fn in inFnames:
        lno = 0
        for line in gzip.open(fn):
            # write the header line only once, from the first file
            if lno == 0 and fno == 0:
                ofh.write(line)
            if lno != 0:
                ofh.write(line)
            lno += 1
        pm.taskCompleted()
        fno += 1
    ofh.close()
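# Illustrative only: merging per-chunk annotation tables into one plain-text
# table (the paths are hypothetical):
def _exampleConcat():
    concatFiles("/data/out", "/data/all.tab")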
def runReduce(algName, paramDict, path, outFilename, quiet=False, inFnames=None):
    """ parse marshalled dicts from path, run them through the reduce function
    of alg and write the output to one file """
    if outFilename != None and isfile(outFilename):
        logging.info("deleting existing file %s" % outFilename)
        os.remove(outFilename)

    if isinstance(algName, basestring):
        alg = getAlg(algName, defClass="Map")
    else:
        alg = algName

    if "reduce" not in dir(alg):
        logging.error("There is no reduce() function in %s" % algName)
        sys.exit(1)

    if "startup" in dir(alg):
        alg.startup(paramDict, {})

    if inFnames != None:
        infiles = inFnames
    elif isfile(path):
        logging.debug("Filename specified, running only on a single file (debugging)")
        infiles = [(dirname(path), path)]
    else:
        infiles = pubGeneric.findFiles(path, [MAPREDUCEEXT])

    if len(infiles) == 0:
        logging.error("Could not find any %s files in %s" % (MAPREDUCEEXT, path))
        sys.exit(1)

    # read the marshalled map output into one data dict
    data = {}
    fileCount = 0
    logging.info("Reading map output")
    meter = maxCommon.ProgressMeter(len(infiles), quiet=quiet, stepCount=100)
    for relDir, fileName in infiles:
        binData = gzip.open(fileName, "rb").read()
        nodeData = marshal.loads(binData)
        del binData
        for key, values in nodeData.iteritems():
            if not hasattr(values, "__iter__"):
                values = [values]
            # major change: append instead of extend
            # will break existing mr-scripts
            data.setdefault(key, []).append(values)
        fileCount += 1
        logging.debug("Reading " + fileName)
        meter.taskCompleted()

    logging.info("Writing to %s" % outFilename)
    if outFilename == None:
        ofh = None
    elif outFilename == "stdout":
        ofh = sys.stdout
    else:
        ofh = open(outFilename, "w")

    if "headers" in dir(alg) and ofh != None:
        ofh.write("\t".join(alg.headers))
        ofh.write("\n")

    if "reduceStartup" in dir(alg):
        logging.info("Running reduceStartup")
        alg.reduceStartup(data, paramDict, ofh)

    logging.info("Running data through reducer")
    meter = maxCommon.ProgressMeter(len(data))
    for key, valList in data.iteritems():
        for row in alg.reduce(key, valList):
            if row == None:
                logging.debug("Got None, not writing anything")
                continue
            if type(row) == bytes:
                # make sure that the returned value is a list
                row = [row]
            if type(row) == int:
                # make sure that it is a string
                row = [str(row)]
            row = [unicode(x).encode("utf8") for x in row]  # convert to utf8
            if ofh != None:
                ofh.write("\t".join(row))
                ofh.write("\n")
        meter.taskCompleted()

    if ofh != None:
        ofh.close()

    if "reduceEnd" in dir(alg):
        logging.info("Running reduceEnd")
        alg.reduceEnd(data)
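# A minimal example of the algorithm interface that runReduce() exercises
# above (illustrative only, not part of this module): data maps each key to a
# list of per-node value lists, reduce() yields one output row at a time, and
# headers supplies the column names written to the output file.
class _ExampleCountReducer:
    headers = ["key", "valueCount"]

    def reduce(self, key, valList):
        # valList holds one list of values per map output file, see the
        # append-instead-of-extend note in runReduce()
        yield key, sum(len(values) for values in valList)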