def hadoopIterate(self, inputHdfsDirectory, outputHdfsDirectory, iteration=0, numberOfReducers=1, cmdenv=None, loggingLevel=logging.WARNING, overwrite=True, verbose=True):
    """Run a Hadoop iteration, wait for it to finish, and collect the results.

    This function builds and submits mapper and reducer scripts
    containing the MapReduceApplication as a serialized class, which
    are unserialized remotely.

    @type inputHdfsDirectory: string
    @param inputHdfsDirectory: The name of the HDFS directory to use as input.  It should contain SequenceFiles generated by C{hadoopPopulate}.
    @type outputHdfsDirectory: string
    @param outputHdfsDirectory: The name of the HDFS directory to use as output.  If it exists and C{overwrite} is True, it will be overwritten.
    @type iteration: int
    @param iteration: The iteration number.
    @type numberOfReducers: int
    @param numberOfReducers: Desired number of reducers.
    @type cmdenv: dict or None
    @param cmdenv: Environment variables to pass to the mapper and reducer processes.
    @type loggingLevel: logging level
    @param loggingLevel: The level of log output that will go to Hadoop's standard error.
    @type overwrite: bool
    @param overwrite: If C{outputHdfsDirectory} exists and this is True, the contents will be overwritten.
    @type verbose: bool
    @param verbose: If True, let Hadoop print its output to C{sys.stdout}.
    @rtype: 2-tuple of list, dict
    @return: List of output records and dictionary of output key-value pairs.
    @raise IOError: If any I/O related error occurs, this function raises an error.
    """
    # Verify that the Hadoop executable and configuration are usable
    # before doing any real work.
    self._hadoopCheck()

    # Optionally remove a pre-existing output directory.
    # "fs -test -e" returns 0 when the path exists; "fs -rmr" removes it
    # recursively.
    if overwrite:
        returncode, stdout, stderr = self._hadoopCall(None, "fs", "-test", "-e", outputHdfsDirectory)
        if returncode == 0:
            returncode, stdout, stderr = self._hadoopCall(None, "fs", "-rmr", outputHdfsDirectory)
            if returncode != 0:
                raise IOError("Could not remove path \"%s\": %s" % (outputHdfsDirectory, stderr))

    # Pick the application for this iteration; iterations past the end of
    # the list reuse the last registered application.
    if iteration < len(self.mapRedApps):
        mapRedApp = self.mapRedApps[iteration]
        mapRedAppSerialized = self.mapRedAppsSerialized[iteration]
    else:
        mapRedApp = self.mapRedApps[-1]
        mapRedAppSerialized = self.mapRedAppsSerialized[-1]

    gatherOutput = mapRedApp.gatherOutput
    imports = mapRedApp.imports
    if imports is None:
        imports = {}
    # Local namespace used later to unserialize the class on this side.
    namespace = self._buildNamespace(imports)

    # Extra files to ship with the streaming job (one "-file" each).
    files = mapRedApp.files
    if files is None:
        files = []
    if cmdenv is None:
        cmdenv = {}

    self.done = False

    # Snapshot the metadata: it is pickled into the remote scripts and is
    # also the fallback value of self.metadata when no output is gathered.
    startMetadata = copy.deepcopy(self.metadata)
    overrideAttributes = {"metadata": startMetadata, "iteration": iteration}
    overrideAttributesString = pickle.dumps(overrideAttributes, protocol=PICKLE_PROTOCOL)

    # The generated scripts begin with two source files shipped alongside
    # this module in the Augustus distribution.
    template = os.path.join(os.path.split(__file__)[0], "MapReduceTemplate.py")
    if not os.path.exists(template):
        raise IOError("Could not find %s in the Augustus distribution" % template)
    template = open(template).read()

    application = os.path.join(os.path.split(__file__)[0], "MapReduceApplication.py")
    if not os.path.exists(application):
        raise IOError("Could not find %s in the Augustus distribution" % application)
    application = open(application).read()

    # ------------------------------------------------------------------
    # Build the mapper script: template + application source + the user's
    # import requests + bootstrap code that unpickles the override
    # attributes and unserializes the application class remotely.
    # delete=False because Hadoop must read the file after it is closed;
    # it is removed explicitly with os.remove below.
    # ------------------------------------------------------------------
    mapperScript = tempfile.NamedTemporaryFile(delete=False)
    mapperScript.write(template)
    mapperScript.write(application)
    # imports maps module name -> None (plain import), string (import-as),
    # dict (from-import-as), or other iterable (from-import).
    for name, value in imports.items():
        if value is None:
            mapperScript.write("import %s%s" % (name, os.linesep))
        elif isinstance(value, basestring):
            mapperScript.write("import %s as %s%s" % (name, value, os.linesep))
        else:
            if hasattr(value, "items"):
                for n, v in value.items():
                    mapperScript.write("from %s import %s as %s%s" % (name, n, v, os.linesep))
            else:
                for v in value:
                    mapperScript.write("from %s import %s%s" % (name, v, os.linesep))
    # Anything the remote process prints must not corrupt the typedbytes
    # stream on stdout, so stdout is redirected to stderr.
    mapperScript.write("sys.stdout = sys.stderr%s" % os.linesep)
    mapperScript.write("logging.basicConfig(level=%d)%s" % (loggingLevel, os.linesep))
    mapperScript.write("logger = logging.getLogger(\"%s\")%s" % (self.loggerName, os.linesep))
    # %r embeds the pickle as a quoted string literal in the script.
    mapperScript.write("overrideAttributesString = %r%s" % (overrideAttributesString, os.linesep))
    mapperScript.write("overrideAttributes = pickle.loads(overrideAttributesString)%s" % os.linesep)
    # "emit" is expected to be defined by the template on the remote side.
    mapperScript.write("overrideAttributes[\"emit\"] = emit%s" % os.linesep)
    mapperScript.write("overrideAttributes[\"logger\"] = logging.getLogger(\"%s\")%s" % (mapRedApp.loggerName, os.linesep))
    # A no-op stand-in for PerformanceTable: remote workers do not collect
    # performance statistics.
    # NOTE(review): indentation inside these generated-code literals was
    # reconstructed (4/8 spaces) so the emitted script is valid Python;
    # confirm against the upstream source.
    mapperScript.write("class FakePerformanceTable(object):%s" % os.linesep)
    mapperScript.write("    def __init__(self):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def __repr__(self):%s" % os.linesep)
    mapperScript.write("        return \"<FakePerformanceTable at 0x%%x>\" %% id(self)%s" % os.linesep)
    mapperScript.write("    def absorb(self, performanceTable):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def begin(self, key):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def end(self, key):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def pause(self, key):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def unpause(self, key):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def block(self):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def unblock(self):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("overrideAttributes[\"performanceTable\"] = FakePerformanceTable()%s" % os.linesep)
    mapperScript.write("mapRedAppSerialized = %r%s" % (mapRedAppSerialized, os.linesep))
    mapperScript.write("appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, globals())%s" % os.linesep)
    mapperScript.write("controller = Controller(appClass())%s" % os.linesep)
    mapperScript.write("controller.mapper()%s" % os.linesep)
    mapperScript.close()

    # ------------------------------------------------------------------
    # Build the reducer script: identical to the mapper script except the
    # final call is controller.reducer().
    # ------------------------------------------------------------------
    reducerScript = tempfile.NamedTemporaryFile(delete=False)
    reducerScript.write(template)
    reducerScript.write(application)
    for name, value in imports.items():
        if value is None:
            reducerScript.write("import %s%s" % (name, os.linesep))
        elif isinstance(value, basestring):
            reducerScript.write("import %s as %s%s" % (name, value, os.linesep))
        else:
            if hasattr(value, "items"):
                for n, v in value.items():
                    reducerScript.write("from %s import %s as %s%s" % (name, n, v, os.linesep))
            else:
                for v in value:
                    reducerScript.write("from %s import %s%s" % (name, v, os.linesep))
    reducerScript.write("sys.stdout = sys.stderr%s" % os.linesep)
    reducerScript.write("logging.basicConfig(level=%d)%s" % (loggingLevel, os.linesep))
    reducerScript.write("logger = logging.getLogger(\"%s\")%s" % (self.loggerName, os.linesep))
    reducerScript.write("overrideAttributesString = %r%s" % (overrideAttributesString, os.linesep))
    reducerScript.write("overrideAttributes = pickle.loads(overrideAttributesString)%s" % os.linesep)
    reducerScript.write("overrideAttributes[\"emit\"] = emit%s" % os.linesep)
    reducerScript.write("overrideAttributes[\"logger\"] = logging.getLogger(\"%s\")%s" % (mapRedApp.loggerName, os.linesep))
    reducerScript.write("class FakePerformanceTable(object):%s" % os.linesep)
    reducerScript.write("    def __init__(self):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def __repr__(self):%s" % os.linesep)
    reducerScript.write("        return \"<FakePerformanceTable at 0x%%x>\" %% id(self)%s" % os.linesep)
    reducerScript.write("    def absorb(self, performanceTable):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def begin(self, key):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def end(self, key):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def pause(self, key):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def unpause(self, key):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def block(self):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def unblock(self):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("overrideAttributes[\"performanceTable\"] = FakePerformanceTable()%s" % os.linesep)
    reducerScript.write("mapRedAppSerialized = %r%s" % (mapRedAppSerialized, os.linesep))
    reducerScript.write("appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, globals())%s" % os.linesep)
    reducerScript.write("controller = Controller(appClass())%s" % os.linesep)
    reducerScript.write("controller.reducer()%s" % os.linesep)
    reducerScript.close()

    # None means "inherit this process's stdout" (Hadoop output visible).
    if verbose:
        stdout = None
    else:
        stdout = subprocess.PIPE

    # Interleave flag/value pairs: ["-file", f1, "-file", f2, ...] and
    # ["-cmdenv", "N=V", ...] via extended-slice assignment.
    fileargs = ["-file"] * (2 * len(files))
    fileargs[1::2] = files
    envargs = ["-cmdenv"] * (2 * len(cmdenv))
    envargs[1::2] = ["%s=%s" % (n, v) for n, v in cmdenv.items()]

    # Submit the streaming job and block until it finishes.
    process = subprocess.Popen([self.HADOOP_EXECUTABLE, "jar", self.HADOOP_STREAMING_JAR,
                                "-D", "mapred.reduce.tasks=%d" % numberOfReducers,
                                "-D", "stream.map.output=typedbytes",
                                "-D", "stream.reduce.input=typedbytes",
                                "-D", "stream.reduce.output=typedbytes",
                                "-inputformat", "org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat",
                                "-outputformat", "org.apache.hadoop.mapred.SequenceFileOutputFormat",
                                "-input", "%s/*" % inputHdfsDirectory,
                                "-output", outputHdfsDirectory,
                                "-mapper", mapperScript.name,
                                "-reducer", reducerScript.name,
                                "-file", mapperScript.name,
                                "-file", reducerScript.name] + fileargs + envargs,
                               stdout=stdout)
    process.wait()

    # Best-effort cleanup of the temporary scripts (delete=False above).
    try:
        os.remove(mapperScript.name)
    except OSError:
        pass
    try:
        os.remove(reducerScript.name)
    except OSError:
        pass

    if process.returncode != 0:
        raise RuntimeError("Hadoop streaming failed")

    if gatherOutput:
        # Pull the results back from HDFS and give the application a
        # chance to declare convergence via endIteration.
        outputRecords, outputKeyValues = self.hadoopGather(outputHdfsDirectory)
        overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": None, "logger": logging.getLogger(mapRedApp.loggerName)}
        appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
        appInstance = appClass()
        if appInstance.endIteration(outputRecords, outputKeyValues):
            self.done = True
        # Adopt whatever metadata endIteration produced.
        self.metadata = appInstance.metadata
        return outputRecords, outputKeyValues
    else:
        # Nothing gathered: restore the pre-iteration metadata snapshot.
        self.metadata = startMetadata
        return [], {}
def hadoopIterate(self, inputHdfsDirectory, outputHdfsDirectory, iteration=0, numberOfReducers=1, cmdenv=None, loggingLevel=logging.WARNING, overwrite=True, verbose=True):
    """Run a Hadoop iteration, wait for it to finish, and collect the results.

    This function builds and submits mapper and reducer scripts
    containing the MapReduceApplication as a serialized class, which
    are unserialized remotely.

    @type inputHdfsDirectory: string
    @param inputHdfsDirectory: The name of the HDFS directory to use as input.  It should contain SequenceFiles generated by C{hadoopPopulate}.
    @type outputHdfsDirectory: string
    @param outputHdfsDirectory: The name of the HDFS directory to use as output.  If it exists and C{overwrite} is True, it will be overwritten.
    @type iteration: int
    @param iteration: The iteration number.
    @type numberOfReducers: int
    @param numberOfReducers: Desired number of reducers.
    @type cmdenv: dict or None
    @param cmdenv: Environment variables to pass to the mapper and reducer processes.
    @type loggingLevel: logging level
    @param loggingLevel: The level of log output that will go to Hadoop's standard error.
    @type overwrite: bool
    @param overwrite: If C{outputHdfsDirectory} exists and this is True, the contents will be overwritten.
    @type verbose: bool
    @param verbose: If True, let Hadoop print its output to C{sys.stdout}.
    @rtype: 2-tuple of list, dict
    @return: List of output records and dictionary of output key-value pairs.
    @raise IOError: If any I/O related error occurs, this function raises an error.
    """
    # Fail fast if the Hadoop executable/configuration is unavailable.
    self._hadoopCheck()

    # Remove any existing output directory when overwrite is requested;
    # "fs -test -e" exits 0 iff the path exists.
    if overwrite:
        returncode, stdout, stderr = self._hadoopCall(None, "fs", "-test", "-e", outputHdfsDirectory)
        if returncode == 0:
            returncode, stdout, stderr = self._hadoopCall(None, "fs", "-rmr", outputHdfsDirectory)
            if returncode != 0:
                raise IOError("Could not remove path \"%s\": %s" % (outputHdfsDirectory, stderr))

    # Applications are indexed by iteration; the last one repeats forever.
    if iteration < len(self.mapRedApps):
        mapRedApp = self.mapRedApps[iteration]
        mapRedAppSerialized = self.mapRedAppsSerialized[iteration]
    else:
        mapRedApp = self.mapRedApps[-1]
        mapRedAppSerialized = self.mapRedAppsSerialized[-1]

    gatherOutput = mapRedApp.gatherOutput
    imports = mapRedApp.imports
    if imports is None:
        imports = {}
    # Namespace used for the local (post-job) unserialization only.
    namespace = self._buildNamespace(imports)

    files = mapRedApp.files
    if files is None:
        files = []
    if cmdenv is None:
        cmdenv = {}

    self.done = False

    # Deep-copy the metadata: this snapshot travels (pickled) to the remote
    # workers and is restored locally if no output is gathered.
    startMetadata = copy.deepcopy(self.metadata)
    overrideAttributes = {"metadata": startMetadata, "iteration": iteration}
    overrideAttributesString = pickle.dumps(overrideAttributes, protocol=PICKLE_PROTOCOL)

    # Both generated scripts are prefixed with these two distribution files.
    template = os.path.join(os.path.split(__file__)[0], "MapReduceTemplate.py")
    if not os.path.exists(template):
        raise IOError("Could not find %s in the Augustus distribution" % template)
    template = open(template).read()

    application = os.path.join(os.path.split(__file__)[0], "MapReduceApplication.py")
    if not os.path.exists(application):
        raise IOError("Could not find %s in the Augustus distribution" % application)
    application = open(application).read()

    # --- mapper script -------------------------------------------------
    # delete=False: Hadoop must be able to read the closed file; it is
    # removed manually after the job completes.
    mapperScript = tempfile.NamedTemporaryFile(delete=False)
    mapperScript.write(template)
    mapperScript.write(application)
    # imports: name -> None | alias string | {name: alias} | [names]
    for name, value in imports.items():
        if value is None:
            mapperScript.write("import %s%s" % (name, os.linesep))
        elif isinstance(value, basestring):
            mapperScript.write("import %s as %s%s" % (name, value, os.linesep))
        else:
            if hasattr(value, "items"):
                for n, v in value.items():
                    mapperScript.write("from %s import %s as %s%s" % (name, n, v, os.linesep))
            else:
                for v in value:
                    mapperScript.write("from %s import %s%s" % (name, v, os.linesep))
    # Keep stray prints off stdout, which carries the typedbytes stream.
    mapperScript.write("sys.stdout = sys.stderr%s" % os.linesep)
    mapperScript.write("logging.basicConfig(level=%d)%s" % (loggingLevel, os.linesep))
    mapperScript.write("logger = logging.getLogger(\"%s\")%s" % (self.loggerName, os.linesep))
    # %r turns the pickle bytes into a quoted literal inside the script.
    mapperScript.write("overrideAttributesString = %r%s" % (overrideAttributesString, os.linesep))
    mapperScript.write("overrideAttributes = pickle.loads(overrideAttributesString)%s" % os.linesep)
    # emit/Controller/unserializeClass come from the template prefix.
    mapperScript.write("overrideAttributes[\"emit\"] = emit%s" % os.linesep)
    mapperScript.write("overrideAttributes[\"logger\"] = logging.getLogger(\"%s\")%s" % (mapRedApp.loggerName, os.linesep))
    # Remote workers get a do-nothing PerformanceTable substitute.
    # NOTE(review): indentation inside these literals reconstructed
    # (4/8 spaces) to yield valid Python; confirm against upstream.
    mapperScript.write("class FakePerformanceTable(object):%s" % os.linesep)
    mapperScript.write("    def __init__(self):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def __repr__(self):%s" % os.linesep)
    mapperScript.write("        return \"<FakePerformanceTable at 0x%%x>\" %% id(self)%s" % os.linesep)
    mapperScript.write("    def absorb(self, performanceTable):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def begin(self, key):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def end(self, key):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def pause(self, key):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def unpause(self, key):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def block(self):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("    def unblock(self):%s" % os.linesep)
    mapperScript.write("        pass%s" % os.linesep)
    mapperScript.write("overrideAttributes[\"performanceTable\"] = FakePerformanceTable()%s" % os.linesep)
    mapperScript.write("mapRedAppSerialized = %r%s" % (mapRedAppSerialized, os.linesep))
    mapperScript.write("appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, globals())%s" % os.linesep)
    mapperScript.write("controller = Controller(appClass())%s" % os.linesep)
    mapperScript.write("controller.mapper()%s" % os.linesep)
    mapperScript.close()

    # --- reducer script -------------------------------------------------
    # Identical bootstrap; only the final call (controller.reducer) differs.
    reducerScript = tempfile.NamedTemporaryFile(delete=False)
    reducerScript.write(template)
    reducerScript.write(application)
    for name, value in imports.items():
        if value is None:
            reducerScript.write("import %s%s" % (name, os.linesep))
        elif isinstance(value, basestring):
            reducerScript.write("import %s as %s%s" % (name, value, os.linesep))
        else:
            if hasattr(value, "items"):
                for n, v in value.items():
                    reducerScript.write("from %s import %s as %s%s" % (name, n, v, os.linesep))
            else:
                for v in value:
                    reducerScript.write("from %s import %s%s" % (name, v, os.linesep))
    reducerScript.write("sys.stdout = sys.stderr%s" % os.linesep)
    reducerScript.write("logging.basicConfig(level=%d)%s" % (loggingLevel, os.linesep))
    reducerScript.write("logger = logging.getLogger(\"%s\")%s" % (self.loggerName, os.linesep))
    reducerScript.write("overrideAttributesString = %r%s" % (overrideAttributesString, os.linesep))
    reducerScript.write("overrideAttributes = pickle.loads(overrideAttributesString)%s" % os.linesep)
    reducerScript.write("overrideAttributes[\"emit\"] = emit%s" % os.linesep)
    reducerScript.write("overrideAttributes[\"logger\"] = logging.getLogger(\"%s\")%s" % (mapRedApp.loggerName, os.linesep))
    reducerScript.write("class FakePerformanceTable(object):%s" % os.linesep)
    reducerScript.write("    def __init__(self):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def __repr__(self):%s" % os.linesep)
    reducerScript.write("        return \"<FakePerformanceTable at 0x%%x>\" %% id(self)%s" % os.linesep)
    reducerScript.write("    def absorb(self, performanceTable):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def begin(self, key):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def end(self, key):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def pause(self, key):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def unpause(self, key):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def block(self):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("    def unblock(self):%s" % os.linesep)
    reducerScript.write("        pass%s" % os.linesep)
    reducerScript.write("overrideAttributes[\"performanceTable\"] = FakePerformanceTable()%s" % os.linesep)
    reducerScript.write("mapRedAppSerialized = %r%s" % (mapRedAppSerialized, os.linesep))
    reducerScript.write("appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, globals())%s" % os.linesep)
    reducerScript.write("controller = Controller(appClass())%s" % os.linesep)
    reducerScript.write("controller.reducer()%s" % os.linesep)
    reducerScript.close()

    # stdout=None inherits this process's stdout (verbose mode).
    if verbose:
        stdout = None
    else:
        stdout = subprocess.PIPE

    # Build ["-file", f, ...] and ["-cmdenv", "N=V", ...] pairs with
    # extended-slice assignment.
    fileargs = ["-file"] * (2 * len(files))
    fileargs[1::2] = files
    envargs = ["-cmdenv"] * (2 * len(cmdenv))
    envargs[1::2] = ["%s=%s" % (n, v) for n, v in cmdenv.items()]

    # Launch the Hadoop streaming job and wait for completion.
    process = subprocess.Popen([self.HADOOP_EXECUTABLE, "jar", self.HADOOP_STREAMING_JAR,
                                "-D", "mapred.reduce.tasks=%d" % numberOfReducers,
                                "-D", "stream.map.output=typedbytes",
                                "-D", "stream.reduce.input=typedbytes",
                                "-D", "stream.reduce.output=typedbytes",
                                "-inputformat", "org.apache.hadoop.mapred.SequenceFileAsBinaryInputFormat",
                                "-outputformat", "org.apache.hadoop.mapred.SequenceFileOutputFormat",
                                "-input", "%s/*" % inputHdfsDirectory,
                                "-output", outputHdfsDirectory,
                                "-mapper", mapperScript.name,
                                "-reducer", reducerScript.name,
                                "-file", mapperScript.name,
                                "-file", reducerScript.name] + fileargs + envargs,
                               stdout=stdout)
    process.wait()

    # Best-effort removal of the temporary scripts.
    try:
        os.remove(mapperScript.name)
    except OSError:
        pass
    try:
        os.remove(reducerScript.name)
    except OSError:
        pass

    if process.returncode != 0:
        raise RuntimeError("Hadoop streaming failed")

    if gatherOutput:
        # Fetch results and let the application decide whether we are done.
        outputRecords, outputKeyValues = self.hadoopGather(outputHdfsDirectory)
        overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": None, "logger": logging.getLogger(mapRedApp.loggerName)}
        appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
        appInstance = appClass()
        if appInstance.endIteration(outputRecords, outputKeyValues):
            self.done = True
        # Keep whatever metadata endIteration left on the instance.
        self.metadata = appInstance.metadata
        return outputRecords, outputKeyValues
    else:
        # No gathering requested: roll the metadata back to the snapshot.
        self.metadata = startMetadata
        return [], {}
def iterate(self, inputData, iteration=0, sort=False, parallel=False, numberOfMappers=1, numberOfReducers=1, frozenClass=True):
    """Run a pure-Python map-reduce iteration.

    @type inputData: list of Python objects
    @param inputData: The objects to use as a data stream.
    @type iteration: int
    @param iteration: The iteration number.
    @type sort: bool
    @param sort: If True, perform a sorting step between the mapper and the reducer.
    @type parallel: bool
    @param parallel: If True, run the independent mappers and independent reducers as distinct threads.
    @type numberOfMappers: int
    @param numberOfMappers: Requested number of mappers.  Input data will be divided evenly among them.
    @type numberOfReducers: int
    @param numberOfReducers: Requested number of reducers.
    @type frozenClass: bool
    @param frozenClass: If True, practice serializing and unserializing the class to ensure the independence of the mappers and the reducers.  If False, skip this performance-limiting step.
    @rtype: 2-tuple of list, dict
    @return: List of output records and dictionary of output key-value pairs.
    """
    # Overhead table times the framework itself; per-worker tables are
    # created below.  pause/unpause brackets exclude user code from it.
    overheadPerformanceTable = PerformanceTable()
    self._performanceTables.append(overheadPerformanceTable)
    overheadPerformanceTable.begin("MapReduce.iterate")

    # Select the application for this iteration; the last one repeats.
    if iteration < len(self.mapRedApps):
        mapRedApp = self.mapRedApps[iteration]
        mapRedAppSerialized = self.mapRedAppsSerialized[iteration]
    else:
        mapRedApp = self.mapRedApps[-1]
        mapRedAppSerialized = self.mapRedAppsSerialized[-1]

    gatherOutput = mapRedApp.gatherOutput
    imports = mapRedApp.imports
    if imports is None:
        imports = {}
    namespace = self._buildNamespace(imports)

    self.logger.info("Start iteration %d with %d input records and %d metadata keys.", iteration, len(inputData), len(self.metadata))
    self.done = False

    # Snapshot metadata so every worker sees the same starting state and
    # it can be restored when no output is gathered.
    overheadPerformanceTable.begin("copy metadata")
    startMetadata = copy.deepcopy(self.metadata)
    overheadPerformanceTable.end("copy metadata")

    # ---------------- map phase ----------------
    intermediateData = {}
    dataLock = threading.Lock()

    # Mapper-side emit: appends a deep copy of (key, record) to the bucket
    # for key, under a lock because mappers may run as threads.
    # appself is the application instance this closure gets bound to.
    def emit(appself, key, record):
        with dataLock:
            newTuple = copy.deepcopy((key, record))
            if key in intermediateData:
                self.logger.debug("    key \"%s\": %r", key, record)
                intermediateData[key].append(newTuple)
            else:
                self.logger.debug("New key \"%s\": %r", key, record)
                intermediateData[key] = [newTuple]

    if parallel:
        mapperThreads = []
    # Ceiling division so every record is assigned to some mapper.
    recordsPerMapper = int(math.ceil(len(inputData) / float(numberOfMappers)))
    for number in xrange(numberOfMappers):
        subData = inputData[(number * recordsPerMapper):((number + 1) * recordsPerMapper)]
        performanceTable = PerformanceTable()
        self._performanceTables.append(performanceTable)
        overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)}
        if frozenClass:
            # Round-trip the class through its serialized form to prove
            # the mappers are independent of local state.
            overheadPerformanceTable.begin("unfreeze mapper")
            appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
            overheadPerformanceTable.end("unfreeze mapper")
            controller = self.Controller(appClass())
        else:
            # Fast path: instantiate directly and graft the attributes on;
            # emit must be bound to the instance by hand.
            appInstance = mapRedApp()
            overrideAttributes["emit"] = types.MethodType(overrideAttributes["emit"], appInstance)
            appInstance.__dict__.update(overrideAttributes)
            controller = self.Controller(appInstance)
        if parallel:
            self.logger.info("Starting mapper %d in parallel with %d input records.", number, len(subData))
            mapperThreads.append(threading.Thread(target=controller.mapper, name=("Mapper_%03d" % number), args=(subData,)))
        else:
            self.logger.info("Starting mapper %d in series with %d input records.", number, len(subData))
            # User code time is excluded from the overhead table.
            overheadPerformanceTable.pause("MapReduce.iterate")
            controller.mapper(subData)
            overheadPerformanceTable.unpause("MapReduce.iterate")
    if parallel:
        overheadPerformanceTable.pause("MapReduce.iterate")
        for thread in mapperThreads:
            thread.start()
        for thread in mapperThreads:
            thread.join()
        overheadPerformanceTable.unpause("MapReduce.iterate")
    self.logger.info("All mappers finished.")

    # Optional sort of each key's bucket between map and reduce.
    if sort:
        self.logger.info("Sorting %d intermediate values.", sum(len(x) for x in intermediateData.values()))
        overheadPerformanceTable.begin("sort intermediate data")
        for value in intermediateData.values():
            value.sort()
        overheadPerformanceTable.end("sort intermediate data")
    else:
        self.logger.info("Leaving %d intermediate values in the order in which they were generated.", sum(len(x) for x in intermediateData.values()))

    # Greedy load balancing: assign keys, largest bucket first, to the
    # currently least-loaded reducer.
    overheadPerformanceTable.begin("load balance")
    lengths = [(key, len(intermediateData[key])) for key in intermediateData]
    lengths.sort(lambda a, b: cmp(b[1], a[1]))  # descending by bucket size
    assignments = [[] for x in xrange(numberOfReducers)]
    workload = [0] * numberOfReducers
    for key, length in lengths:
        index = min((w, i) for i, w in enumerate(workload))[1]   # this is argmin(workload)
        assignments[index].append(key)
        workload[index] += length
    if self.logger.isEnabledFor(logging.INFO):
        for i, (a, w) in enumerate(zip(assignments, workload)):
            if len(a) > 10:
                self.logger.info("Assigning %d keys (%d total records) to reducer %d.", len(a), w, i)
            else:
                self.logger.info("Assigning keys %s (%d total records) to reducer %d.", ", ".join("\"%s\"" % k for k in a), w, i)
    overheadPerformanceTable.end("load balance")

    # ---------------- reduce phase ----------------
    outputRecords = []
    outputKeyValues = {}
    # Fresh lock; the mapper-phase emit closure is no longer in use.
    dataLock = threading.Lock()

    # Reducer-side emit: key None appends an output record; any other key
    # writes a metadata key-value, which may be written only once.
    def emit(appself, key, record):
        if key is None:
            self.logger.debug("OutputRecord: %r", record)
            outputRecords.append(record)
        else:
            with dataLock:
                if key in outputKeyValues:
                    raise RuntimeError("Two reducers are trying to write to the same metadata key: \"%s\"" % key)
                else:
                    self.logger.debug("OutputKeyValue \"%s\": %r", key, record)
                    outputKeyValues[key] = record

    if parallel:
        reducerThreads = []
    for number in xrange(numberOfReducers):
        # Concatenate the buckets of every key assigned to this reducer.
        subData = []
        for key in assignments[number]:
            subData.extend(intermediateData[key])
        performanceTable = PerformanceTable()
        self._performanceTables.append(performanceTable)
        overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)}
        if frozenClass:
            overheadPerformanceTable.begin("unfreeze reducer")
            appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
            overheadPerformanceTable.end("unfreeze reducer")
            controller = self.Controller(appClass())
        else:
            appInstance = mapRedApp()
            overrideAttributes["emit"] = types.MethodType(overrideAttributes["emit"], appInstance)
            appInstance.__dict__.update(overrideAttributes)
            controller = self.Controller(appInstance)
        if parallel:
            self.logger.info("Starting reducer %d in parallel with %d input records.", number, len(subData))
            reducerThreads.append(threading.Thread(target=controller.reducer, name=("Reducer_%03d" % number), args=(subData,)))
        else:
            self.logger.info("Starting reducer %d in series with %d input records.", number, len(subData))
            overheadPerformanceTable.pause("MapReduce.iterate")
            controller.reducer(subData)
            overheadPerformanceTable.unpause("MapReduce.iterate")
    if parallel:
        overheadPerformanceTable.pause("MapReduce.iterate")
        for thread in reducerThreads:
            thread.start()
        for thread in reducerThreads:
            thread.join()
        overheadPerformanceTable.unpause("MapReduce.iterate")
    self.logger.info("All reducers finished.")

    self.logger.info("Finished iteration %s with %d output records and %d metadata keys.", iteration, len(outputRecords), len(outputKeyValues))

    if gatherOutput:
        # Give the application a chance to inspect the results, update the
        # metadata, and declare convergence via endIteration.
        performanceTable = PerformanceTable()
        self._performanceTables.append(performanceTable)
        overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": None, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)}
        if frozenClass:
            overheadPerformanceTable.begin("unfreeze endIteration")
            appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
            overheadPerformanceTable.end("unfreeze endIteration")
            appInstance = appClass()
        else:
            appInstance = mapRedApp()
            appInstance.__dict__.update(overrideAttributes)
        overheadPerformanceTable.pause("MapReduce.iterate")
        if appInstance.endIteration(outputRecords, outputKeyValues):
            self.done = True
        overheadPerformanceTable.unpause("MapReduce.iterate")
        # Adopt the metadata endIteration left behind.
        self.metadata = appInstance.metadata
        overheadPerformanceTable.end("MapReduce.iterate")
        return outputRecords, outputKeyValues
    else:
        # Nothing gathered: restore the pre-iteration snapshot.
        self.metadata = startMetadata
        overheadPerformanceTable.end("MapReduce.iterate")
        return [], {}
def iterate(self, inputData, iteration=0, sort=False, parallel=False, numberOfMappers=1, numberOfReducers=1, frozenClass=True):
    """Run a pure-Python map-reduce iteration.

    The map phase partitions C{inputData} evenly among C{numberOfMappers}
    mapper instances; the reduce phase greedily load-balances the
    intermediate keys among C{numberOfReducers} reducer instances.  Both
    phases may run in threads (C{parallel=True}) or serially.

    @type inputData: list of Python objects
    @param inputData: The objects to use as a data stream.
    @type iteration: int
    @param iteration: The iteration number.
    @type sort: bool
    @param sort: If True, perform a sorting step between the mapper and the reducer.
    @type parallel: bool
    @param parallel: If True, run the independent mappers and independent reducers as distinct threads.
    @type numberOfMappers: int
    @param numberOfMappers: Requested number of mappers.  Input data will be divided evenly among them.
    @type numberOfReducers: int
    @param numberOfReducers: Requested number of reducers.
    @type frozenClass: bool
    @param frozenClass: If True, practice serializing and unserializing the class to ensure the independence of the mappers and the reducers.  If False, skip this performance-limiting step.
    @rtype: 2-tuple of list, dict
    @return: List of output records and dictionary of output key-value pairs.
    """

    # Overhead table times this driver itself; it is paused while user
    # mapper/reducer code runs so that only framework overhead is counted.
    overheadPerformanceTable = PerformanceTable()
    self._performanceTables.append(overheadPerformanceTable)
    overheadPerformanceTable.begin("MapReduce.iterate")

    # Select the application for this iteration; iterations beyond the
    # configured list reuse the last application.
    if iteration < len(self.mapRedApps):
        mapRedApp = self.mapRedApps[iteration]
        mapRedAppSerialized = self.mapRedAppsSerialized[iteration]
    else:
        mapRedApp = self.mapRedApps[-1]
        mapRedAppSerialized = self.mapRedAppsSerialized[-1]

    gatherOutput = mapRedApp.gatherOutput
    imports = mapRedApp.imports
    if imports is None:
        imports = {}
    namespace = self._buildNamespace(imports)

    self.logger.info("Start iteration %d with %d input records and %d metadata keys.", iteration, len(inputData), len(self.metadata))

    self.done = False
    # Work on a deep copy so self.metadata is untouched unless this
    # iteration completes with gatherOutput (see the tail of this method).
    overheadPerformanceTable.begin("copy metadata")
    startMetadata = copy.deepcopy(self.metadata)
    overheadPerformanceTable.end("copy metadata")

    intermediateData = {}
    dataLock = threading.Lock()
    # Map-phase emit: injected into each application instance as a bound
    # method, so `appself` receives the application instance while the
    # closure writes into this driver's shared intermediateData dict.
    def emit(appself, key, record):
        with dataLock:
            # Deep-copy so mapper-local mutations after emit cannot
            # corrupt the intermediate data.
            newTuple = copy.deepcopy((key, record))
            if key in intermediateData:
                self.logger.debug(" key \"%s\": %r", key, record)
                intermediateData[key].append(newTuple)
            else:
                self.logger.debug("New key \"%s\": %r", key, record)
                intermediateData[key] = [newTuple]

    if parallel:
        mapperThreads = []

    # Split the input into numberOfMappers contiguous, near-equal chunks.
    recordsPerMapper = int(math.ceil(len(inputData) / float(numberOfMappers)))
    for number in xrange(numberOfMappers):
        subData = inputData[(number * recordsPerMapper):((number + 1) * recordsPerMapper)]

        performanceTable = PerformanceTable()
        self._performanceTables.append(performanceTable)
        overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)}

        if frozenClass:
            # Round-trip through the serialized form to mimic the
            # isolation a real Hadoop run would impose.
            overheadPerformanceTable.begin("unfreeze mapper")
            appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
            overheadPerformanceTable.end("unfreeze mapper")
            controller = self.Controller(appClass())
        else:
            appInstance = mapRedApp()
            # Bind emit to the instance by hand since unserializeClass is
            # skipped in this branch.
            overrideAttributes["emit"] = types.MethodType(overrideAttributes["emit"], appInstance)
            appInstance.__dict__.update(overrideAttributes)
            controller = self.Controller(appInstance)

        if parallel:
            self.logger.info("Starting mapper %d in parallel with %d input records.", number, len(subData))
            mapperThreads.append(threading.Thread(target=controller.mapper, name=("Mapper_%03d" % number), args=(subData,)))
        else:
            self.logger.info("Starting mapper %d in series with %d input records.", number, len(subData))
            # User mapper code runs outside the overhead measurement.
            overheadPerformanceTable.pause("MapReduce.iterate")
            controller.mapper(subData)
            overheadPerformanceTable.unpause("MapReduce.iterate")

    if parallel:
        overheadPerformanceTable.pause("MapReduce.iterate")
        for thread in mapperThreads:
            thread.start()
        for thread in mapperThreads:
            thread.join()
        overheadPerformanceTable.unpause("MapReduce.iterate")

    self.logger.info("All mappers finished.")

    if sort:
        # Optional shuffle-sort: orders each key's (key, record) tuples.
        self.logger.info("Sorting %d intermediate values.", sum(len(x) for x in intermediateData.values()))
        overheadPerformanceTable.begin("sort intermediate data")
        for value in intermediateData.values():
            value.sort()
        overheadPerformanceTable.end("sort intermediate data")
    else:
        self.logger.info("Leaving %d intermediate values in the order in which they were generated.", sum(len(x) for x in intermediateData.values()))

    # Greedy load balancing: keys sorted by descending record count, each
    # assigned to the currently least-loaded reducer (longest-processing-
    # time-first heuristic).  Python 2 cmp-style sort.
    overheadPerformanceTable.begin("load balance")
    lengths = [(key, len(intermediateData[key])) for key in intermediateData]
    lengths.sort(lambda a, b: cmp(b[1], a[1]))
    assignments = [[] for x in xrange(numberOfReducers)]
    workload = [0] * numberOfReducers
    for key, length in lengths:
        index = min((w, i) for i, w in enumerate(workload))[1]   # this is argmin(workload)
        assignments[index].append(key)
        workload[index] += length

    if self.logger.isEnabledFor(logging.INFO):
        for i, (a, w) in enumerate(zip(assignments, workload)):
            if len(a) > 10:
                self.logger.info("Assigning %d keys (%d total records) to reducer %d.", len(a), w, i)
            else:
                self.logger.info("Assigning keys %s (%d total records) to reducer %d.", ", ".join("\"%s\"" % k for k in a), w, i)
    overheadPerformanceTable.end("load balance")

    outputRecords = []
    outputKeyValues = {}
    # Reduce-phase emit and lock deliberately shadow the map-phase pair;
    # all mapper threads have joined by this point.
    dataLock = threading.Lock()
    def emit(appself, key, record):
        if key is None:
            # key=None means an output record rather than a metadata key.
            self.logger.debug("OutputRecord: %r", record)
            outputRecords.append(record)
        else:
            with dataLock:
                if key in outputKeyValues:
                    raise RuntimeError("Two reducers are trying to write to the same metadata key: \"%s\"" % key)
                else:
                    self.logger.debug("OutputKeyValue \"%s\": %r", key, record)
                    outputKeyValues[key] = record

    if parallel:
        reducerThreads = []

    for number in xrange(numberOfReducers):
        # Concatenate the intermediate tuples for every key assigned to
        # this reducer.
        subData = []
        for key in assignments[number]:
            subData.extend(intermediateData[key])

        performanceTable = PerformanceTable()
        self._performanceTables.append(performanceTable)
        overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)}

        if frozenClass:
            overheadPerformanceTable.begin("unfreeze reducer")
            appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
            overheadPerformanceTable.end("unfreeze reducer")
            controller = self.Controller(appClass())
        else:
            appInstance = mapRedApp()
            overrideAttributes["emit"] = types.MethodType(overrideAttributes["emit"], appInstance)
            appInstance.__dict__.update(overrideAttributes)
            controller = self.Controller(appInstance)

        if parallel:
            self.logger.info("Starting reducer %d in parallel with %d input records.", number, len(subData))
            reducerThreads.append(threading.Thread(target=controller.reducer, name=("Reducer_%03d" % number), args=(subData,)))
        else:
            self.logger.info("Starting reducer %d in series with %d input records.", number, len(subData))
            overheadPerformanceTable.pause("MapReduce.iterate")
            controller.reducer(subData)
            overheadPerformanceTable.unpause("MapReduce.iterate")

    if parallel:
        overheadPerformanceTable.pause("MapReduce.iterate")
        for thread in reducerThreads:
            thread.start()
        for thread in reducerThreads:
            thread.join()
        overheadPerformanceTable.unpause("MapReduce.iterate")

    self.logger.info("All reducers finished.")
    self.logger.info("Finished iteration %s with %d output records and %d metadata keys.", iteration, len(outputRecords), len(outputKeyValues))

    if gatherOutput:
        # Give the application a chance to inspect the full output and
        # decide whether the iterative process has converged.
        performanceTable = PerformanceTable()
        self._performanceTables.append(performanceTable)
        overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": None, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)}
        if frozenClass:
            overheadPerformanceTable.begin("unfreeze endIteration")
            appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace)
            overheadPerformanceTable.end("unfreeze endIteration")
            appInstance = appClass()
        else:
            appInstance = mapRedApp()
            appInstance.__dict__.update(overrideAttributes)
        overheadPerformanceTable.pause("MapReduce.iterate")
        if appInstance.endIteration(outputRecords, outputKeyValues):
            self.done = True
        overheadPerformanceTable.unpause("MapReduce.iterate")
        # Commit the (possibly mutated) metadata only on this path.
        self.metadata = appInstance.metadata
        overheadPerformanceTable.end("MapReduce.iterate")
        return outputRecords, outputKeyValues
    else:
        # Without gatherOutput the iteration's results are discarded and
        # the pre-iteration metadata copy is restored.
        self.metadata = startMetadata
        overheadPerformanceTable.end("MapReduce.iterate")
        return [], {}