def iterate(self, inputData, iteration=0, sort=False, parallel=False, numberOfMappers=1, numberOfReducers=1, frozenClass=True): """Run a pure-Python map-reduce iteration. @type inputData: list of Python objects @param inputData: The objects to use as a data stream. @type iteration: int @param iteration: The iteration number. @type sort: bool @param sort: If True, perform a sorting step between the mapper and the reducer. @type parallel: bool @param parallel: If True, run the independent mappers and independent reducers as distinct threads. @type numberOfMappers: int @param numberOfMappers: Requested number of mappers. Input data will be divided evenly among them. @type numberOfReducers: int @param numberOfReducers: Requested number of reducers. @type frozenClass: bool @param frozenClass: If True, practice serializing and unserializing the class to ensure the independence of the mappers and the reducers. If False, skip this performance-limiting step. @rtype: 2-tuple of list, dict @return: List of output records and dictionary of output key-value pairs. """ overheadPerformanceTable = PerformanceTable() self._performanceTables.append(overheadPerformanceTable) overheadPerformanceTable.begin("MapReduce.iterate") if iteration < len(self.mapRedApps): mapRedApp = self.mapRedApps[iteration] mapRedAppSerialized = self.mapRedAppsSerialized[iteration] else: mapRedApp = self.mapRedApps[-1] mapRedAppSerialized = self.mapRedAppsSerialized[-1] gatherOutput = mapRedApp.gatherOutput imports = mapRedApp.imports if imports is None: imports = {} namespace = self._buildNamespace(imports) self.logger.info("Start iteration %d with %d input records and %d metadata keys.", iteration, len(inputData), len(self.metadata)) self.done = False overheadPerformanceTable.begin("copy metadata") startMetadata = copy.deepcopy(self.metadata) overheadPerformanceTable.end("copy metadata") intermediateData = {} dataLock = threading.Lock() def emit(appself, key, record): with dataLock: newTuple = copy.deepcopy((key, record)) if key in intermediateData: self.logger.debug(" key \"%s\": %r", key, record) intermediateData[key].append(newTuple) else: self.logger.debug("New key \"%s\": %r", key, record) intermediateData[key] = [newTuple] if parallel: mapperThreads = [] recordsPerMapper = int(math.ceil(len(inputData) / float(numberOfMappers))) for number in xrange(numberOfMappers): subData = inputData[(number * recordsPerMapper):((number + 1) * recordsPerMapper)] performanceTable = PerformanceTable() self._performanceTables.append(performanceTable) overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)} if frozenClass: overheadPerformanceTable.begin("unfreeze mapper") appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace) overheadPerformanceTable.end("unfreeze mapper") controller = self.Controller(appClass()) else: appInstance = mapRedApp() overrideAttributes["emit"] = types.MethodType(overrideAttributes["emit"], appInstance) appInstance.__dict__.update(overrideAttributes) controller = self.Controller(appInstance) if parallel: self.logger.info("Starting mapper %d in parallel with %d input records.", number, len(subData)) mapperThreads.append(threading.Thread(target=controller.mapper, name=("Mapper_%03d" % number), args=(subData,))) else: self.logger.info("Starting mapper %d in series with %d input records.", number, len(subData)) overheadPerformanceTable.pause("MapReduce.iterate") controller.mapper(subData) overheadPerformanceTable.unpause("MapReduce.iterate") if parallel: overheadPerformanceTable.pause("MapReduce.iterate") for thread in mapperThreads: thread.start() for thread in mapperThreads: thread.join() overheadPerformanceTable.unpause("MapReduce.iterate") self.logger.info("All mappers finished.") if sort: self.logger.info("Sorting %d intermediate values.", sum(len(x) for x in intermediateData.values())) overheadPerformanceTable.begin("sort intermediate data") for value in intermediateData.values(): value.sort() overheadPerformanceTable.end("sort intermediate data") else: self.logger.info("Leaving %d intermediate values in the order in which they were generated.", sum(len(x) for x in intermediateData.values())) overheadPerformanceTable.begin("load balance") lengths = [(key, len(intermediateData[key])) for key in intermediateData] lengths.sort(lambda a, b: cmp(b[1], a[1])) assignments = [[] for x in xrange(numberOfReducers)] workload = [0] * numberOfReducers for key, length in lengths: index = min((w, i) for i, w in enumerate(workload))[1] # this is argmin(workload) assignments[index].append(key) workload[index] += length if self.logger.isEnabledFor(logging.INFO): for i, (a, w) in enumerate(zip(assignments, workload)): if len(a) > 10: self.logger.info("Assigning %d keys (%d total records) to reducer %d.", len(a), w, i) else: self.logger.info("Assigning keys %s (%d total records) to reducer %d.", ", ".join("\"%s\"" % k for k in a), w, i) overheadPerformanceTable.end("load balance") outputRecords = [] outputKeyValues = {} dataLock = threading.Lock() def emit(appself, key, record): if key is None: self.logger.debug("OutputRecord: %r", record) outputRecords.append(record) else: with dataLock: if key in outputKeyValues: raise RuntimeError("Two reducers are trying to write to the same metadata key: \"%s\"" % key) else: self.logger.debug("OutputKeyValue \"%s\": %r", key, record) outputKeyValues[key] = record if parallel: reducerThreads = [] for number in xrange(numberOfReducers): subData = [] for key in assignments[number]: subData.extend(intermediateData[key]) performanceTable = PerformanceTable() self._performanceTables.append(performanceTable) overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)} if frozenClass: overheadPerformanceTable.begin("unfreeze reducer") appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace) overheadPerformanceTable.end("unfreeze reducer") controller = self.Controller(appClass()) else: appInstance = mapRedApp() overrideAttributes["emit"] = types.MethodType(overrideAttributes["emit"], appInstance) appInstance.__dict__.update(overrideAttributes) controller = self.Controller(appInstance) if parallel: self.logger.info("Starting reducer %d in parallel with %d input records.", number, len(subData)) reducerThreads.append(threading.Thread(target=controller.reducer, name=("Reducer_%03d" % number), args=(subData,))) else: self.logger.info("Starting reducer %d in series with %d input records.", number, len(subData)) overheadPerformanceTable.pause("MapReduce.iterate") controller.reducer(subData) overheadPerformanceTable.unpause("MapReduce.iterate") if parallel: overheadPerformanceTable.pause("MapReduce.iterate") for thread in reducerThreads: thread.start() for thread in reducerThreads: thread.join() overheadPerformanceTable.unpause("MapReduce.iterate") self.logger.info("All reducers finished.") self.logger.info("Finished iteration %s with %d output records and %d metadata keys.", iteration, len(outputRecords), len(outputKeyValues)) if gatherOutput: performanceTable = PerformanceTable() self._performanceTables.append(performanceTable) overrideAttributes = {"metadata": startMetadata, "iteration": iteration, "emit": None, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName)} if frozenClass: overheadPerformanceTable.begin("unfreeze endIteration") appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace) overheadPerformanceTable.end("unfreeze endIteration") appInstance = appClass() else: appInstance = mapRedApp() appInstance.__dict__.update(overrideAttributes) overheadPerformanceTable.pause("MapReduce.iterate") if appInstance.endIteration(outputRecords, outputKeyValues): self.done = True overheadPerformanceTable.unpause("MapReduce.iterate") self.metadata = appInstance.metadata overheadPerformanceTable.end("MapReduce.iterate") return outputRecords, outputKeyValues else: self.metadata = startMetadata overheadPerformanceTable.end("MapReduce.iterate") return [], {}
def iterate(self, inputData, iteration=0, sort=False, parallel=False, numberOfMappers=1, numberOfReducers=1, frozenClass=True): """Run a pure-Python map-reduce iteration. @type inputData: list of Python objects @param inputData: The objects to use as a data stream. @type iteration: int @param iteration: The iteration number. @type sort: bool @param sort: If True, perform a sorting step between the mapper and the reducer. @type parallel: bool @param parallel: If True, run the independent mappers and independent reducers as distinct threads. @type numberOfMappers: int @param numberOfMappers: Requested number of mappers. Input data will be divided evenly among them. @type numberOfReducers: int @param numberOfReducers: Requested number of reducers. @type frozenClass: bool @param frozenClass: If True, practice serializing and unserializing the class to ensure the independence of the mappers and the reducers. If False, skip this performance-limiting step. @rtype: 2-tuple of list, dict @return: List of output records and dictionary of output key-value pairs. """ overheadPerformanceTable = PerformanceTable() self._performanceTables.append(overheadPerformanceTable) overheadPerformanceTable.begin("MapReduce.iterate") if iteration < len(self.mapRedApps): mapRedApp = self.mapRedApps[iteration] mapRedAppSerialized = self.mapRedAppsSerialized[iteration] else: mapRedApp = self.mapRedApps[-1] mapRedAppSerialized = self.mapRedAppsSerialized[-1] gatherOutput = mapRedApp.gatherOutput imports = mapRedApp.imports if imports is None: imports = {} namespace = self._buildNamespace(imports) self.logger.info( "Start iteration %d with %d input records and %d metadata keys.", iteration, len(inputData), len(self.metadata)) self.done = False overheadPerformanceTable.begin("copy metadata") startMetadata = copy.deepcopy(self.metadata) overheadPerformanceTable.end("copy metadata") intermediateData = {} dataLock = threading.Lock() def emit(appself, key, record): with dataLock: newTuple = copy.deepcopy((key, record)) if key in intermediateData: self.logger.debug(" key \"%s\": %r", key, record) intermediateData[key].append(newTuple) else: self.logger.debug("New key \"%s\": %r", key, record) intermediateData[key] = [newTuple] if parallel: mapperThreads = [] recordsPerMapper = int( math.ceil(len(inputData) / float(numberOfMappers))) for number in xrange(numberOfMappers): subData = inputData[(number * recordsPerMapper):((number + 1) * recordsPerMapper)] performanceTable = PerformanceTable() self._performanceTables.append(performanceTable) overrideAttributes = { "metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName) } if frozenClass: overheadPerformanceTable.begin("unfreeze mapper") appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace) overheadPerformanceTable.end("unfreeze mapper") controller = self.Controller(appClass()) else: appInstance = mapRedApp() overrideAttributes["emit"] = types.MethodType( overrideAttributes["emit"], appInstance) appInstance.__dict__.update(overrideAttributes) controller = self.Controller(appInstance) if parallel: self.logger.info( "Starting mapper %d in parallel with %d input records.", number, len(subData)) mapperThreads.append( threading.Thread(target=controller.mapper, name=("Mapper_%03d" % number), args=(subData, ))) else: self.logger.info( "Starting mapper %d in series with %d input records.", number, len(subData)) overheadPerformanceTable.pause("MapReduce.iterate") controller.mapper(subData) overheadPerformanceTable.unpause("MapReduce.iterate") if parallel: overheadPerformanceTable.pause("MapReduce.iterate") for thread in mapperThreads: thread.start() for thread in mapperThreads: thread.join() overheadPerformanceTable.unpause("MapReduce.iterate") self.logger.info("All mappers finished.") if sort: self.logger.info("Sorting %d intermediate values.", sum(len(x) for x in intermediateData.values())) overheadPerformanceTable.begin("sort intermediate data") for value in intermediateData.values(): value.sort() overheadPerformanceTable.end("sort intermediate data") else: self.logger.info( "Leaving %d intermediate values in the order in which they were generated.", sum(len(x) for x in intermediateData.values())) overheadPerformanceTable.begin("load balance") lengths = [(key, len(intermediateData[key])) for key in intermediateData] lengths.sort(lambda a, b: cmp(b[1], a[1])) assignments = [[] for x in xrange(numberOfReducers)] workload = [0] * numberOfReducers for key, length in lengths: index = min( (w, i) for i, w in enumerate(workload))[1] # this is argmin(workload) assignments[index].append(key) workload[index] += length if self.logger.isEnabledFor(logging.INFO): for i, (a, w) in enumerate(zip(assignments, workload)): if len(a) > 10: self.logger.info( "Assigning %d keys (%d total records) to reducer %d.", len(a), w, i) else: self.logger.info( "Assigning keys %s (%d total records) to reducer %d.", ", ".join("\"%s\"" % k for k in a), w, i) overheadPerformanceTable.end("load balance") outputRecords = [] outputKeyValues = {} dataLock = threading.Lock() def emit(appself, key, record): if key is None: self.logger.debug("OutputRecord: %r", record) outputRecords.append(record) else: with dataLock: if key in outputKeyValues: raise RuntimeError( "Two reducers are trying to write to the same metadata key: \"%s\"" % key) else: self.logger.debug("OutputKeyValue \"%s\": %r", key, record) outputKeyValues[key] = record if parallel: reducerThreads = [] for number in xrange(numberOfReducers): subData = [] for key in assignments[number]: subData.extend(intermediateData[key]) performanceTable = PerformanceTable() self._performanceTables.append(performanceTable) overrideAttributes = { "metadata": startMetadata, "iteration": iteration, "emit": emit, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName) } if frozenClass: overheadPerformanceTable.begin("unfreeze reducer") appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace) overheadPerformanceTable.end("unfreeze reducer") controller = self.Controller(appClass()) else: appInstance = mapRedApp() overrideAttributes["emit"] = types.MethodType( overrideAttributes["emit"], appInstance) appInstance.__dict__.update(overrideAttributes) controller = self.Controller(appInstance) if parallel: self.logger.info( "Starting reducer %d in parallel with %d input records.", number, len(subData)) reducerThreads.append( threading.Thread(target=controller.reducer, name=("Reducer_%03d" % number), args=(subData, ))) else: self.logger.info( "Starting reducer %d in series with %d input records.", number, len(subData)) overheadPerformanceTable.pause("MapReduce.iterate") controller.reducer(subData) overheadPerformanceTable.unpause("MapReduce.iterate") if parallel: overheadPerformanceTable.pause("MapReduce.iterate") for thread in reducerThreads: thread.start() for thread in reducerThreads: thread.join() overheadPerformanceTable.unpause("MapReduce.iterate") self.logger.info("All reducers finished.") self.logger.info( "Finished iteration %s with %d output records and %d metadata keys.", iteration, len(outputRecords), len(outputKeyValues)) if gatherOutput: performanceTable = PerformanceTable() self._performanceTables.append(performanceTable) overrideAttributes = { "metadata": startMetadata, "iteration": iteration, "emit": None, "performanceTable": performanceTable, "logger": logging.getLogger(mapRedApp.loggerName) } if frozenClass: overheadPerformanceTable.begin("unfreeze endIteration") appClass = unserializeClass(mapRedAppSerialized, MapReduceApplication, overrideAttributes, namespace) overheadPerformanceTable.end("unfreeze endIteration") appInstance = appClass() else: appInstance = mapRedApp() appInstance.__dict__.update(overrideAttributes) overheadPerformanceTable.pause("MapReduce.iterate") if appInstance.endIteration(outputRecords, outputKeyValues): self.done = True overheadPerformanceTable.unpause("MapReduce.iterate") self.metadata = appInstance.metadata overheadPerformanceTable.end("MapReduce.iterate") return outputRecords, outputKeyValues else: self.metadata = startMetadata overheadPerformanceTable.end("MapReduce.iterate") return [], {}
def run(self, inputData, iterationLimit=None, sort=False, parallel=False, numberOfMappers=1, numberOfReducers=1, frozenClass=True, performanceTable=None): """Run the whole pure-Python map-reduce workflow, which may involve multiple iterations of map-reduce. @type inputData: list of Python objects @param inputData: The objects to use as a data stream. @type iterationLimit: int @param iterationLimit: The highest iteration to allow, with 1 being the first iteration. @type sort: bool @param sort: If True, perform a sorting step between the mapper and the reducer. @type parallel: bool @param parallel: If True, run the independent mappers and independent reducers as distinct threads. @type numberOfMappers: int @param numberOfMappers: Requested number of mappers. Input data will be divided evenly among them. @type numberOfReducers: int @param numberOfReducers: Requested number of reducers. @type frozenClass: bool @param frozenClass: If True, practice serializing and unserializing the class to ensure the independence of the mappers and the reducers. If False, skip this performance-limiting step. @type performanceTable: PerformanceTable or None @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: 3-tuple of list, dict, int @return: List of output records, dictionary of output key-value pairs, and the final iteration number. """ self.logger.info("Start run with %d input records and %d cache keys.", len(inputData), len(self.metadata)) overheadPerformanceTable = PerformanceTable() if performanceTable is None: self._performanceTables = [overheadPerformanceTable] else: self._performanceTables = [performanceTable, overheadPerformanceTable] startTime = time.time() overheadPerformanceTable.begin("MapReduce.run") iteration = 0 while True: if iteration < len(self.chain): chain = self.chain[iteration] else: chain = self.chain[-1] overheadPerformanceTable.pause("MapReduce.run") outputRecords, outputKeyValues = self.iterate(inputData, iteration, sort, parallel, numberOfMappers, numberOfReducers, frozenClass) overheadPerformanceTable.unpause("MapReduce.run") if self.done: break if chain: inputData = outputRecords self.logger.info("Chaining output to next iteration (%d intermediate records).", len(outputRecords)) else: self.logger.info("Ignoring iteration output (%d ignored records).", len(outputRecords)) iteration += 1 if iterationLimit is not None and iteration >= iterationLimit: self.logger.info("Reached iteration limit of %d.", iterationLimit) self.done = True break overheadPerformanceTable.end("MapReduce.run") endTime = time.time() if self.logger.isEnabledFor(logging.INFO): stream = StringIO() stream.write("%sPerformanceTable (concurrent times are added together; results are in CPU-seconds):%s%s" % (os.linesep, os.linesep, os.linesep)) self.performanceTable.look(stream=stream) stream.write("%sTotal wall time, including any concurrency: %g%s" % (os.linesep, endTime - startTime, os.linesep)) self.logger.info(stream.getvalue()) self.logger.info("Finished run with %d output records.", len(outputRecords)) return outputRecords, outputKeyValues, iteration
def run(self, inputData, iterationLimit=None, sort=False, parallel=False, numberOfMappers=1, numberOfReducers=1, frozenClass=True, performanceTable=None): """Run the whole pure-Python map-reduce workflow, which may involve multiple iterations of map-reduce. @type inputData: list of Python objects @param inputData: The objects to use as a data stream. @type iterationLimit: int @param iterationLimit: The highest iteration to allow, with 1 being the first iteration. @type sort: bool @param sort: If True, perform a sorting step between the mapper and the reducer. @type parallel: bool @param parallel: If True, run the independent mappers and independent reducers as distinct threads. @type numberOfMappers: int @param numberOfMappers: Requested number of mappers. Input data will be divided evenly among them. @type numberOfReducers: int @param numberOfReducers: Requested number of reducers. @type frozenClass: bool @param frozenClass: If True, practice serializing and unserializing the class to ensure the independence of the mappers and the reducers. If False, skip this performance-limiting step. @type performanceTable: PerformanceTable or None @param performanceTable: A PerformanceTable for measuring the efficiency of the calculation. @rtype: 3-tuple of list, dict, int @return: List of output records, dictionary of output key-value pairs, and the final iteration number. """ self.logger.info("Start run with %d input records and %d cache keys.", len(inputData), len(self.metadata)) overheadPerformanceTable = PerformanceTable() if performanceTable is None: self._performanceTables = [overheadPerformanceTable] else: self._performanceTables = [ performanceTable, overheadPerformanceTable ] startTime = time.time() overheadPerformanceTable.begin("MapReduce.run") iteration = 0 while True: if iteration < len(self.chain): chain = self.chain[iteration] else: chain = self.chain[-1] overheadPerformanceTable.pause("MapReduce.run") outputRecords, outputKeyValues = self.iterate( inputData, iteration, sort, parallel, numberOfMappers, numberOfReducers, frozenClass) overheadPerformanceTable.unpause("MapReduce.run") if self.done: break if chain: inputData = outputRecords self.logger.info( "Chaining output to next iteration (%d intermediate records).", len(outputRecords)) else: self.logger.info( "Ignoring iteration output (%d ignored records).", len(outputRecords)) iteration += 1 if iterationLimit is not None and iteration >= iterationLimit: self.logger.info("Reached iteration limit of %d.", iterationLimit) self.done = True break overheadPerformanceTable.end("MapReduce.run") endTime = time.time() if self.logger.isEnabledFor(logging.INFO): stream = StringIO() stream.write( "%sPerformanceTable (concurrent times are added together; results are in CPU-seconds):%s%s" % (os.linesep, os.linesep, os.linesep)) self.performanceTable.look(stream=stream) stream.write("%sTotal wall time, including any concurrency: %g%s" % (os.linesep, endTime - startTime, os.linesep)) self.logger.info(stream.getvalue()) self.logger.info("Finished run with %d output records.", len(outputRecords)) return outputRecords, outputKeyValues, iteration