def Reduce(self, partitionFiles):
    """
    Call the user reduce function once per unique key in *partitionFiles*.

    The (key, value) pairs returned by
    ``Utils.loadFromFiles(partitionFiles, 'rb')`` are grouped by key, and
    ``self.reduceFunc`` is invoked with one ``(key, values)`` tuple per key.
    Keys are visited in sorted order when ``self.orderingGaurantee`` is
    truthy, otherwise in the grouping dictionary's iteration order.

    Args:
        partitionFiles: passed through unchanged to ``Utils.loadFromFiles``.

    Returns:
        A tuple ``(result, countReduceCalls, countSkippedRecords)`` where
        ``result`` is the list of values returned by ``self.reduceFunc``,
        ``countReduceCalls`` is the number of calls that succeeded and
        ``countSkippedRecords`` is the number of calls that raised and were
        skipped.

    Raises:
        Exception: whatever ``self.reduceFunc`` raises, when
            ``self.skipBadRecords`` is falsy.
    """
    countReduceCalls = 0
    countSkippedRecords = 0
    result = []

    # Group every value under its key.
    partitionedData = defaultdict(list)
    for key, value in Utils.loadFromFiles(partitionFiles, 'rb'):
        partitionedData[key].append(value)

    if self.orderingGaurantee:
        listKeys = sorted(partitionedData)
    else:
        listKeys = partitionedData.keys()

    for key in listKeys:
        try:
            reduced = self.reduceFunc((key, partitionedData[key]))
        except Exception:
            # Only ordinary errors count as "bad records".  A bare ``except``
            # would also swallow KeyboardInterrupt / SystemExit, turning a
            # shutdown request into a silently skipped record.
            if not self.skipBadRecords:
                raise
            countSkippedRecords += 1
        else:
            result.append(reduced)
            countReduceCalls += 1

    return (result, countReduceCalls, countSkippedRecords)
def main(self):
    """
    Main flow of the master process, invoked when the process starts.

    Steps, in order:
      1. Assign ``self.inputSplits`` to ``self.mapWorkers`` and loop until
         every input split has a truthy ``isProcessedBy``, calling
         ``self.pingAndCheckWorkers`` at most once per ping interval.
      2. Send ``('ExitCommand',)`` to the map workers and collect their
         output files.
      3. Group the map output files by partition key, wrap each group in an
         ``InputMetaData`` and append it to ``self.partitionSplits``.
      4. Repeat the assign / wait / exit sequence with ``self.reduceWorkers``.
      5. When ``self.debugFlag`` is truthy, compare the summed values of the
         map output with those of the reduce output.
      6. Call ``self.getStats()`` and print the run summary.

    Raises:
        RuntimeError: if an input split is still unprocessed after the map
            wait loop, or (debug mode only) if the map and reduce totals
            differ.
    """
    self.myPrint('***** Master *****')
    splitData = [split.inputData for split in self.inputSplits]
    self.myPrint('Input Splits: %s ' % str(splitData))
    beganAt = time.time()

    # ------------------------------ map phase ------------------------------
    self.initialSplitsAssignmentToWorkers(self.inputSplits, self.mapWorkers)
    pingInterval = int(self.workerTimeout / 3)
    lastPing = time.time()
    while not all(split.isProcessedBy for split in self.inputSplits):
        # NOTE(review): presumably a runtime label / yield point where
        # incoming worker results get handled -- confirm against the runtime.
        self._label_('ReceiveResults')
        now = time.time()
        if int(now - lastPing) >= pingInterval:
            self._label_('pingAndCheckWorkers')
            self.pingAndCheckWorkers(self.mapWorkers, now)
            lastPing = now

    # Sanity check on the loop's exit condition.
    if any(not split.isProcessedBy for split in self.inputSplits):
        raise RuntimeError('all inputSplits are not processed')

    self.send(('ExitCommand',), self.mapWorkers.keys())

    mapOutputFiles = []
    for worker in self.mapWorkers.values():
        mapOutputFiles.extend(worker.outputFiles.values())
    self.myPrint('Map output files: %s' % mapOutputFiles)
    self.myPrint('********* Map step completed *********')

    mapRecords = Utils.loadFromFiles(mapOutputFiles, 'rb')
    if self.debugFlag:
        totalMapValues = sum(value for _, value in mapRecords)
        self.myPrint(totalMapValues)
        self.myPrint(mapRecords)

    # Collect, per partition key, the file each map worker wrote for it.
    filesByPartition = defaultdict(list)
    for worker in self.mapWorkers.values():
        for partitionKey, outputFile in worker.outputFiles.items():
            filesByPartition[partitionKey].append(outputFile)
    self.myPrint('Grouped partition files: %s' % filesByPartition)

    for partitionFiles in filesByPartition.values():
        self.partitionSplits.append(InputMetaData(partitionFiles))

    # ----------------------------- reduce phase ----------------------------
    self.initialSplitsAssignmentToWorkers(self.partitionSplits,
                                          self.reduceWorkers)
    pingInterval = int(self.workerTimeout / 3)
    lastPing = time.time()
    while not all(split.isProcessedBy for split in self.partitionSplits):
        self._label_('ReceiveResults')
        now = time.time()
        if int(now - lastPing) >= pingInterval:
            self._label_('pingAndCheckWorkers')
            self.pingAndCheckWorkers(self.reduceWorkers, now)
            lastPing = now

    self.send(('ExitCommand',), self.reduceWorkers.keys())

    reduceOutputFiles = []
    for worker in self.reduceWorkers.values():
        reduceOutputFiles.extend(worker.outputFiles)
    self.myPrint('Reduce output files: %s' % reduceOutputFiles)
    self.myPrint('********* Reduce step completed *********')

    reduceRecords = Utils.loadFromFiles(reduceOutputFiles, 'rb')
    if self.debugFlag:
        totalReduceValues = sum(value for _, value in reduceRecords)
        self.myPrint(totalReduceValues)
        self.myPrint(reduceRecords)
    # Debug-only consistency check between the two phases.
    if self.debugFlag and totalMapValues != totalReduceValues:
        raise RuntimeError('outputs of Map and Reduce steps do not match')

    finishedAt = time.time()
    self.getStats()
    self._label_('exit')
    self.myPrint('Master Exiting')

    # ------------------------------- summary -------------------------------
    print('\n')
    print('No of Mappers: %d' % len(self.mapWorkers))
    print('No of Reducers: %d' % len(self.reduceWorkers))
    print('No of Map tasks: %d' % self.noOfMapTasks())
    print('No of Reduce tasks: %d' % self.noOfReduceTasks())
    print('No of failed Map workers: %d' % self.noOfFailedMapWorkers())
    print('No of failed Reduce workers: %d' % self.noOfFailedReduceWorkers())
    print('No of Map function calls: %d' % self.noOfUserMapCalls())
    print('No of Reduce function calls: %d' % self.noOfUserReduceCalls())
    print('Total time elapsed: %.2fsec' % (finishedAt - beganAt))