Example #1
0
 def Reduce(self, partitionFiles):
     """
     Call the user Reduce function once per unique <key, list(values)>.

     The (key, value) pairs are loaded from ``partitionFiles`` via
     ``Utils.loadFromFiles`` and grouped by key. When
     ``self.orderingGaurantee`` is set, keys are processed in sorted order.

     Returns a tuple ``(result, countReduceCalls, countSkippedRecords)``:
     the list of values returned by ``self.reduceFunc``, the number of
     successful calls, and the number of keys skipped because the call
     raised while ``self.skipBadRecords`` was set.

     Raises whatever ``self.reduceFunc`` raises when ``self.skipBadRecords``
     is not set.
     """
     countReduceCalls = 0
     countSkippedRecords = 0
     result = []

     # Group every value under its key.
     partitionedData = defaultdict(list)
     for key, value in Utils.loadFromFiles(partitionFiles, 'rb'):
         partitionedData[key].append(value)

     if self.orderingGaurantee:
         listKeys = sorted(partitionedData)
     else:
         listKeys = list(partitionedData)

     for key in listKeys:
         try:
             reduced = self.reduceFunc((key, partitionedData[key]))
         except Exception:
             # Only ordinary errors count as "bad records"; KeyboardInterrupt
             # and SystemExit must still propagate so the process can stop.
             if not self.skipBadRecords:
                 raise
             countSkippedRecords += 1
         else:
             result.append(reduced)
             countReduceCalls += 1
     return (result, countReduceCalls, countSkippedRecords)
Example #2
0
 def main(self):
     """
     Main method invoked when the process starts; drives the whole job.

     Flow of execution:
       1. Assign the input splits to the map workers and wait until every
          split is marked as processed, pinging the workers periodically.
       2. Tell the map workers to exit and collect their output files.
       3. Group the map output files by partition key, wrap each group in
          an ``InputMetaData`` and assign those to the reduce workers.
       4. Wait for the reduce workers the same way, tell them to exit and
          collect their output files.
       5. Print the run statistics.

     Raises RuntimeError if an input split is still unprocessed after the
     map wait loop, or (only when ``self.debugFlag`` is set) if the sum of
     the map output values differs from the sum of the reduce output values.
     """
     self.myPrint('***** Master *****')
     self.myPrint('Input Splits: %s ' % (str([inputSplit.inputData for inputSplit in self.inputSplits])))
     startTime = time.time()

     # ---------------- Map step ----------------
     self.initialSplitsAssignmentToWorkers(self.inputSplits, self.mapWorkers)
     # Workers are pinged three times per timeout period.
     pingFreq = int(self.workerTimeout / 3)
     lastPingedTime = time.time()
     while any(not inputSplit.isProcessedBy for inputSplit in self.inputSplits):
         # NOTE(review): nothing in this loop body sets isProcessedBy, so
         # _label_ is presumably the point where incoming worker messages
         # get handled -- confirm against the process framework.
         self._label_('ReceiveResults')
         currentTime = time.time()
         if int(currentTime - lastPingedTime) >= pingFreq:
             self._label_('pingAndCheckWorkers')
             self.pingAndCheckWorkers(self.mapWorkers, currentTime)
             lastPingedTime = currentTime
     if not all(inputSplit.isProcessedBy for inputSplit in self.inputSplits):
         # Sanity check: raise with the intended message (a bare `raise`
         # outside an except block would discard it).
         raise RuntimeError('all inputSplits are not processed')
     self.send(('ExitCommand',), self.mapWorkers.keys())

     mapOutputFiles = []
     for mapWorker in self.mapWorkers.values():
         mapOutputFiles.extend(mapWorker.outputFiles.values())
     self.myPrint('Map output files: %s' % (mapOutputFiles))
     self.myPrint('********* Map step completed *********')

     listKeyValue = Utils.loadFromFiles(mapOutputFiles, 'rb')
     if self.debugFlag:
         # Debug-only checksum; assumes the values are summable -- TODO confirm.
         totalMapValues = sum(v for (k, v) in listKeyValue)
         self.myPrint(totalMapValues)
     self.myPrint(listKeyValue)

     # ---------------- Shuffle: group map outputs by partition ----------------
     partitionOuputFiles = defaultdict(list)
     for mapWorker in self.mapWorkers.values():
         for partitionKey, outputFile in mapWorker.outputFiles.items():
             partitionOuputFiles[partitionKey].append(outputFile)
     self.myPrint('Grouped partition files: %s' % (partitionOuputFiles))
     for partitionFiles in partitionOuputFiles.values():
         self.partitionSplits.append(InputMetaData(partitionFiles))

     # ---------------- Reduce step ----------------
     self.initialSplitsAssignmentToWorkers(self.partitionSplits, self.reduceWorkers)
     pingFreq = int(self.workerTimeout / 3)
     lastPingedTime = time.time()
     while any(not partitionSplit.isProcessedBy for partitionSplit in self.partitionSplits):
         self._label_('ReceiveResults')
         currentTime = time.time()
         if int(currentTime - lastPingedTime) >= pingFreq:
             self._label_('pingAndCheckWorkers')
             self.pingAndCheckWorkers(self.reduceWorkers, currentTime)
             lastPingedTime = currentTime
     self.send(('ExitCommand',), self.reduceWorkers.keys())

     reduceOutputFiles = []
     for reduceWorker in self.reduceWorkers.values():
         reduceOutputFiles.extend(reduceWorker.outputFiles)
     self.myPrint('Reduce output files: %s' % (reduceOutputFiles))
     self.myPrint('********* Reduce step completed *********')

     listKeyValue = Utils.loadFromFiles(reduceOutputFiles, 'rb')
     if self.debugFlag:
         totalReduceValues = sum(v for (k, v) in listKeyValue)
         self.myPrint(totalReduceValues)
     self.myPrint(listKeyValue)
     if self.debugFlag and totalMapValues != totalReduceValues:
         raise RuntimeError('outputs of Map and Reduce steps do not match')

     # ---------------- Statistics ----------------
     endTime = time.time()
     self.getStats()
     self._label_('exit')
     self.myPrint('Master Exiting')
     print('\n')
     print('No of Mappers: %d' % (len(self.mapWorkers)))
     print('No of Reducers: %d' % (len(self.reduceWorkers)))
     print('No of Map tasks: %d' % (self.noOfMapTasks()))
     print('No of Reduce tasks: %d' % (self.noOfReduceTasks()))
     print('No of failed Map workers: %d' % (self.noOfFailedMapWorkers()))
     print('No of failed Reduce workers: %d' % (self.noOfFailedReduceWorkers()))
     print('No of Map function calls: %d' % (self.noOfUserMapCalls()))
     print('No of Reduce function calls: %d' % (self.noOfUserReduceCalls()))
     print('Total time elapsed: %.2fsec' % (endTime - startTime))