Example #1
 def setUpCluster(self):
     """ 
         Setup Hadoop Cluster 
     """
     
     self.startPilot()
     pcs = self.getPilotComputes()
     
     logger.info("Setup Hadoop Cluster")
     hadoopSetupTasks = []
     for i, pilot in enumerate(pcs):
         setUpTask = {}                        
         desc = util.getEmptyDU(self._pilots[i]['pilot_compute'])
         self._pilotInfo[i]['hadoopConfDir'] = self.compute_data_service.submit_data_unit(desc)
         setUpTask = util.setAffinity(setUpTask, self._pilotInfo[i]['hadoopConfDir'].data_unit_description)
         setUpTask['output_data'] = [
                                      {
                                       self._pilotInfo[i]['hadoopConfDir'].get_url(): ['mapred-site.xml','core-site.xml','slaves']
                                      }
                                    ]
         setUpTask['executable'] = "python"
         nodes = pilot.get_nodes()
         setUpTask['arguments'] = [self._setupScript, ",".join(nodes)]
         
         hadoopSetupTasks.append(self.compute_data_service.submit_compute_unit(setUpTask))            
     util.waitCUs(hadoopSetupTasks)
     logger.info("Cluster ready")
Example #2
 def _chunk(self): 
     """ Chunks input data if Chunk task is defined """
     
     if self._chunkDesc:               
         """ for each file in inputDU create a Chunk task """
         logger.debug("Chunking input data")
         chunkCUs = []
         try:
             for inputDu in self._inputDus:
                 temp = util.getEmptyDU(inputDu.data_unit_description)
                 temp = self.compute_data_service.submit_data_unit(temp)
                 temp.wait()
                 for fName in inputDu.list_files():
                     # for user defined ChunkDesc assign affinity.
                     self._chunkDesc = util.setAffinity(self._chunkDesc, inputDu.data_unit_description)
                     # Pass the input filename and output filename as arguments.
                     self._chunkDesc['arguments'] = [fName, "%s-%s" % (fName, constant.CHUNK_FILE_PREFIX)]
                     # Collect chunked files in output_du
                     self._chunkDesc['output_data'] = [ { temp.get_url(): ['*-chunk-*'] } ]
                     # Get input file to Chunk CU. 
                     self._chunkDesc["input_data"] = [ {inputDu.get_url(): [fName]} ] 
                     if self._chunkExe is not None:
                         self._chunkDesc["input_data"].append(self._chunkExe.get_url())
                                                         
                     chunkCUs.append(self.compute_data_service.submit_compute_unit(self._chunkDesc))
                 self._chunkDus.append(temp)
     
             # Wait for the chunk DUS    
             logger.debug("Wait for chunk DUS/CUS")            
             util.waitDUs(self._chunkDus)
             util.waitCUs(chunkCUs)
         except Exception as ex:
             self._clean(ex, "Chunk failed - Abort")
Example #3
    def _reduce(self):
        """ Reduce Phase """
        
        logger.debug("Creating DUS to store Reduce Output results")
        # Create DU to collect output data of all the reduce tasks
        temp = util.getEmptyDU(self._pilots[0]['pilot_compute'])
        self._outputDu = self.compute_data_service.submit_data_unit(temp)                
        util.waitDUs([self._outputDu])

        # Create reduce for each reduce DU 
        reduceCUs = [] 
        pdString = "%s:%s" % (self.pdUrl.netloc, self.pdUrl.path)
        outputDir = os.path.join(pdString, self._outputDu.get_url().split(":")[-1])
        reduceArgs = self._reduceDesc.get('arguments', []) 
        
        rtemp = []
        for rdu in self.reduceDus:
            mapOutPath = os.path.join(self.pdUrl.path, rdu.get_url().split(":")[-1])
            rduFiles = [os.path.join(mapOutPath, f) for f in os.listdir(mapOutPath)]
            rdu.add_files(rduFiles, exists=True)
            rtemp.append(rdu)
        util.waitDUs(rtemp)

        try:
            for rdu in self.reduceDus:                
                reduceTask = util.setAffinity(copy.copy(self._reduceDesc), rdu.data_unit_description)
                reduceTask['input_data'] = [rdu.get_url()]
                reduceFiles = []                
                                
                if self._iterOutputPrefixes:                    
                    for pref in self._iterOutputPrefixes:
                        reduceFiles.append(pref+"*")
                else:
                    reduceFiles.append('reduce-*')

                reduceTask['arguments'] = [outputDir, ",".join(reduceFiles)] + reduceArgs

                    
                if self._reduceExe is not None:
                    reduceTask["input_data"].append(self._reduceExe.get_url())                    
                reduceCUs.append(self.compute_data_service.submit_compute_unit(reduceTask))
               
            # Wait for the reduce CUs
            logger.debug("Waiting for Reduce tasks")
            util.waitCUs(reduceCUs)
        except Exception as ex:
            self._clean(ex, "Reduce Phase failed - Abort")                  
Example #4
 def stopCluster(self):
     """ Tear down spark cluster """
     
     logger.info("Stopping spark Cluster")
     sparkStopTasks = []
     for i, pilot in enumerate(self._pilots):
         stopTask = util.setAffinity({}, pilot['pilot_compute'])
         stopTask['executable'] = "python"
         stopTask['input_data'] = [self._pilotInfo[i]['sparkConfDir'].get_url()]
         stopTask['arguments'] = [self._stopScript, ",".join(self.nodes)]
         
         sparkStopTasks.append(self.compute_data_service.submit_compute_unit(stopTask))            
     util.waitCUs(sparkStopTasks)
     self.stopPilot()
Example #5
 def submitJob(self, desc):
     """ Submit Spark job description """
     logger.info("Submitting Spark Job")
     sparkTasks = []
     for i, pilot in enumerate(self._pilots):
         task = {} 
         task.update(desc)                       
         task = util.setAffinity(task, pilot['pilot_compute'])
         task['executable'] = 'SPARK_CONF_DIR=$PWD;' + task['executable']
         task['input_data'] = [self._pilotInfo[i]['sparkConfDir'].get_url()]            
         sparkTasks.append(self.compute_data_service.submit_compute_unit(task))            
     util.waitCUs(sparkTasks)            
     return sparkTasks   
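
A hedged usage sketch of the Spark lifecycle these methods implement. Only stopCluster and submitJob appear in the snippets; the PilotSpark class name, its constructor, the matching setUpCluster call, and the job description values are assumptions made for illustration.

spark = PilotSpark(pilot_compute_descriptions)   # hypothetical constructor
spark.setUpCluster()                             # assumed Spark counterpart of Example #1
tasks = spark.submitJob({
    'executable': 'spark-submit wordcount.py',   # submitJob prepends SPARK_CONF_DIR=$PWD;
    'arguments': ['/data/input'],                # hypothetical job arguments
    'number_of_processes': 1,
})
spark.stopCluster()                              # stop the daemons, then the pilots
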
Example #6
 def stopCluster(self):
     """ Tear down Hadoop cluster """
     
     logger.info("Stopping Hadoop Cluster")
     hadoopStopTasks = []
     for i, pilot in enumerate(self._pilots):
         stopTask = util.setAffinity({}, pilot['pilot_compute'])
         stopTask['executable'] = "python"
         stopTask['input_data'] = [self._pilotInfo[i]['hadoopConfDir'].get_url()]
         stopTask['arguments'] = [self._stopScript]

         hadoopStopTasks.append(self.compute_data_service.submit_compute_unit(stopTask))
     util.waitCUs(hadoopStopTasks)
     self.stopPilot()
Example #7
 def submitJob(self, desc):
     """ Submit Hadoop job description """
     
     logger.info("Submitting Hadoop Jobs")
     hadoopTasks = []
     for i, pilot in enumerate(self._pilots):
         task = {} 
         task.update(desc)                       
         task = util.setAffinity(task, pilot['pilot_compute'])
         task['executable'] = 'HADOOP_CONF_DIR=$PWD;' + task['executable']
         task['input_data'] = [self._pilotInfo[i]['hadoopConfDir'].get_url()]            
         hadoopTasks.append(self.compute_data_service.submit_compute_unit(task))            
     util.waitCUs(hadoopTasks)            
     return hadoopTasks    
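
An example of a description that could be passed to the Hadoop submitJob above; the jar name and arguments are hypothetical. Because submitJob prefixes the executable with HADOOP_CONF_DIR=$PWD; and stages hadoopConfDir as input_data, the job runs against the configuration generated during setUpCluster.

wordcount = {
    'executable': 'hadoop jar hadoop-examples.jar wordcount input output',   # hypothetical command
    'number_of_processes': 1,
}
hadoopTasks = cluster.submitJob(wordcount)   # 'cluster' is the framework object owning submitJob
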
Example #8
    def _map(self):
        """ Map Phase """
        
        # Create output DUS one for each reduce to collect all the Map Task results 
        logger.debug("Creating DUS to store Map Output results")
        for _ in range(self._nbrReduces):
            temp = util.getEmptyDU(self._pilots[0]['pilot_compute'])
            self.reduceDus.append(self.compute_data_service.submit_data_unit(temp))        
        util.waitDUs(self.reduceDus)
        
        pdString = "%s:%s" % (self.pdUrl.netloc, self.pdUrl.path)
        rduDirs = [os.path.join(pdString, rdu.get_url().split(":")[-1]) for rdu in self.reduceDus]
        rduString = ",".join(rduDirs)                        
        

        # Create task for each chunk in all the chunk data units
        
        mapCUs = []
        try:
            for cdu in self._chunkDus:
                for cfName in cdu.list_files():
                    mapTask = util.setAffinity(copy.copy(self._mapDesc), cdu.data_unit_description)
                    mapTask['arguments'] = [cfName, rduString] + self._mapDesc.get('arguments', [])
                    mapTask["input_data"] = [ {cdu.get_url(): [cfName]} ]                    
                    if self._iterDu:
                        mapTask["input_data"].append(self._iterDu.get_url())
                        for dui in self._iterDu.to_dict()["data_unit_items"]:
                            mapTask["arguments"].append(dui.__dict__["filename"])
                    if self._mapExe is not None:
                        mapTask["input_data"].append(self._mapExe.get_url())                        
                    mapCUs.append(self.compute_data_service.submit_compute_unit(mapTask))
        
            # Wait for the map CUs
            logger.debug("Waiting for Map tasks")
            util.waitCUs(mapCUs)
        except Exception as ex:
            self._clean(ex, "Map Phase failed - Abort")