Esempio n. 1
0
 def setUpCluster(self):
     """Start the pilots and configure a Hadoop cluster on them.

     For every pilot compute, a data unit is created to collect the
     generated Hadoop configuration files and a setup compute unit is
     submitted that runs ``self._setupScript`` against the pilot's
     node list.  Blocks until all setup tasks have completed.
     """
     self.startPilot()
     pcs = self.getPilotComputes()

     logger.info("Setup Hadoop Cluster")
     hadoopSetupTasks = []
     # One setup task per pilot; index i pairs each pilot compute with
     # its entry in self._pilots / self._pilotInfo.
     for i, pilot in enumerate(pcs):
         desc = util.getEmptyDU(self._pilots[i]['pilot_compute'])
         self._pilotInfo[i]['hadoopConfDir'] = self.compute_data_service.submit_data_unit(desc)
         # Pin the setup task to the same resource as its config DU.
         setUpTask = util.setAffinity({}, self._pilotInfo[i]['hadoopConfDir'].data_unit_description)
         # Collect the generated Hadoop config files into the DU.
         setUpTask['output_data'] = [
             {
                 self._pilotInfo[i]['hadoopConfDir'].get_url(): ['mapred-site.xml', 'core-site.xml', 'slaves']
             }
         ]
         setUpTask['executable'] = "python"
         nodes = pilot.get_nodes()
         setUpTask['arguments'] = [self._setupScript, ",".join(nodes)]

         hadoopSetupTasks.append(self.compute_data_service.submit_compute_unit(setUpTask))
     util.waitCUs(hadoopSetupTasks)
     logger.info("Cluster ready")
Esempio n. 2
0
 def _chunk(self):
     """Split input data into chunks when a chunk task is defined.

     For each file of every input DU a chunk compute unit is submitted;
     the chunked files of one input DU are collected into a fresh
     output DU that is appended to ``self._chunkDus``.  No-op when no
     chunk description was configured.  On failure the job is cleaned
     up and aborted.
     """
     if not self._chunkDesc:
         return

     logger.debug("Chunking input data")
     chunkCUs = []
     try:
         for inputDu in self._inputDus:
             # Output DU (co-located with the input) for this DU's chunks.
             temp = util.getEmptyDU(inputDu.data_unit_description)
             temp = self.compute_data_service.submit_data_unit(temp)
             temp.wait()
             for fName in inputDu.list_files():
                 # Work on a per-task copy so already-submitted CUs keep
                 # their own description instead of sharing one mutated
                 # dict (matches the copy.copy used in _map/_reduce).
                 chunkTask = util.setAffinity(copy.copy(self._chunkDesc), inputDu.data_unit_description)
                 # Input filename and chunk-output filename prefix.
                 chunkTask['arguments'] = [fName, "%s-%s" % (fName, constant.CHUNK_FILE_PREFIX)]
                 # Collect chunked files in the output DU.
                 chunkTask['output_data'] = [{temp.get_url(): ['*-chunk-*']}]
                 # Stage the input file into the chunk CU.
                 chunkTask["input_data"] = [{inputDu.get_url(): [fName]}]
                 if self._chunkExe is not None:
                     chunkTask["input_data"].append(self._chunkExe.get_url())

                 chunkCUs.append(self.compute_data_service.submit_compute_unit(chunkTask))
             self._chunkDus.append(temp)

         # Wait for the chunk DUs/CUs.
         logger.debug("Wait for chunk DUS/CUS")
         util.waitDUs(self._chunkDus)
         util.waitCUs(chunkCUs)
     except Exception as ex:
         self._clean(ex, "Chunk failed - Abort")
Esempio n. 3
0
 def _loadExecutables(self):
     """Upload the chunk/map/reduce executables into Pilot-Data.

     For every phase description carrying a ``files`` entry, a data
     unit is created on the first pilot and stored on the matching
     ``self._chunkExe`` / ``self._mapExe`` / ``self._reduceExe``
     attribute.  Attributes of phases without files are left untouched.
     Blocks until the executable DUs are ready.
     """
     # The three phases are handled identically; loop instead of
     # copy-pasting the submit logic three times.
     for attrName, phaseDesc in (('_chunkExe', self._chunkDesc),
                                 ('_mapExe', self._mapDesc),
                                 ('_reduceExe', self._reduceDesc)):
         if phaseDesc and phaseDesc.get('files', None):
             desc = util.getEmptyDU(self._pilots[0]['pilot_compute'])
             desc['file_urls'] = phaseDesc['files']
             setattr(self, attrName, self.compute_data_service.submit_data_unit(desc))

     # Wait for the executable DUs.  NOTE(review): entries may be None
     # when a phase has no files -- assumes util.waitDUs tolerates
     # None entries, as the original code passed them too.
     util.waitDUs([self._chunkExe, self._mapExe, self._reduceExe])
Esempio n. 4
0
    def _reduce(self):
        """Reduce phase: run one reduce task per reduce data unit.

        Registers the map-phase output files with each reduce DU,
        creates a single output DU for the reduce results, and submits
        one reduce CU per reduce DU.  Blocks until all reduce CUs
        finish; on failure the job is cleaned up and aborted.
        """
        logger.debug("Creating DUS to store Reduce Output results")
        # Single DU collecting the output of all reduce tasks.
        temp = util.getEmptyDU(self._pilots[0]['pilot_compute'])
        self._outputDu = self.compute_data_service.submit_data_unit(temp)
        util.waitDUs([self._outputDu])

        pdString = "%s:%s" % (self.pdUrl.netloc, self.pdUrl.path)
        outputDir = os.path.join(pdString, self._outputDu.get_url().split(":")[-1])
        reduceArgs = self._reduceDesc.get('arguments', [])

        # Register the files the map phase wrote into each reduce DU's
        # directory so they can be staged into the reduce CUs.
        rtemp = []
        for rdu in self.reduceDus:
            mapOutPath = os.path.join(self.pdUrl.path, rdu.get_url().split(":")[-1])
            rduFiles = [os.path.join(mapOutPath, f) for f in os.listdir(mapOutPath)]
            rdu.add_files(rduFiles, exists=True)
            rtemp.append(rdu)
        util.waitDUs(rtemp)

        # File patterns each reduce task consumes -- invariant across
        # reduce DUs, so computed once outside the loop.
        if self._iterOutputPrefixes:
            reduceFiles = [pref + "*" for pref in self._iterOutputPrefixes]
        else:
            reduceFiles = ['reduce-*']

        reduceCUs = []
        try:
            logger.debug("Create & submitting Reduce tasks")
            for rdu in self.reduceDus:
                # Copy so each submitted CU gets its own description.
                reduceTask = util.setAffinity(copy.copy(self._reduceDesc), rdu.data_unit_description)
                reduceTask['input_data'] = [rdu.get_url()]
                reduceTask['arguments'] = [outputDir, ",".join(reduceFiles)] + reduceArgs
                if self._reduceExe is not None:
                    reduceTask["input_data"].append(self._reduceExe.get_url())
                reduceCUs.append(self.compute_data_service.submit_compute_unit(reduceTask))

            util.waitCUs(reduceCUs)
        except Exception as ex:
            self._clean(ex, "Reduce Phase failed - Abort")
Esempio n. 5
0
 def _loadInputData(self):
     """Load input data into Pilot-Data.

     Pilots whose ``input_url`` already points at a Pilot-Data store
     (``redis`` scheme) are reconnected; otherwise their files are
     uploaded into a fresh DU and the pilot's ``input_url`` is
     rewritten to the DU URL so later runs can reuse the uploaded
     data.  Blocks until all input DUs are ready.
     """
     for pilot in self._pilots:
         if pilot['input_url'].startswith('redis'):
             # Reconnect to already-uploaded Pilot-Data.
             self._inputDus.append(util.getDuUrl(pilot['input_url']))
         else:
             desc = util.getEmptyDU(pilot['pilot_compute'])
             desc['file_urls'] = util.getFileUrls(pilot['input_url'], self._pdFTP)
             temp = self.compute_data_service.submit_data_unit(desc)
             pilot['input_url'] = temp.get_url()
             self._inputDus.append(temp)
     util.waitDUs(self._inputDus)

     logger.debug("New Pilot-MapReduce descriptions with updated PD URLS \n"
                  "use these descriptions to reuse already uploaded data")
     # Plain loop instead of map(): this is side-effect-only iteration,
     # and map() would be a lazy no-op under Python 3.
     for pilotDesc in self._pilots:
         logger.debug(pilotDesc)
Esempio n. 6
0
    def _map(self):
        """Map phase: run one map task per chunk file.

        Creates one reduce DU per reducer to receive map output, then
        submits a map CU for every file in every chunk DU.  Each map
        CU receives the chunk file name and the comma-separated list
        of reduce output directories as its first arguments.  Blocks
        until all map CUs finish; on failure the job is cleaned up
        and aborted.
        """
        logger.debug("Creating DUS to store Map Output results")
        # One output DU per reducer; map tasks partition results into them.
        for _ in range(self._nbrReduces):
            temp = util.getEmptyDU(self._pilots[0]['pilot_compute'])
            self.reduceDus.append(self.compute_data_service.submit_data_unit(temp))
        util.waitDUs(self.reduceDus)

        pdString = "%s:%s" % (self.pdUrl.netloc, self.pdUrl.path)
        rduDirs = [os.path.join(pdString, rdu.get_url().split(":")[-1]) for rdu in self.reduceDus]
        rduString = ",".join(rduDirs)
        # User-supplied extra map arguments -- invariant, fetched once.
        mapArgs = self._mapDesc.get('arguments', [])

        mapCUs = []
        try:
            logger.debug("Create & submitting Map tasks")
            for cdu in self._chunkDus:
                for cfName in cdu.list_files():
                    # Copy so each submitted CU gets its own description.
                    mapTask = util.setAffinity(copy.copy(self._mapDesc), cdu.data_unit_description)
                    mapTask['arguments'] = [cfName, rduString] + mapArgs
                    mapTask["input_data"] = [{cdu.get_url(): [cfName]}]
                    if self._iterDu:
                        # Stage iteration data and pass its file names along.
                        mapTask["input_data"].append(self._iterDu.get_url())
                        for dui in self._iterDu.to_dict()["data_unit_items"]:
                            mapTask["arguments"].append(dui.__dict__["filename"])
                    if self._mapExe is not None:
                        mapTask["input_data"].append(self._mapExe.get_url())
                    mapCUs.append(self.compute_data_service.submit_compute_unit(mapTask))

            util.waitCUs(mapCUs)
        except Exception as ex:
            self._clean(ex, "Map Phase failed - Abort")