Example #1
    def run(self, input_reader, output_writer):
        """Run the MapReduce job on a single core."""
        start_time = datetime.datetime.now()
        self._mapred.reset()

        print "INFO: start job %s on a single core" % self._mapred.__class__.__name__

        # Map phase: feed every input record to the job's map()/map_partition()
        self._mapred.run_map(input_reader)

        # Optional combine phase, only if the job defines a combine() method
        if "combine" in dir(self._mapred):
            self._mapred.run_combine(self._mapred.data.items())

        # Reduce phase; jobs without a reduce() pass the mapped data through unchanged
        if "reduce" not in dir(self._mapred):
            self._mapred.data_reduced = self._mapred.data
        else:
            self._mapred.run_reduce(self._mapred.data.items())

        output_writer.write(self._mapred.post_reduce())

        print "INFO: end job %s in %s with mem size of %d" % (
            self._mapred.__class__.__name__,
            datetime.datetime.now() - start_time,
            mem.asizeof(self._mapred))
Example #2
    def profile(self, input_reader, sample_size=100, max_memory=1000, core=cpu_count()-1, hadoop_nodes=4):
        """
        Profile the MapReduce job against the input reader and return a recommendation plus diagnostics.

        @param sample_size: number of input records used to time the map phase (default: 100)
        @param max_memory: SMP memory limit available for the job in MB (default: 1 GB)
        @param core: number of local cores considered for the multi-core engine
        @param hadoop_nodes: number of Hadoop nodes assumed for the delay estimate

        @return: recommended engine name, diagnostic data
        """
        diagnostics = {}

        # A remote (e.g. HDFS) input can only be processed by Hadoop
        if input_reader.is_distant():
            return HADOOP, diagnostics

        total_size = input_reader.get_estimated_size()

        map_delay = 0.0

        self.reset()

        # Time the map phase on a sample of the input
        if 'map' in dir(self):
            for line in input_reader.sample(sample_size):
                start = time.time()
                self.map(line)
                map_delay += time.time() - start

        elif 'map_partition' in dir(self):
            start = time.time()
            self.map_partition(input_reader.sample(sample_size))
            map_delay += time.time() - start

        else:
            raise Exception("ERROR: You have to implement a map() or map_partition() method")

        # The sample cannot be larger than the input itself
        sample_size = sample_size if total_size >= sample_size else total_size

        mean_map_delay = map_delay / sample_size

        # Extrapolate the in-memory size of the mapped data to the full input, in MB
        map_data_mem = mem.asizeof(self.data) / 1000000.0 * total_size / sample_size

        diagnostics['estimated-input-size'] = total_size
        diagnostics['mean-map-delay'] = mean_map_delay
        diagnostics['estimated-mem-size'] = map_data_mem

        # Pick the engine: Hadoop if the mapped data does not fit in memory,
        # multi-core if the map phase is slow enough to amortize process overhead,
        # single core otherwise
        if map_data_mem >= max_memory:
            engine = HADOOP
            diagnostics['estimated-delay'] = total_size * mean_map_delay / hadoop_nodes
        else:
            if mean_map_delay >= 1.0e-4:
                if map_data_mem * core >= max_memory:
                    engine = HADOOP
                    diagnostics['estimated-delay'] = total_size * mean_map_delay / hadoop_nodes
                else:
                    engine = MULTI_CORE
                    diagnostics['estimated-delay'] = total_size * mean_map_delay / (core if core > 0 else 1)
            else:
                engine = SINGLE_CORE
                diagnostics['estimated-delay'] = total_size * mean_map_delay

        return engine, diagnostics
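The engine choice above reduces to two thresholds: whether the extrapolated map output fits in max_memory, and whether the per-record map latency (1.0e-4 s in the code) is large enough to be worth parallelizing. A standalone sketch of that decision rule, with made-up input numbers:

# Sketch of the decision rule used by profile() above, same thresholds;
# the numbers passed in are made up for illustration only.
def choose_engine(map_data_mem, mean_map_delay, max_memory=1000, core=3):
    if map_data_mem >= max_memory:
        return "hadoop"
    if mean_map_delay >= 1.0e-4:
        if map_data_mem * core >= max_memory:
            return "hadoop"
        return "multi_core"
    return "single_core"

print(choose_engine(map_data_mem=50.0, mean_map_delay=5e-4))    # multi_core
print(choose_engine(map_data_mem=2000.0, mean_map_delay=5e-4))  # hadoop
print(choose_engine(map_data_mem=50.0, mean_map_delay=1e-6))    # single_core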
Example #3
    def run(self, input_reader, output_writer, cpu=cpu_count()-1, cache_line=100000):
        """Run the MapReduce job across several worker processes."""
        start_time = datetime.datetime.now()

        print "INFO: start job %s on %d cores" % (self._mapred.__class__.__name__, cpu)

        # Map phase, distributed over the workers in chunks of cache_line records
        self._run_map(cpu, cache_line, input_reader)

        # Reduce phase; jobs without a reduce() pass the mapped data through unchanged
        if "reduce" not in dir(self._mapred):
            self._mapred.data_reduced = self._mapred.data
        else:
            # Too few keys to spread over the workers: reduce in-process
            if len(self._mapred.data) < cpu:
                self._mapred.run_reduce(self._mapred.data.items())
            else:
                self._run_reduce(cpu)

        output_writer.write(self._mapred.post_reduce())

        print "INFO: end job %s in %s with mem size of %d" % (
            self._mapred.__class__.__name__,
            datetime.datetime.now() - start_time,
            mem.asizeof(self))
Example #4
    def run(self, input_reader, output_writer):
        """Run the MapReduce job on Hadoop via Hadoop streaming."""
        start_time = datetime.datetime.now()

        print "INFO: start job %s on hadoop" % self._mapred.__class__.__name__

        # Connect to the cluster through WebHDFS
        hdfs_web_url = 'http://sandbox:50070/?user.name=predictiveds'  # getenv('HDFS_WEB_URL')
        (scheme, hostport, path, params, query, fragment) = urlparse(hdfs_web_url)
        host, port = hostport.split(':', 1)
        hadoop = WebHdfsClient(host, port, query)

        if getenv('POLYMR_HOME') is None:
            print "ERROR: $POLYMR_HOME has to be set to the polymr home directory"
            raise SystemError("$POLYMR_HOME has to be set to the polymr home directory")

        # Set the input formatter metadata so the streaming job can rebuild it
        format_class = input_reader.formatter.__class__
        input_source_file = inspect.getfile(format_class)
        input_module_name = format_class.__module__
        input_class_name = format_class.__name__

        self._mapred.params['_input_meta'] = {
                'input_class_name': input_class_name,
                'input_module_name': input_module_name,
                'input_source': input_source_file,
                'input_options': input_reader.formatter.options
                }

        # Store the job parameters in a file broadcast to the Hadoop nodes
        params_file_id = str(uuid.uuid1())
        cache_filename = '/var/tmp/%s' % params_file_id
        f = open(cache_filename, mode='w')
        f.write(json.dumps(self._mapred.params))
        f.close()

        # Manage the input types: remote inputs are used as-is, local ones are pushed to HDFS
        if input_reader.is_distant():
            hdfs_input = input_reader.filename
        else:
            hdfs_input = ".tmp/input-%s" % str(uuid.uuid1())
            hadoop.put_file(input_reader.to_file(), hdfs_input)

        if output_writer.is_distant():
            output_id = output_writer.filename
        else:
            output_id = ".tmp/output-%s" % str(uuid.uuid1())

        # Build the Hadoop streaming command line
        cmds = "$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*streaming*.jar -archives $POLYMR_HOME/polymr.zip#polymr -files $POLYMR_HOME/streamer.py,%s,%s,%s -input %s -output %s -mapper 'streamer.py mapper %s %s %s'" % (self._source_file, cache_filename, input_source_file, hdfs_input, output_id, self._module_name, self._class_name, params_file_id)
        if "combine" in dir(self._mapred):
            cmds += " -combiner 'streamer.py combiner %s %s %s'" % (self._module_name, self._class_name, params_file_id)

        if "reduce" in dir(self._mapred):
            cmds += " -reducer 'streamer.py reducer %s %s %s'" % (self._module_name, self._class_name, params_file_id)

        print "INFO: %s" % cmds
        subprocess.check_output(cmds, shell=True)

        # Get the result back: each output line is "key;json-encoded value"
        def load_line(line):
            key, value = line.split(";", 1)
            self._mapred.data_reduced[key] = [json.loads(value)]

        if output_writer.is_distant():
            pass  # nothing to do, the result already lives on HDFS
        elif output_writer.is_memory():
            output = hadoop.cat("%s" % output_id)
            map(load_line, output.strip().split("\n"))
            output_writer.write(self._mapred.post_reduce())
        else:
            hadoop.get_file(output_id, output_writer.filename)

        # Clean up the temporary HDFS files
        if not input_reader.is_distant():
            hadoop.rm(hdfs_input)

        if not output_writer.is_distant():
            hadoop.rm(output_id)

        print "INFO: end job %s in %s with mem size of %d" % (
            self._mapred.__class__.__name__,
            datetime.datetime.now() - start_time,
            mem.asizeof(self))