Example #1
def parse_file(file_name, cores, jobs_per_core, stats):
    # Map phase: parse the file in parallel chunks, each child pickling its
    # counters into a temp file; the parent reduces by merging the pickles
    # as the jobs finish.
    file_size = os.path.getsize(file_name)
    chunks = int(cores * jobs_per_core)
    stats.initial_report(file_name, file_size, chunks, cores)
    
    queue = pprocess.Queue(limit=cores)
    parse_chunk_async = queue.manage(pprocess.MakeParallel(parse_chunk))
    
    temp_dir = tempfile.mkdtemp('.wf2')
    try:
        for (start, end) in file_offsets(file_size, chunks):
            parse_chunk_async(file_name, temp_dir, start, end)
        
        total = dict( (count_name, LogCounter(count_name)) for count_name in count_names )
        
        stats.waiting()
        for (temp_file_name, job_pid, job_time) in queue:
            stats.received_job_result()
            start_reduce_time = time.time()
            
            with open(temp_file_name, 'rb') as f:
                mapper = pickle.load(f)
            for (count_name, counter) in mapper.get_counters().iteritems():
                total[count_name].add_counter(counter)
            os.remove(temp_file_name)
            
            stats.job_report(job_pid, job_time, time.time() - start_reduce_time)
            stats.waiting()
    
    finally:
        shutil.rmtree(temp_dir)
    
    for name in count_names:
        print total[name].report()
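
All of these examples rely on a file_offsets(file_size, chunks) helper that is not shown. A minimal sketch, assuming it simply splits the byte range into contiguous pieces and leaves newline handling to the reader (see the seek_open sketch after Example #4):

def file_offsets(file_size, n_pieces):
    # Split [0, file_size) into n_pieces contiguous (start, end) byte ranges.
    # Snapping boundaries to line breaks is assumed to happen in the reader.
    piece = file_size // n_pieces
    start = 0
    for _ in range(n_pieces - 1):
        yield (start, start + piece)
        start += piece
    yield (start, file_size)  # the last piece absorbs any remainder
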
Example #2
def parse_file(file_name, cores, jobs_per_core, stats):
    # Variant: one long-lived reducer process per count name (started with
    # pprocess.start) merges intermediate results as the map jobs finish,
    # instead of reducing in the parent.
    file_size = os.path.getsize(file_name)
    chunks = int(cores * jobs_per_core)
    
    queue = pprocess.Queue(limit=cores)
    parse_chunk_async = queue.manage(pprocess.MakeParallel(parse_chunk))
    
    temp_dir = tempfile.mkdtemp('.wf2')
    
    mappers = {}
    for count_name in count_names:
        mappers[count_name] = pprocess.start(map_count, count_name, temp_dir)
    
    c = 0
    for (start, end) in file_offsets(file_size, chunks):
        c += 1
        parse_chunk_async(file_name, temp_dir, start, end, c)
    
    stats._output('all map jobs queued')
    for job_id in queue:
        start_reduce_time = time.time()
        
        stats._output('map job finished: pid=%d' % job_id)
        for count_name in count_names:
            mappers[count_name].send(job_id)
    
    stats._output('all map jobs finished')
    for count_name in count_names:
        mapper = mappers[count_name]
        mapper.send(None)
        with open(mapper.receive(), 'rb') as f:
            print pickle.load(f)
        stats._output('reduce job finished: name=%s' % count_name)
    
    shutil.rmtree(temp_dir)
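
The map_count worker that Example #2 starts with pprocess.start is not shown. A minimal sketch, assuming pprocess hands the child its end of the channel as the first argument, that parse_chunk leaves one pickled counter per (count name, job) in temp_dir under a name derived from the job id, and that LogCounter (as in Examples #1 and #3) is available at module level; the file naming is hypothetical:

import os
import pickle

def map_count(channel, count_name, temp_dir):
    total = LogCounter(count_name)
    while True:
        job_id = channel.receive()
        if job_id is None:          # parent signals that all map jobs are done
            break
        # Hypothetical naming convention for the intermediate files.
        chunk_file = os.path.join(temp_dir, '%s.%s' % (count_name, job_id))
        with open(chunk_file, 'rb') as f:
            total.add_counter(pickle.load(f))
        os.remove(chunk_file)
    # Hand back a file name whose contents the parent can pickle.load().
    result_file = os.path.join(temp_dir, '%s.result' % count_name)
    with open(result_file, 'wb') as f:
        pickle.dump(total.report(), f)
    channel.send(result_file)
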
Example #3
def parse_file(file_name, cores, jobs_per_core, stats):
    # Variant without temp files: each child returns its mapper object, which
    # pprocess pickles back through the queue, and the parent reduces in place.
    file_size = os.path.getsize(file_name)
    chunks = int(cores * jobs_per_core)
    stats.initial_report(file_name, file_size, chunks, cores)
    
    queue = pprocess.Queue(limit=cores)
    parse_chunk_async = queue.manage(pprocess.MakeParallel(parse_chunk))
    
    for (start, end) in file_offsets(file_size, chunks):
        parse_chunk_async(file_name, start, end)
    
    total = dict( (count_name, LogCounter(count_name)) for count_name in count_names )
    
    stats.waiting()
    for (mapper, job_pid, job_time) in queue:
        stats.received_job_result()
        start_reduce_time = time.time()
        
        for (count_name, counter) in mapper.get_counters().iteritems():
            total[count_name].add_counter(counter)
        
        stats.job_report(job_pid, job_time, time.time() - start_reduce_time)
        stats.waiting()
    
    for name in count_names:
        print total[name].report()
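
Here pprocess.MakeParallel sends the return value of parse_chunk back through the channel, so the queue yields (mapper, pid, elapsed) tuples directly and no temp files are needed. A rough sketch of such a parse_chunk, where LogMapper and its process() method are hypothetical names; the only requirement imposed by the loop above is that the first element of the tuple provides get_counters() returning a dict keyed by count name:

import os
import time

def parse_chunk(file_name, start, end):
    # Runs in a child process; the returned tuple is pickled back to the
    # parent by pprocess.
    start_time = time.time()
    mapper = LogMapper()         # hypothetical: one counter per count name
    for line in seek_open(file_name, start, end):
        mapper.process(line)     # hypothetical per-line counting
    return (mapper, os.getpid(), time.time() - start_time)
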
Example #4
def read_seek():
    # Nested helper: file_name and n_pieces are taken from the enclosing
    # scope (not shown here).
    for (start, end) in file_offsets(os.path.getsize(file_name), n_pieces):
        # print "reading piece: %d -> %d" % (start, end)
        for line in seek_open(file_name, start, end):
            yield line
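
read_seek depends on seek_open(file_name, start, end), which is not shown. A plausible sketch that yields only whole lines whose first byte falls in [start, end), leaving the line that straddles each boundary to the chunk that owns its start; the original may divide this work between file_offsets and seek_open differently:

def seek_open(file_name, start, end):
    # Yield complete lines whose first byte lies in [start, end).
    with open(file_name, 'rb') as f:
        if start > 0:
            f.seek(start - 1)
            if f.read(1) != b'\n':
                f.readline()     # previous chunk owns and finishes this line
        while f.tell() < end:
            line = f.readline()
            if not line:
                break
            yield line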