def main():
    sfile = settings.BIG_FILE
    fsize = os.path.getsize(sfile)
    with open(sfile, "r") as fh:
        chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_MP_CHUNKS)

    # Debug
    # for c in chunks:
    #     print(c)

    q = Queue()
    pattern = re.compile(settings.TARGET_USERNAME)

    # consumer
    # con = multiprocessing.Process(target=opener, args=(cat(grep(pattern, writer())),))
    # con.daemon = True
    # con.start()

    # producer: one process per chunk, each driving a coroutine pipeline that
    # pushes matching records onto the shared queue
    producers = []
    file_handles = []
    for chunk in chunks:
        fh = open(sfile, "r")
        file_handles.append(fh)
        o = opener(cat(chunk, grep(pattern, writer(q))))
        t = multiprocessing.Process(target=sender, args=(o,))
        t.daemon = True
        producers.append(t)

    for p in producers:
        p.start()
    for p in producers:
        p.join()
    # con.join()

    q.put(None)  # sentinel

    for f in file_handles:
        f.close()

    # Drain the queue in the parent process; each item is one matching record
    recsmatch = 0
    print("Before queue comp")
    while True:
        x = q.get()
        if x is None:
            break
        recsmatch += 1
    print("After queue comp")
    print("recsmatch={r} chunks={c}".format(r=recsmatch, c=settings.BIGFILE_MP_CHUNKS))
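# The producer loop above wires together opener/cat/grep/writer/sender helpers
# imported from bigfile.bigfile but not shown here. Below is a minimal sketch of
# the two innermost stages, assuming the classic generator-coroutine pipeline in
# which each stage is primed and then fed lines via send(). The names, signatures,
# and behavior are assumptions for illustration, not the actual library code.

def coroutine(func):
    """Prime a generator-based coroutine so it is ready to accept send()."""
    def wrapper(*args, **kwargs):
        gen = func(*args, **kwargs)
        next(gen)
        return gen
    return wrapper

@coroutine
def grep(pattern, target):
    # Hypothetical filter stage: forward only lines matching the compiled regex.
    while True:
        line = (yield)
        if pattern.search(line):
            target.send(line)

@coroutine
def writer(q):
    # Hypothetical sink stage: put each matching line on the shared queue so the
    # parent process can count it.
    while True:
        line = (yield)
        q.put(line)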
def main():
    start = time.time()
    logger = dry.logger.setup_log_size_rotating(
        "log/bigfile_futures_threadpool.log", logname='bigfilefuturesthreads')
    logger.info("START")
    elapsed_time = []

    sfile = settings.BIG_FILE
    fsize = os.path.getsize(sfile)
    with open(sfile, "r") as fh:
        # A list of tuples (chunk_start, chunk_size)
        chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_FUTURES_CHUNKS)

    pattern = re.compile(settings.TARGET_USERNAME)

    # One file handle per chunk so each worker can seek independently
    file_handles = []
    for j in range(len(chunks)):
        file_handles.append(open(sfile, "r"))

    with futures.ThreadPoolExecutor(max_workers=settings.BIGFILE_FUTURES_CHUNKS) as executor:
        future_to_chunk = dict(
            (executor.submit(find_noq, file_handles[i], chunks[i], pattern), "")
            for i in range(len(chunks))
        )

        recsmatch = 0
        try:
            for future in futures.as_completed(future_to_chunk, timeout=60):
                recsmatch += future.result()
        except Exception as e:
            # traceback.print_exc(file=sys.stdout)
            logger.error("recsmatch={m} e={e}".format(m=recsmatch, e=e))
            return

    elapsed_time.append(time.time() - start)
    elapsed_time_str = ""
    for t in elapsed_time:
        elapsed_time_str += str(t) + ","
    elapsed_time_str = elapsed_time_str.strip(",")

    print("{r}".format(r=recsmatch))
    logger.info("STOP|elapsedtime:{et}|recsmatch:{r}".format(et=elapsed_time, r=recsmatch))
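# size_chunks is imported from bigfile.bigfile and is documented above only as
# returning "a list of tuples (chunk_start, chunk_size)". A minimal sketch of how
# it might be implemented, assuming byte-offset seek()/tell() semantics (Python 2
# text mode, or a file opened in binary mode) and that each boundary is nudged
# forward to the next newline so no record is split across chunks. This is an
# illustration, not the library's actual code.

def size_chunks(fh, fsize, num_chunks=4):
    chunks = []
    chunk_size = fsize // num_chunks
    start = 0
    while start < fsize:
        # Jump roughly one chunk ahead, then finish the current line so the
        # boundary falls on a newline.
        fh.seek(start + chunk_size)
        fh.readline()
        end = min(fh.tell(), fsize)
        chunks.append((start, end - start))
        start = end
    return chunks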
# Split the file into four chunks, process each chunk in a separate process,
# count the number of matching records, report via a queue, calculate total.
import os
import sys
import re
import time
import multiprocessing
from multiprocessing import JoinableQueue as Queue

import settings
from bigfile.bigfile import chunk_end, size_chunks, find, count_matches

sfile = settings.BIG_FILE
fsize = os.path.getsize(sfile)
with open(sfile, "r") as fh:
    chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_MP_CHUNKS)

# Debug
# for c in chunks:
#     print(c)

q = Queue()
pattern = re.compile(settings.TARGET_USERNAME)

# consumer
con = multiprocessing.Process(target=count_matches, args=(q,))
con.daemon = True
con.start()

# producer
producers = []
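# find is imported above but defined in bigfile.bigfile. A rough sketch of what
# such a chunk worker might look like, assuming it seeks to the chunk start,
# scans chunk_size bytes line by line, and reports the per-chunk match count on
# the queue (consistent with the "recsmatch += matches" consumer tail visible in
# the gevent variant below). This is an assumption, not the actual implementation.

def find(fh, chunk, pattern, q):
    start, size = chunk
    fh.seek(start)
    matches = 0
    # Read exactly this chunk's slice of the file and count matching lines.
    for line in fh.read(size).splitlines():
        if pattern.search(line):
            matches += 1
    q.put(matches)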
            break
        recsmatch += matches
    print(recsmatch)


# Start Execution
if len(sys.argv) < 1:
    print("usage: %prog")
    sys.exit(1)

sfile = settings.BIG_FILE
fsize = os.path.getsize(sfile)
with open(sfile, "r") as fh:
    chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_GEVENT_CHUNKS)

pattern = re.compile(settings.TARGET_USERNAME)

# maxsize = 0 makes the queue act like a channel. The queue will block
# until a get call retrieves the data. In effect, it works like a CSP.
q = gevent.queue.Queue(maxsize=0)

# consumer
con = gevent.spawn(count_matches, q)

# producer
fhandles = [open(sfile, "r") for i in xrange(0, settings.BIGFILE_GEVENT_CHUNKS)]
jobs = [gevent.spawn(find, fhandles[i], chunks[i], pattern, q)
        for i in xrange(0, settings.BIGFILE_GEVENT_CHUNKS)]
gevent.joinall(jobs, timeout=10)
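# The fragment at the top of this file is the tail of its consumer. A sketch of
# what the full count_matches consumer likely looks like, assuming each producer
# puts a per-chunk match count on the queue and a None sentinel (the convention
# used in the multiprocessing variant) signals completion. Again, an illustration
# rather than the library's actual code.

def count_matches(q):
    recsmatch = 0
    while True:
        matches = q.get()
        if matches is None:
            break
        recsmatch += matches
    print(recsmatch)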
else:
    from Queue import Queue

import settings
from bigfile.bigfile import size_chunks, chunk_end, find, count_matches

# Start Execution
if len(sys.argv) < 1:
    print("usage: bigfile_chunks")
    sys.exit(1)

sfile = settings.BIG_FILE
fsize = os.path.getsize(sfile)
with open(sfile, "r") as fh:
    chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_THREADS_CHUNKS)

q = Queue()
pattern = re.compile(settings.TARGET_USERNAME)

# consumer
# Use write_lines if you want a report of matches
# con = threading.Thread(target=write_lines, args=(q, fh_out))
con = threading.Thread(target=count_matches, args=(q,))
con.daemon = True
con.start()

# producer
producers = []
file_handles = []
for chunk in chunks: