Example #1
def main():
    sfile = settings.BIG_FILE
    fsize = os.path.getsize(sfile)
    with open(sfile, "r") as fh:
        chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_MP_CHUNKS)

    # Debug
    # for c in chunks:
    # print(c)

    q = Queue()
    pattern = re.compile(settings.TARGET_USERNAME)

    # consumer
    # con = multiprocessing.Process(target=opener, args=(cat(grep(pattern, writer())),))
    # con.daemon = True
    # con.start()

    # producer
    producers = []
    file_handles = []
    for chunk in chunks:
        fh = open(sfile, "r")
        file_handles.append(fh)
        o = opener(cat(chunk, grep(pattern, writer(q))))
        t = multiprocessing.Process(target=sender, args=(o,))
        t.daemon = True
        producers.append(t)

    for p in producers:
        p.start()

    for p in producers:
        p.join()

    # con.join()
    q.put(None)  # sentinel

    for f in file_handles:
        f.close()

    recsmatch = 0
    print("Before queue comp")
    while True:
        x = q.get()
        if x is None:
            break
        recsmatch += 1
    print("After queue comp")

    print("recsmatch={r} chunks={c}".format(r=recsmatch, c=settings.BIGFILE_MP_CHUNKS))
Example #3
def main():
    start = time.time()
    logger = dry.logger.setup_log_size_rotating("log/bigfile_futures_threadpool.log", 
                                                logname='bigfilefuturesthreads')
    
    logger.info("START")
    elapsed_time = []
    
    sfile = settings.BIG_FILE
    fsize = os.path.getsize(sfile)
    with open(sfile, "r") as fh:
        # A list of tuples (chunk_start, chunk_size)
        chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_FUTURES_CHUNKS)
    
    pattern = re.compile(settings.TARGET_USERNAME)
    file_handles = [open(sfile, "r") for _ in chunks]
    
    with futures.ThreadPoolExecutor(max_workers=settings.BIGFILE_FUTURES_CHUNKS) as executor:
        future_to_chunk = {
            executor.submit(find_noq, file_handles[i], chunks[i], pattern): ""
            for i in range(len(chunks))
        }
        
    recsmatch = 0    
    
    try:
        for future in futures.as_completed(future_to_chunk, timeout=60):
            recsmatch += future.result()
    except Exception as e:
        #traceback.print_exc(file=sys.stdout)
        logger.error("recsmatch={m} e={e}".format(m=recsmatch, e=e))
        return
            
    elapsed_time.append(time.time() - start)

    elapsed_time_str = ",".join(str(t) for t in elapsed_time)

    print("{r}".format(r=recsmatch))
    logger.info("STOP|elapsedtime:{et}|recsmatch:{r}".format(et=elapsed_time_str, r=recsmatch))
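size_chunks and find_noq come from the bigfile package and are not shown on this page. The sketch below is a rough guess at what they do, inferred from the call sites and from the "(chunk_start, chunk_size)" comment above; treat the bodies as assumptions rather than the real implementations.

def size_chunks(fh, fsize, num_chunks=4):
    # Split a file of fsize bytes into roughly num_chunks
    # (chunk_start, chunk_size) ranges, each ending on a line boundary.
    # With non-ASCII data it is safer to open the file in binary mode.
    chunks = []
    approx = fsize // num_chunks
    start = 0
    while start < fsize:
        fh.seek(min(start + approx, fsize))
        fh.readline()  # advance to the end of the current line
        end = min(fh.tell(), fsize)
        chunks.append((start, end - start))
        start = end
    return chunks

def find_noq(fh, chunk, pattern):
    # Count the lines inside one (chunk_start, chunk_size) range that match
    # the compiled pattern; this is the task submitted to the thread pool.
    start, size = chunk
    fh.seek(start)
    matches = 0
    for line in fh.read(size).splitlines():
        if pattern.search(line):
            matches += 1
    return matches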
Example #4
# Split the file into four chunks, process each chunk in a separate process,
# count the number of matching records, report via a queue, calculate total.
import os
import sys
import re
import time
import multiprocessing
from multiprocessing import JoinableQueue as Queue
import settings
from bigfile.bigfile import chunk_end, size_chunks, find, count_matches


sfile = settings.BIG_FILE
fsize = os.path.getsize(sfile)
with open(sfile, "r") as fh:
    chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_MP_CHUNKS)

# Debug
#for c in chunks:
    #print(c)
    
q = Queue()
pattern = re.compile(settings.TARGET_USERNAME)

# consumer
con = multiprocessing.Process(target=count_matches, args=(q,))
con.daemon = True
con.start()

# producer
producers = []
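Example #4 breaks off right after producers = []. The sketch below shows how the producer section presumably continues, patterned on Example #1 and on the gevent variant that follows. Using the imported find as the process target and a None sentinel for the consumer are assumptions, and, like the original, it relies on fork-style multiprocessing so the open file handles are inherited by the children.

file_handles = []
for chunk in chunks:
    fh = open(sfile, "r")
    file_handles.append(fh)
    p = multiprocessing.Process(target=find, args=(fh, chunk, pattern, q))
    p.daemon = True
    producers.append(p)

for p in producers:
    p.start()

for p in producers:
    p.join()

q.put(None)  # sentinel: tells count_matches the producers are done

for f in file_handles:
    f.close()

con.join()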
import os
import sys
import re

import gevent
import gevent.queue

import settings
from bigfile.bigfile import size_chunks, find, count_matches

# Start Execution
if len(sys.argv) < 1:
    print("usage: %prog")
    sys.exit(1)
    
sfile = settings.BIG_FILE

fsize = os.path.getsize(sfile)
with open(sfile, "r") as fh:
    chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_GEVENT_CHUNKS)

pattern = re.compile(settings.TARGET_USERNAME)

# maxsize = 0 makes the queue act like a channel.  The queue will block
# until a get call retrieves the data.  In effect, it works like a CSP.
q = gevent.queue.Queue(maxsize=0)

# consumer
con = gevent.spawn(count_matches, q)

# producer
fhandles = [open(sfile, "r") for i in xrange(0, settings.BIGFILE_GEVENT_CHUNKS)]
jobs = [gevent.spawn(find, fhandles[i], chunks[i], pattern, q) for i in xrange(0, settings.BIGFILE_GEVENT_CHUNKS)]
gevent.joinall(jobs, timeout=10)
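The gevent version, like the threading version in Example #6, delegates the real work to find and count_matches from bigfile.bigfile, which this page does not show. Below is a minimal sketch of what they plausibly look like, inferred from the call sites: find(fh, chunk, pattern, q) reports a per-chunk match count through the queue, and count_matches(q) sums and prints the total. The None sentinel convention is an assumption.

def find(fh, chunk, pattern, q):
    # Producer: count pattern matches in one (chunk_start, chunk_size)
    # slice of fh and put that count on the queue.
    start, size = chunk
    fh.seek(start)
    matches = 0
    for line in fh.read(size).splitlines():
        if pattern.search(line):
            matches += 1
    q.put(matches)

def count_matches(q):
    # Consumer: add up the per-chunk counts until a None sentinel arrives,
    # then print the total.
    recsmatch = 0
    while True:
        matches = q.get()
        if matches is None:
            break
        recsmatch += matches
    print(recsmatch)

If maxsize=0 really does make the queue behave like a channel, as the comment above says, each q.put(matches) hands its count straight to count_matches, so producers and consumer advance in lock step.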
Example #6
import os
import sys
import re
import threading

if sys.version_info[0] >= 3:
    from queue import Queue
else:
    from Queue import Queue

import settings
from bigfile.bigfile import size_chunks, chunk_end, find, count_matches

# Start Execution
if len(sys.argv) < 1:
    print("usage: bigfile_chunks")
    sys.exit(1)

sfile = settings.BIG_FILE
fsize = os.path.getsize(sfile)

with open(sfile, "r") as fh:
    chunks = size_chunks(fh, fsize, num_chunks=settings.BIGFILE_THREADS_CHUNKS)

q = Queue()
pattern = re.compile(settings.TARGET_USERNAME)

# consumer
# Use write_lines if you want a report of matches
#con = threading.Thread(target=write_lines, args=(q, fh_out))
con = threading.Thread(target=count_matches, args=(q, ))
con.daemon = True
con.start()

# producer
producers = []
file_handles = []
for chunk in chunks:
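Example #6 is also cut off at the producer loop. A minimal sketch of the likely remainder, following the same shape as Example #1 and the gevent variant; the threading.Thread target and the None sentinel are assumptions.

for chunk in chunks:
    fh = open(sfile, "r")
    file_handles.append(fh)
    t = threading.Thread(target=find, args=(fh, chunk, pattern, q))
    t.daemon = True
    producers.append(t)

for t in producers:
    t.start()

for t in producers:
    t.join()

q.put(None)  # sentinel so count_matches can finish and print the total

for f in file_handles:
    f.close()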