def test_threadsafe_resumable(self, tmpdir):
    log = defaultdict(list)

    def listener(name, row):
        log[name].append(list(row))

    def job(payload):
        i, row = payload
        s = int(row[2])
        time.sleep(s * .1)
        return i, row

    output_path = str(tmpdir.join('./enriched_resumable_threadsafe.csv'))
    resumer = ThreadSafeResumer(output_path, listener=listener)

    with open('./test/resources/people_unordered.csv') as f, resumer:
        enricher = casanova.threadsafe_enricher(
            f, resumer,
            add=('x2',),
            keep=('name',)
        )

        for j, (i, row) in enumerate(imap_unordered(enricher, job, 3)):
            enricher.writerow(i, row, [(i + 1) * 2])

            if j == 1:
                break

    def sort_output(o):
        return sorted(tuple(i) for i in o)

    assert sort_output(collect_csv(output_path)) == sort_output([
        ['name', 'index', 'x2'],
        ['Mary', '1', '4'],
        ['Julia', '2', '6']
    ])

    with open('./test/resources/people_unordered.csv') as f, resumer:
        enricher = casanova.threadsafe_enricher(
            f, resumer,
            add=('x2',),
            keep=('name',)
        )

        for j, (i, row) in enumerate(imap_unordered(enricher, job, 3)):
            enricher.writerow(i, row, [(i + 1) * 2])

    assert sort_output(collect_csv(output_path)) == sort_output([
        ['name', 'index', 'x2'],
        ['Mary', '1', '4'],
        ['Julia', '2', '6'],
        ['John', '0', '2']
    ])

    assert sort_output(log['output.row']) == sort_output([
        ['Mary', '1', '4'],
        ['Julia', '2', '6']
    ])

    assert sort_output(log['filter.row']) == sort_output([
        [1, ['Mary', 'Sue', '1']],
        [2, ['Julia', 'Stone', '2']]
    ])
def test_group_parallelism(self):
    # Unordered
    results = list(
        imap_unordered(DATA, sleeper, 2, parallelism=1, key=itemgetter(0)))
    assert set(results) == set(DATA)

    results = list(
        imap_unordered(DATA, sleeper, 2, parallelism=1, key=itemgetter(0),
                       buffer_size=3))
    assert set(results) == set(DATA)

    results = list(
        imap_unordered(DATA, sleeper, 2, parallelism=1, key=itemgetter(0),
                       buffer_size=1))
    assert set(results) == set(DATA)

    results = list(
        imap_unordered(DATA, sleeper, 4, parallelism=3, key=itemgetter(0),
                       buffer_size=3))
    assert set(results) == set(DATA)

    # Ordered
    results = list(imap(DATA, sleeper, 2, parallelism=1, key=itemgetter(0)))
    assert results == DATA

    results = list(
        imap(DATA, sleeper, 2, parallelism=1, key=itemgetter(0),
             buffer_size=3))
    assert results == DATA

    results = list(
        imap(DATA, sleeper, 4, parallelism=3, key=itemgetter(0),
             buffer_size=3))
    assert results == DATA
def test_throttle(self):
    group = lambda x: 'SAME'

    nbs = set(
        imap_unordered(range(10), identity, 10, key=group, throttle=0.01))
    assert nbs == set(range(10))

    nbs = set(
        imap_unordered(range(10), identity, 10, key=group, throttle=0.01,
                       buffer_size=1))
    assert nbs == set(range(10))

    nbs = set(
        imap_unordered(range(10), identity, 10, key=group, throttle=0.01,
                       buffer_size=3))
    assert nbs == set(range(10))

    nbs = list(imap(range(10), identity, 10, key=group, throttle=0.01))
    assert nbs == list(range(10))

    nbs = list(
        imap(range(10), identity, 10, key=group, throttle=0.01,
             buffer_size=1))
    assert nbs == list(range(10))

    nbs = list(
        imap(range(10), identity, 10, key=group, throttle=0.01,
             buffer_size=3))
    assert nbs == list(range(10))

    results = list(
        imap_unordered(DATA, sleeper, 4, key=itemgetter(0), throttle=0.01))
    assert set(results) == set(DATA)

    results = list(imap(DATA, sleeper, 4, key=itemgetter(0), throttle=0.01))
    assert results == DATA
def test_callable_throttle(self):
    def throttling(group, nb, result):
        assert nb == result

        if group == 'odd':
            return 0

        return 0.1

    group = lambda x: 'even' if x % 2 == 0 else 'odd'

    nbs = set(imap(range(10), identity, 10, key=group, throttle=throttling))
    assert nbs == set(range(10))

    def hellraiser(g, i, result):
        if i > 2:
            raise TypeError

        return 0.01

    with pytest.raises(TypeError):
        list(
            imap_unordered(range(5), identity, 4, key=group,
                           throttle=hellraiser))

    def wrong_type(g, i, result):
        return 'test'

    with pytest.raises(TypeError):
        list(
            imap_unordered(range(5), identity, 2, key=group,
                           throttle=wrong_type))

    def negative(g, i, result):
        return -30

    with pytest.raises(TypeError):
        list(
            imap_unordered(range(5), identity, 2, key=group,
                           throttle=negative))
def test_threadsafe(self, tmpdir):
    def job(payload):
        i, row = payload
        s = int(row[2])
        time.sleep(s * .01)
        return i, row

    output_path = str(tmpdir.join('./enriched_resumable_threadsafe.csv'))

    with open('./test/resources/people_unordered.csv') as f, \
         open(output_path, 'w', newline='') as of:

        enricher = casanova.threadsafe_enricher(
            f, of,
            add=('x2',),
            keep=('name',)
        )

        for i, row in imap_unordered(enricher, job, 3):
            enricher.writerow(i, row, [(i + 1) * 2])

    def sort_output(o):
        return sorted(tuple(i) for i in o)

    assert sort_output(collect_csv(output_path)) == sort_output([
        ['name', 'index', 'x2'],
        ['Mary', '1', '4'],
        ['Julia', '2', '6'],
        ['John', '0', '2']
    ])
def __iter__(self):
    self.start()

    queue_iterator = QueueIterator(self.queue)
    task_context = TaskContext(self.queue, queue_iterator)

    multithreaded_iterator = imap_unordered(
        queue_iterator,
        self.work,
        self.threads,
        group=CrawlJob.grouper,
        group_parallelism=DEFAULT_GROUP_PARALLELISM,
        group_buffer_size=self.buffer_size,
        group_throttle=self.throttle
    )

    def generator():
        for result in multithreaded_iterator:
            with task_context:
                yield result

        self.cleanup()

    return generator()
def test_key_raise(self):
    def group(i):
        if i > 2:
            raise RuntimeError

        return 'SAME'

    with pytest.raises(RuntimeError):
        list(imap_unordered(range(5), identity, 2, key=group))
def test_break(self):
    for i in imap(enumerate(DATA), enumerated_sleeper, 5):
        if i == 2:
            break

    results = list(imap_unordered(DATA, sleeper, 2))

    assert len(results) == len(DATA)
    assert set(results) == set(DATA)
def __iter__(self):
    self.start()

    multithreaded_iterator = imap_unordered(
        self.queue,
        self.work,
        self.threads,
        key=CrawlJob.grouper,
        parallelism=DEFAULT_DOMAIN_PARALLELISM,
        buffer_size=self.buffer_size,
        throttle=self.throttle,
        join=self.join,
        daemonic=self.daemonic
    )

    def generator():
        for result in multithreaded_iterator:
            yield result
            self.queue.task_done()

        self.cleanup()

    return generator()
def multithreaded_resolve(iterator, key=None, resolve_args=None, threads=25,
                          throttle=DEFAULT_THROTTLE, max_redirects=5,
                          follow_refresh_header=True,
                          follow_meta_refresh=False,
                          follow_js_relocation=False,
                          buffer_size=DEFAULT_GROUP_BUFFER_SIZE,
                          insecure=False, timeout=None):
    """
    Function returning a multithreaded iterator over resolved urls.

    Args:
        iterator (iterable): An iterator over urls or arbitrary items.
        key (callable, optional): Function extracting url from yielded items.
        resolve_args (callable, optional): Function returning specific
            arguments to pass to the resolve util per yielded item.
        threads (int, optional): Number of threads to use. Defaults to 25.
        throttle (float or callable, optional): Per-domain throttle in
            seconds. Or a function taking domain name and item and returning
            the throttle to apply. Defaults to 0.2.
        max_redirects (int, optional): Max number of redirections to follow.
        follow_refresh_header (bool, optional): Whether to follow refresh
            headers. Defaults to True.
        follow_meta_refresh (bool, optional): Whether to follow meta refresh.
            Defaults to False.
        follow_js_relocation (bool, optional): Whether to follow JavaScript
            relocation. Defaults to False.
        buffer_size (int, optional): Max number of items per domain to
            enqueue into memory in hope of finding a new domain that can be
            processed immediately. Defaults to 1.
        insecure (bool, optional): Whether to ignore SSL certification errors
            when performing requests. Defaults to False.
        timeout (float or urllib3.Timeout, optional): Custom timeout for
            every request.

    Yields:
        ResolveWorkerResult
    """

    # Creating the http pool manager
    http = create_pool(threads=threads, insecure=insecure, timeout=timeout)

    # Thread worker
    def worker(payload):
        http, item, url = payload

        if url is None:
            return ResolveWorkerResult(
                url=None,
                item=item,
                error=None,
                stack=None
            )

        kwargs = resolve_args(url, item) if resolve_args is not None else {}

        error, stack = resolve(
            http,
            url,
            max_redirects=max_redirects,
            follow_refresh_header=follow_refresh_header,
            follow_meta_refresh=follow_meta_refresh,
            follow_js_relocation=follow_js_relocation,
            **kwargs
        )

        return ResolveWorkerResult(
            url=url,
            item=item,
            error=error,
            stack=stack
        )

    # Group resolver
    def grouper(payload):
        if payload.url is None:
            return

        return get_domain_name(payload.url)

    # Thread payload iterator
    def payloads():
        for item in iterator:
            url = item if key is None else key(item)

            if not url:
                yield FetchWorkerPayload(http=http, item=item, url=None)
                continue

            # Url cleanup
            url = ensure_protocol(url.strip())

            yield FetchWorkerPayload(http=http, item=item, url=url)

    return imap_unordered(
        payloads(),
        worker,
        threads,
        group=grouper,
        group_parallelism=DEFAULT_GROUP_PARALLELISM,
        group_buffer_size=buffer_size,
        group_throttle=throttle
    )
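A minimal usage sketch for the function above. The import path and the sample urls are assumptions not shown in this snippet; the result attributes (url, item, error, stack) come from the ResolveWorkerResult built by the worker:

# Hypothetical usage of multithreaded_resolve (assumed import path)
from minet import multithreaded_resolve

urls = ['https://www.lemonde.fr', 'https://www.example.com']

# Results are yielded in completion order, throttled per domain
for result in multithreaded_resolve(urls, threads=10):
    if result.error is not None:
        print('Could not resolve %s: %s' % (result.url, result.error))
    elif result.stack:
        # The stack records the redirection hops followed for this url
        print('%s resolved through %i hop(s)' % (result.url, len(result.stack)))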
    ('B', 0.1, 7),
    ('B', 0.4, 8),
    ('B', 0.1, 9),
]

BUFFER_SIZE_DATA = [
    ('B', 0.4, 0),
    ('B', 0.4, 1),
    ('B', 0.4, 2),
    ('C', 0.2, 3),
    ('D', 0.1, 4),
    ('E', 0.3, 5)
]


def sleeper(job):
    time.sleep(job[1])
    return job


print('2 threads')
for result in imap_unordered(DATA, sleeper, 2):
    print(result)
print()

print('10 threads / homogeneous (result should be ordered by sleep time)')
for result in imap_unordered(HOMEGENEOUS_DATA, sleeper, 10):
    print(result)
print()

print('10 threads / 1 parallelism / homogeneous (jobs processed sequentially)')
for result in imap_unordered(HOMEGENEOUS_DATA, sleeper, 10,
                             key=itemgetter(0), parallelism=1):
    print(result)
import time
import threading

from quenouille import imap_unordered

DATA = [
    'A', 'A', 'B', 'C', 'D', 'E', 'F', 'C',
    'E', 'G', 'H', 'A', 'J', 'L', 'A', 'A'
]


def sleeper(job):
    time.sleep(1)
    return job


def grouper(job):
    return job


for x in imap_unordered(DATA, sleeper, threads=50, group=grouper,
                        group_parallelism=1):
    print(x)
def test_empty(self):
    results = list(imap_unordered(iter([]), sleeper, 5))

    assert results == []
def test_one_item(self):
    results = list(imap_unordered(DATA[:1], sleeper, 2))

    assert results == [('A', 0.3, 0)]
def test_less_jobs_than_threads(self):
    results = list(imap_unordered(DATA[:2], sleeper, 2))

    assert set(results) == set([('A', 0.2, 1), ('A', 0.3, 0)])
import time

from quenouille import imap_unordered
from operator import itemgetter

TASKS = [('A', 1), ('A', 1), ('A', 1), ('B', 1), ('C', 2)]


def worker(payload):
    time.sleep(payload[1])
    return payload


iterator = imap_unordered(TASKS, worker, 2, group=itemgetter(0),
                          group_buffer_size=0, group_parallelism=1)

for g, t in iterator:
    print(g, t)
import time
import threading

from quenouille import imap_unordered


def sleeper(job):
    time.sleep(job)
    return job


for i in imap_unordered(iter([]), sleeper, 5):
    print('THIS SHOULD NOT PRINT!')

print('Finished with %i threads.' % threading.active_count())

time.sleep(3)

print('Afterwards %i threads.' % threading.active_count())
    ('B', 0.1, 21),
    ('B', 0.4, 22),
    ('B', 0.1, 23),
]


def sleeper(job):
    time.sleep(job[1] * 10)
    return job


print('Linear')
t = time.time()
for result in imap_unordered(LINEAR_DATA, sleeper, 3, throttle=1,
                             key=itemgetter(0)):
    n = time.time()
    print(result, n - t)
    t = n
print()

print('Unordered')
t = time.time()
for result in imap_unordered(HOMEGENEOUS_DATA, sleeper, 3, throttle=5,
                             key=itemgetter(0)):
    n = time.time()
    print(result, n - t)
    t = n
# =============================================================================
# Quenouille Stack Overflow Testing
# =============================================================================
#
# Reproducing issues related to recursion & stack overflow.
#
from quenouille import imap_unordered

DATA = range(3000)


def worker(i):
    return i


for i in imap_unordered(DATA, worker, 25, group=lambda x: 1,
                        group_parallelism=1, group_throttle=0.1):
    print(i)
def threaded_work(url):
    if not hasattr(local_data, 'context'):
        context = ThreadContext(ENDPOINT)
        local_data.context = context

    context = local_data.context

    return context.run_until_complete(work(url))


def dummy_work(url):
    return url


for title in imap_unordered(URLS, threaded_work, 3):
    print(title)


def cleanup():
    global local_data
    global CONTEXT_POOL

    print('cleanup')

    for context in CONTEXT_POOL:
        context.run_until_complete(context.browser.disconnect())
        context.loop.close()
        del context

    del local_data
    del CONTEXT_POOL
import time

from queue import Queue
from quenouille import imap_unordered, QueueIterator

queue = Queue()
queue.put(2)
queue.put(1)
queue.put(3)


def worker(payload):
    time.sleep(payload[1])
    return payload


iterator = QueueIterator(queue)

for i, t in imap_unordered(enumerate(iterator), worker, 1):
    with iterator:
        print('Done waiting %i' % t)

        if i < 1:
            queue.put(4)
def test_arguments(self):
    with pytest.raises(TypeError):
        imap_unordered(None, sleeper, 4)

    with pytest.raises(TypeError):
        imap_unordered(DATA, 'test', 4)

    with pytest.raises(TypeError):
        imap_unordered(DATA, sleeper, 'test')

    with pytest.raises(TypeError):
        imap_unordered(DATA, sleeper, 4, key='test')

    with pytest.raises(TypeError):
        imap_unordered(DATA, sleeper, 4, parallelism=-1, key=itemgetter(0))

    with pytest.raises(TypeError):
        imap_unordered(DATA, sleeper, 4, parallelism=1, key=itemgetter(0),
                       buffer_size='test')

    with pytest.raises(TypeError):
        imap_unordered(DATA, sleeper, 4, parallelism=1, buffer_size=0)

    # with pytest.raises(TypeError):
    #     imap_unordered(DATA, sleeper, 2, parallelism=4, key=itemgetter(0))

    with pytest.raises(TypeError):
        imap_unordered(DATA, sleeper, 2, key=itemgetter(0), throttle='test')

    with pytest.raises(TypeError):
        imap_unordered(DATA, sleeper, 2, key=itemgetter(0), throttle=-4)

    with pytest.raises(RuntimeError):
        with ThreadPoolExecutor(4) as executor:
            pass

        # Using the executor after it was shut down should raise
        executor.imap(DATA, sleeper)

    with pytest.raises(RuntimeError):
        with ThreadPoolExecutor(4) as executor:
            def work(item):
                executor.imap_unordered(DATA, sleeper)

            list(executor.imap(DATA, work))

    with pytest.raises(TypeError):
        imap(DATA, sleeper, 2, join='test')

    with pytest.raises(TypeError):
        imap(DATA, sleeper, 2, daemonic='test')
def multithreaded_fetch(iterator, key=None, request_args=None, threads=25,
                        throttle=DEFAULT_THROTTLE, guess_extension=True,
                        guess_encoding=True,
                        buffer_size=DEFAULT_GROUP_BUFFER_SIZE,
                        insecure=False, timeout=None,
                        domain_parallelism=DEFAULT_GROUP_PARALLELISM):
    """
    Function returning a multithreaded iterator over fetched urls.

    Args:
        iterator (iterable): An iterator over urls or arbitrary items.
        key (callable, optional): Function extracting url from yielded items.
        request_args (callable, optional): Function returning specific
            arguments to pass to the request util per yielded item.
        threads (int, optional): Number of threads to use. Defaults to 25.
        throttle (float or callable, optional): Per-domain throttle in
            seconds. Or a function taking domain name and item and returning
            the throttle to apply. Defaults to 0.2.
        guess_extension (bool, optional): Attempt to guess the resource's
            extension? Defaults to True.
        guess_encoding (bool, optional): Attempt to guess the resource's
            encoding? Defaults to True.
        domain_parallelism (int, optional): Max number of urls per domain to
            hit at the same time. Defaults to 1.
        buffer_size (int, optional): Max number of items per domain to
            enqueue into memory in hope of finding a new domain that can be
            processed immediately. Defaults to 1.
        insecure (bool, optional): Whether to ignore SSL certification errors
            when performing requests. Defaults to False.
        timeout (float or urllib3.Timeout, optional): Custom timeout for
            every request.

    Yields:
        FetchWorkerResult
    """

    # Creating the http pool manager
    http = create_pool(threads=threads, insecure=insecure, timeout=timeout)

    # Thread worker
    def worker(payload):
        http, item, url = payload

        if url is None:
            return FetchWorkerResult(
                url=None,
                item=item,
                response=None,
                error=None,
                meta=None
            )

        kwargs = request_args(url, item) if request_args is not None else {}

        error, response = request(http, url, **kwargs)

        if error:
            return FetchWorkerResult(
                url=url,
                item=item,
                response=response,
                error=error,
                meta=None
            )

        # Forcing urllib3 to read data in thread
        data = response.data

        # Meta
        meta = extract_response_meta(
            response,
            guess_encoding=guess_encoding,
            guess_extension=guess_extension
        )

        return FetchWorkerResult(
            url=url,
            item=item,
            response=response,
            error=error,
            meta=meta
        )

    # Group resolver
    def grouper(payload):
        if payload.url is None:
            return

        return get_domain_name(payload.url)

    # Thread payload iterator
    def payloads():
        for item in iterator:
            url = item if key is None else key(item)

            if not url:
                yield FetchWorkerPayload(http=http, item=item, url=None)
                continue

            # Url cleanup
            url = ensure_protocol(url.strip())

            yield FetchWorkerPayload(http=http, item=item, url=url)

    return imap_unordered(
        payloads(),
        worker,
        threads,
        group=grouper,
        group_parallelism=domain_parallelism,
        group_buffer_size=buffer_size,
        group_throttle=throttle
    )
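Similarly, a minimal usage sketch for multithreaded_fetch. The import path and sample urls are assumptions; the result attributes (url, item, response, error, meta) come from the FetchWorkerResult built by the worker, and `response` is a urllib3 HTTPResponse whose body was already read in the worker thread:

# Hypothetical usage of multithreaded_fetch (assumed import path)
from minet import multithreaded_fetch

urls = ['https://www.lemonde.fr', 'https://www.example.com']

# Results arrive in completion order, throttled per domain
for result in multithreaded_fetch(urls, threads=10):
    if result.error is not None:
        print('Could not fetch %s: %s' % (result.url, result.error))
    else:
        print('%s -> HTTP %i' % (result.url, result.response.status))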
def test_basics(self):
    results = list(imap_unordered(DATA, sleeper, 2))

    assert len(results) == len(DATA)
    assert set(results) == set(DATA)
def test_none_iterator(self):
    iterable = [None] * 3
    results = list(imap_unordered(iterable, identity, 2))

    assert results == iterable
# =============================================================================
# Quenouille Exception Testing
# =============================================================================
#
# Testing what happens when exceptions are thrown.
#
from quenouille import imap_unordered


def crasher(i):
    if i > 7:
        raise Exception('Die!')

    return i


for result in imap_unordered(range(15), crasher, 3):
    print(result)
import time

from operator import itemgetter
from quenouille import imap_unordered, NamedLocks

DATA = [('A', 1), ('B', 2), ('B', 3)]

locks = NamedLocks()


def worker(t):
    lock = locks[t[0]]

    print(t, lock)

    with lock:
        time.sleep(3)

    return t


print('start')

for i in imap_unordered(DATA, worker, 5):
    print(i)

print(locks)