Beispiel #1
0
    def test_threadsafe_resumable(self, tmpdir):
        """A threadsafe enrichment interrupted mid-run can be resumed and only
        processes the rows that were not already written to the output file."""
        # Collects every event emitted by the resumer, keyed by event name.
        log = defaultdict(list)

        def listener(name, row):
            log[name].append(list(row))

        def job(payload):
            # payload is (index, row); sleep proportionally to the row's
            # third column so results come back out of input order.
            i, row = payload
            s = int(row[2])
            time.sleep(s * .1)

            return i, row

        output_path = str(tmpdir.join('./enriched_resumable_threadsafe.csv'))

        resumer = ThreadSafeResumer(output_path, listener=listener)

        # First pass: break early to simulate an interrupted run.
        with open('./test/resources/people_unordered.csv') as f, resumer:

            enricher = casanova.threadsafe_enricher(
                f, resumer,
                add=('x2',),
                keep=('name',)
            )

            for j, (i, row) in enumerate(imap_unordered(enricher, job, 3)):
                enricher.writerow(i, row, [(i + 1) * 2])

                if j == 1:
                    break

        def sort_output(o):
            # Output order is nondeterministic; compare as sorted tuples.
            return sorted(tuple(i) for i in o)

        # Only the two rows processed before the break should be on disk.
        assert sort_output(collect_csv(output_path)) == sort_output([
            ['name', 'index', 'x2'],
            ['Mary', '1', '4'],
            ['Julia', '2', '6']
        ])

        # Second pass: resuming should only process the remaining row.
        with open('./test/resources/people_unordered.csv') as f, resumer:

            enricher = casanova.threadsafe_enricher(
                f, resumer,
                add=('x2',),
                keep=('name',)
            )

            for j, (i, row) in enumerate(imap_unordered(enricher, job, 3)):
                enricher.writerow(i, row, [(i + 1) * 2])

        # After resuming, all three rows are present exactly once.
        assert sort_output(collect_csv(output_path)) == sort_output([
            ['name', 'index', 'x2'],
            ['Mary', '1', '4'],
            ['Julia', '2', '6'],
            ['John', '0', '2']
        ])

        # The resumer reported exactly the two already-written rows as output
        # and filtered them (with their original indices) on the second pass.
        assert sort_output(log['output.row']) == sort_output([['Mary', '1', '4'], ['Julia', '2', '6']])
        assert sort_output(log['filter.row']) == sort_output([[1, ['Mary', 'Sue', '1']], [2, ['Julia', 'Stone', '2']]])
Beispiel #2
0
    def test_group_parallelism(self):
        """imap/imap_unordered honor group parallelism and buffer sizes."""

        key = itemgetter(0)

        # Unordered variants: output must be a permutation of DATA.
        # Each case is (threads, parallelism, buffer_size-or-None).
        for threads, parallelism, buffer_size in [
            (2, 1, None),
            (2, 1, 3),
            (2, 1, 1),
            (4, 3, 3),
        ]:
            kwargs = {'parallelism': parallelism, 'key': key}
            if buffer_size is not None:
                kwargs['buffer_size'] = buffer_size

            results = list(imap_unordered(DATA, sleeper, threads, **kwargs))
            assert set(results) == set(DATA)

        # Ordered variants: output must preserve input order exactly.
        for threads, parallelism, buffer_size in [
            (2, 1, None),
            (2, 1, 3),
            (4, 3, 3),
        ]:
            kwargs = {'parallelism': parallelism, 'key': key}
            if buffer_size is not None:
                kwargs['buffer_size'] = buffer_size

            results = list(imap(DATA, sleeper, threads, **kwargs))
            assert results == DATA
Beispiel #3
0
    def test_throttle(self):
        """Throttled groups still yield every result, ordered or unordered."""

        def single_group(x):
            return 'SAME'

        expected = set(range(10))

        # One shared group, with and without explicit buffer sizes.
        for buffer_size in (None, 1, 3):
            kwargs = {'key': single_group, 'throttle': 0.01}
            if buffer_size is not None:
                kwargs['buffer_size'] = buffer_size

            assert set(imap_unordered(range(10), identity, 10, **kwargs)) == expected
            assert list(imap(range(10), identity, 10, **kwargs)) == list(range(10))

        # Throttling also composes with a real per-group key over DATA.
        results = list(
            imap_unordered(DATA, sleeper, 4, key=itemgetter(0), throttle=0.01))
        assert set(results) == set(DATA)

        results = list(imap(DATA, sleeper, 4, key=itemgetter(0),
                            throttle=0.01))
        assert results == DATA
Beispiel #4
0
    def test_callable_throttle(self):
        """throttle may be a callable; invalid return values must raise."""

        def throttling(group, nb, result):
            # identity worker: the result must echo the input number.
            assert nb == result
            return 0 if group == 'odd' else 0.1

        def parity(x):
            return 'even' if x % 2 == 0 else 'odd'

        nbs = set(imap(range(10), identity, 10, key=parity,
                       throttle=throttling))

        assert nbs == set(range(10))

        # An exception raised by the throttle callable must propagate.
        def hellraiser(g, i, result):
            if i > 2:
                raise TypeError
            return 0.01

        with pytest.raises(TypeError):
            list(
                imap_unordered(range(5),
                               identity,
                               4,
                               key=parity,
                               throttle=hellraiser))

        # A throttle callable returning a non-number is rejected.
        with pytest.raises(TypeError):
            list(
                imap_unordered(range(5),
                               identity,
                               2,
                               key=parity,
                               throttle=lambda g, i, result: 'test'))

        # A negative throttle value is rejected as well.
        with pytest.raises(TypeError):
            list(
                imap_unordered(range(5),
                               identity,
                               2,
                               key=parity,
                               throttle=lambda g, i, result: -30))
Beispiel #5
0
    def test_threadsafe(self, tmpdir):
        """The threadsafe enricher writes every row despite unordered results."""

        def job(payload):
            # payload is (index, row); sleep per the row's third column so
            # results return out of input order.
            index, row = payload
            delay = int(row[2])
            time.sleep(delay * .01)
            return index, row

        output_path = str(tmpdir.join('./enriched_resumable_threadsafe.csv'))

        with open('./test/resources/people_unordered.csv') as f, \
             open(output_path, 'w', newline='') as of:

            enricher = casanova.threadsafe_enricher(
                f, of,
                add=('x2',),
                keep=('name',)
            )

            for index, row in imap_unordered(enricher, job, 3):
                enricher.writerow(index, row, [(index + 1) * 2])

        def sort_output(o):
            # Output order is nondeterministic; compare as sorted tuples.
            return sorted(tuple(i) for i in o)

        assert sort_output(collect_csv(output_path)) == sort_output([
            ['name', 'index', 'x2'],
            ['Mary', '1', '4'],
            ['Julia', '2', '6'],
            ['John', '0', '2']
        ])
Beispiel #6
0
    def __iter__(self):
        """Start the crawler and return a generator over crawl results."""

        self.start()

        # Iterate the queue through a QueueIterator so that jobs enqueued
        # while crawling are still consumed.
        queue_iterator = QueueIterator(self.queue)
        task_context = TaskContext(self.queue, queue_iterator)

        # Per-group scheduling: jobs are grouped by CrawlJob.grouper and
        # throttled/buffered per group.
        multithreaded_iterator = imap_unordered(
            queue_iterator,
            self.work,
            self.threads,
            group=CrawlJob.grouper,
            group_parallelism=DEFAULT_GROUP_PARALLELISM,
            group_buffer_size=self.buffer_size,
            group_throttle=self.throttle
        )

        def generator():
            # Each result is yielded inside the task context — presumably so
            # the queue's completion bookkeeping happens only after the
            # consumer handled the result (confirm against TaskContext).
            for result in multithreaded_iterator:
                with task_context:
                    yield result

            # Runs only once the iterator is fully exhausted.
            self.cleanup()

        return generator()
Beispiel #7
0
    def test_key_raise(self):
        """An exception raised by the key function must propagate."""

        def exploding_key(i):
            if i > 2:
                raise RuntimeError
            return 'SAME'

        with pytest.raises(RuntimeError):
            list(imap_unordered(range(5), identity, 2, key=exploding_key))
Beispiel #8
0
    def test_break(self):
        """Breaking out of an imap loop must not poison later calls."""

        for value in imap(enumerate(DATA), enumerated_sleeper, 5):
            if value == 2:
                break

        # A fresh run afterwards still yields the complete data set.
        results = list(imap_unordered(DATA, sleeper, 2))

        assert len(results) == len(DATA)
        assert set(results) == set(DATA)
Beispiel #9
0
    def __iter__(self):
        """Start the crawler and return a generator over crawl results."""

        self.start()

        # Jobs are keyed by CrawlJob.grouper with per-key (per-domain)
        # parallelism, buffering and throttling.
        multithreaded_iterator = imap_unordered(
            self.queue,
            self.work,
            self.threads,
            key=CrawlJob.grouper,
            parallelism=DEFAULT_DOMAIN_PARALLELISM,
            buffer_size=self.buffer_size,
            throttle=self.throttle,
            join=self.join,
            daemonic=self.daemonic
        )

        def generator():
            for result in multithreaded_iterator:
                yield result
                # task_done is called only after the yield, i.e. after the
                # consumer has received the result.
                self.queue.task_done()

            # Runs only once the iterator is fully exhausted.
            self.cleanup()

        return generator()
Beispiel #10
0
def multithreaded_resolve(iterator,
                          key=None,
                          resolve_args=None,
                          threads=25,
                          throttle=DEFAULT_THROTTLE,
                          max_redirects=5,
                          follow_refresh_header=True,
                          follow_meta_refresh=False,
                          follow_js_relocation=False,
                          buffer_size=DEFAULT_GROUP_BUFFER_SIZE,
                          insecure=False,
                          timeout=None):
    """
    Function returning a multithreaded iterator over resolved urls.

    Args:
        iterator (iterable): An iterator over urls or arbitrary items.
        key (callable, optional): Function extracting url from yielded items.
        resolve_args (callable, optional): Function returning specific
            arguments to pass to the resolve util per yielded item.
        threads (int, optional): Number of threads to use. Defaults to 25.
        throttle (float or callable, optional): Per-domain throttle in seconds.
            Or a function taking domain name and item and returning the
            throttle to apply. Defaults to 0.2.
        max_redirects (int, optional): Max number of redirections to follow.
        follow_refresh_header (bool, optional): Whether to follow refresh
            headers. Defaults to True.
        follow_meta_refresh (bool, optional): Whether to follow meta refresh.
            Defaults to False.
        follow_js_relocation (bool, optional): Whether to follow JavaScript
            relocation. Defaults to False.
        buffer_size (int, optional): Max number of items per domain to enqueue
            into memory in hope of finding a new domain that can be processed
            immediately. Defaults to 1.
        insecure (bool, optional): Whether to ignore SSL certification errors
            when performing requests. Defaults to False.
        timeout (float or urllib3.Timeout, optional): Custom timeout for every
            request.

    Yields:
        ResolveWorkerResult

    """

    # Creating the http pool manager
    http = create_pool(threads=threads, insecure=insecure, timeout=timeout)

    # Thread worker
    def worker(payload):
        http, item, url = payload

        # Items without a url short-circuit to an empty result so every
        # input item still yields exactly one output.
        if url is None:
            return ResolveWorkerResult(url=None,
                                       item=item,
                                       error=None,
                                       stack=None)

        kwargs = resolve_args(url, item) if resolve_args is not None else {}

        error, stack = resolve(http,
                               url,
                               max_redirects=max_redirects,
                               follow_refresh_header=follow_refresh_header,
                               follow_meta_refresh=follow_meta_refresh,
                               follow_js_relocation=follow_js_relocation,
                               **kwargs)

        return ResolveWorkerResult(url=url,
                                   item=item,
                                   error=error,
                                   stack=stack)

    # Group resolver (url-less payloads fall into the None group)
    def grouper(payload):
        if payload.url is None:
            return

        return get_domain_name(payload.url)

    # Thread payload iterator
    def payloads():
        for item in iterator:
            url = item if key is None else key(item)

            # Falsy urls are still forwarded, with url=None.
            if not url:
                yield FetchWorkerPayload(http=http, item=item, url=None)

                continue

            # Url cleanup
            url = ensure_protocol(url.strip())

            yield FetchWorkerPayload(http=http, item=item, url=url)

    return imap_unordered(payloads(),
                          worker,
                          threads,
                          group=grouper,
                          group_parallelism=DEFAULT_GROUP_PARALLELISM,
                          group_buffer_size=buffer_size,
                          group_throttle=throttle)
Beispiel #11
0
    ('B', 0.1, 7),
    ('B', 0.4, 8),
    ('B', 0.1, 9),
]

# (group, sleep duration, id) triples; the run of 'B' jobs up front means a
# small buffer must look past a busy group to find runnable work.
BUFFER_SIZE_DATA = [('B', 0.4, 0), ('B', 0.4, 1), ('B', 0.4, 2), ('C', 0.2, 3),
                    ('D', 0.1, 4), ('E', 0.3, 5)]


def sleeper(job):
    """Sleep for the job's duration (second field), then echo the job back."""
    duration = job[1]
    time.sleep(duration)
    return job


# Demo: plain unordered mapping over two threads.
print('2 threads')
for result in imap_unordered(DATA, sleeper, 2):
    print(result)
print()

# With enough threads for every job, results arrive as jobs finish.
print('10 threads / homogeneous (result should be ordered by sleep time)')
for result in imap_unordered(HOMEGENEOUS_DATA, sleeper, 10):
    print(result)
print()

# parallelism=1 with a group key forces same-group jobs to run one at a time.
print('10 threads / 1 parallelism / homogeneous (jobs processed sequentially)')
for result in imap_unordered(HOMEGENEOUS_DATA,
                             sleeper,
                             10,
                             key=itemgetter(0),
                             parallelism=1):
    print(result)
Beispiel #12
0
import time
import threading
from quenouille import imap_unordered

# Single-letter jobs; the duplicates ('A', 'C', 'E') exercise per-group
# scheduling when each value forms its own group.
DATA = [
    'A', 'A', 'B', 'C', 'D', 'E', 'F', 'C', 'E', 'G', 'H', 'A', 'J', 'L', 'A',
    'A'
]


def sleeper(job):
    """Pause for a fixed second and hand *job* back unchanged."""
    delay = 1
    time.sleep(delay)
    return job


def grouper(job):
    """Group key: each distinct job value forms its own group."""
    return job


# With group_parallelism=1 and each value as its own group, identical
# letters never run concurrently even though 50 threads are available.
for x in imap_unordered(DATA,
                        sleeper,
                        threads=50,
                        group=grouper,
                        group_parallelism=1):
    print(x)
Beispiel #13
0
    def test_empty(self):
        """An empty iterator yields no results and terminates cleanly."""
        assert list(imap_unordered(iter([]), sleeper, 5)) == []
Beispiel #14
0
    def test_one_item(self):
        """A single job comes back as the sole result."""
        assert list(imap_unordered(DATA[:1], sleeper, 2)) == [('A', 0.3, 0)]
Beispiel #15
0
    def test_less_jobs_than_threads(self):
        """Fewer jobs than threads still yields every job exactly once."""
        results = list(imap_unordered(DATA[:2], sleeper, 2))

        assert set(results) == {('A', 0.2, 1), ('A', 0.3, 0)}
Beispiel #16
0
import time
from quenouille import imap_unordered
from operator import itemgetter

# (group, sleep duration) pairs; the three 'A' jobs contend for one slot.
TASKS = [('A', 1), ('A', 1), ('A', 1), ('B', 1), ('C', 2)]


def worker(payload):
    """Sleep for the payload's duration (second field) and echo it back."""
    duration = payload[1]
    time.sleep(duration)
    return payload


# group_buffer_size=0 disables lookahead buffering: the scheduler cannot
# enqueue items past a busy group to find runnable work.
iterator = imap_unordered(TASKS,
                          worker,
                          2,
                          group=itemgetter(0),
                          group_buffer_size=0,
                          group_parallelism=1)

for g, t in iterator:
    print(g, t)
Beispiel #17
0
import time
import threading
from quenouille import imap_unordered


def sleeper(job):
    """Sleep for *job* seconds and hand the value back."""
    seconds = job
    time.sleep(seconds)
    return seconds


# No jobs: the loop body must never run, and worker threads should wind
# down afterwards (the two counts below let us eyeball that).
for i in imap_unordered(iter([]), sleeper, 5):
    print('THIS SHOULD NOT PRINT!')

print('Finished with %i threads.' % threading.active_count())
time.sleep(3)
print('Afterwards %i threads.' % threading.active_count())
Beispiel #18
0
    ('B', 0.1, 21),
    ('B', 0.4, 22),
    ('B', 0.1, 23),
]


def sleeper(job):
    """Sleep ten times the job's duration (second field), then echo the job."""
    duration = job[1] * 10
    time.sleep(duration)
    return job


# Throttled run over same-key jobs: print each result together with the
# delay since the previous one to observe the throttle spacing.
print('Linear')
t = time.time()
for result in imap_unordered(LINEAR_DATA,
                             sleeper,
                             3,
                             throttle=1,
                             key=itemgetter(0)):
    n = time.time()
    print(result, n - t)
    t = n
print()

print('Unordered')
t = time.time()
for result in imap_unordered(HOMEGENEOUS_DATA,
                             sleeper,
                             3,
                             throttle=5,
                             key=itemgetter(0)):
    n = time.time()
Beispiel #19
0
# =============================================================================
# Quenouille Stack Overflow Testing
# =============================================================================
#
# Reproducing issues related to recursion & stack overflow.
#
from quenouille import imap_unordered

# Large enough input to trigger the recursion/stack-overflow issue this
# script reproduces (see header comment).
DATA = range(3000)


def worker(i):
    """Identity worker: return the input untouched."""
    return i


# Everything falls into one throttled, parallelism-1 group, so all 3000
# items are scheduled strictly one after the other.
for i in imap_unordered(DATA,
                        worker,
                        25,
                        group=lambda x: 1,
                        group_parallelism=1,
                        group_throttle=0.1):
    print(i)
Beispiel #20
0
def threaded_work(url):
    """Run the `work` coroutine for *url* on this thread's own context.

    Each worker thread lazily creates one ThreadContext (stored in the
    thread-local `local_data`) on first use and reuses it afterwards.
    """
    if not hasattr(local_data, 'context'):
        context = ThreadContext(ENDPOINT)
        local_data.context = context

    context = local_data.context

    return context.run_until_complete(work(url))


def dummy_work(url):
    """No-op stand-in worker: return the url unchanged."""
    return url


# Map threaded_work over URLS with 3 threads, printing each result
# (presumably a page title — see `work`'s implementation to confirm).
for title in imap_unordered(URLS, threaded_work, 3):
    print(title)


def cleanup():
    """Tear down every pooled browser context and drop the globals."""
    global local_data
    global CONTEXT_POOL

    print('cleanup')
    for context in CONTEXT_POOL:
        # Disconnect the browser before closing its event loop.
        context.run_until_complete(context.browser.disconnect())
        context.loop.close()
        del context

    del local_data
    del CONTEXT_POOL
Beispiel #21
0
import time
from queue import Queue
from quenouille import imap_unordered, QueueIterator

queue = Queue()

# Seed the queue with three sleep durations.
queue.put(2)
queue.put(1)
queue.put(3)


def worker(payload):
    """Sleep for payload[1] seconds and return the payload unchanged."""
    duration = payload[1]
    time.sleep(duration)
    return payload


iterator = QueueIterator(queue)

# Single worker thread; the `with iterator` block presumably marks the
# current task as handled (confirm QueueIterator's context manager).
for i, t in imap_unordered(enumerate(iterator), worker, 1):
    with iterator:
        print('Done waiting %i' % t)

        # Enqueue one extra item mid-iteration to show that late
        # additions are still consumed.
        if i < 1:
            queue.put(4)
Beispiel #22
0
    def test_arguments(self):
        """Invalid arguments must fail fast with TypeError (or RuntimeError
        for executor misuse) rather than breaking mid-iteration."""
        # Iterable must actually be iterable.
        with pytest.raises(TypeError):
            imap_unordered(None, sleeper, 4)

        # Worker must be callable.
        with pytest.raises(TypeError):
            imap_unordered(DATA, 'test', 4)

        # Thread count must be an integer.
        with pytest.raises(TypeError):
            imap_unordered(DATA, sleeper, 'test')

        # key must be callable.
        with pytest.raises(TypeError):
            imap_unordered(DATA, sleeper, 4, key='test')

        # parallelism must be positive.
        with pytest.raises(TypeError):
            imap_unordered(DATA, sleeper, 4, parallelism=-1, key=itemgetter(0))

        # buffer_size must be an integer...
        with pytest.raises(TypeError):
            imap_unordered(DATA,
                           sleeper,
                           4,
                           parallelism=1,
                           key=itemgetter(0),
                           buffer_size='test')

        # ...and non-zero.
        with pytest.raises(TypeError):
            imap_unordered(DATA, sleeper, 4, parallelism=1, buffer_size=0)

        # with pytest.raises(TypeError):
        #     imap_unordered(DATA, sleeper, 2, parallelism=4, key=itemgetter(0))

        # throttle must be a number or callable...
        with pytest.raises(TypeError):
            imap_unordered(DATA,
                           sleeper,
                           2,
                           key=itemgetter(0),
                           throttle='test')

        # ...and non-negative.
        with pytest.raises(TypeError):
            imap_unordered(DATA, sleeper, 2, key=itemgetter(0), throttle=-4)

        # Using an executor after its context has exited must raise.
        with pytest.raises(RuntimeError):
            with ThreadPoolExecutor(4) as executor:
                pass

            executor.imap(DATA, sleeper)

        # Re-entering the executor from one of its own workers must raise.
        with pytest.raises(RuntimeError):
            with ThreadPoolExecutor(4) as executor:

                def work(item):
                    executor.imap_unordered(DATA, sleeper)

                list(executor.imap(DATA, work))

        # join and daemonic must be booleans.
        with pytest.raises(TypeError):
            imap(DATA, sleeper, 2, join='test')

        with pytest.raises(TypeError):
            imap(DATA, sleeper, 2, daemonic='test')
Beispiel #23
0
def multithreaded_fetch(iterator,
                        key=None,
                        request_args=None,
                        threads=25,
                        throttle=DEFAULT_THROTTLE,
                        guess_extension=True,
                        guess_encoding=True,
                        buffer_size=DEFAULT_GROUP_BUFFER_SIZE,
                        insecure=False,
                        timeout=None,
                        domain_parallelism=DEFAULT_GROUP_PARALLELISM):
    """
    Function returning a multithreaded iterator over fetched urls.

    Args:
        iterator (iterable): An iterator over urls or arbitrary items.
        key (callable, optional): Function extracting url from yielded items.
        request_args (callable, optional): Function returning specific
            arguments to pass to the request util per yielded item.
        threads (int, optional): Number of threads to use. Defaults to 25.
        throttle (float or callable, optional): Per-domain throttle in seconds.
            Or a function taking domain name and item and returning the
            throttle to apply. Defaults to 0.2.
        guess_extension (bool, optional): Attempt to guess the resource's
            extension? Defaults to True.
        guess_encoding (bool, optional): Attempt to guess the resource's
            encoding? Defaults to True.
        domain_parallelism (int, optional): Max number of urls per domain to
            hit at the same time. Defaults to 1.
        buffer_size (int, optional): Max number of items per domain to enqueue
            into memory in hope of finding a new domain that can be processed
            immediately. Defaults to 1.
        insecure (bool, optional): Whether to ignore SSL certification errors
            when performing requests. Defaults to False.
        timeout (float or urllib3.Timeout, optional): Custom timeout for every
            request.

    Yields:
        FetchWorkerResult

    """

    # Creating the http pool manager
    http = create_pool(threads=threads, insecure=insecure, timeout=timeout)

    # Thread worker
    def worker(payload):
        http, item, url = payload

        # Items without a url short-circuit to an empty result so every
        # input item still yields exactly one output.
        if url is None:
            return FetchWorkerResult(url=None,
                                     item=item,
                                     response=None,
                                     error=None,
                                     meta=None)

        kwargs = request_args(url, item) if request_args is not None else {}

        error, response = request(http, url, **kwargs)

        if error:
            return FetchWorkerResult(url=url,
                                     item=item,
                                     response=response,
                                     error=error,
                                     meta=None)

        # Forcing urllib3 to read data in thread
        data = response.data

        # Meta
        meta = extract_response_meta(response,
                                     guess_encoding=guess_encoding,
                                     guess_extension=guess_extension)

        return FetchWorkerResult(url=url,
                                 item=item,
                                 response=response,
                                 error=error,
                                 meta=meta)

    # Group resolver (url-less payloads fall into the None group)
    def grouper(payload):
        if payload.url is None:
            return

        return get_domain_name(payload.url)

    # Thread payload iterator
    def payloads():
        for item in iterator:
            url = item if key is None else key(item)

            # Falsy urls are still forwarded, with url=None.
            if not url:
                yield FetchWorkerPayload(http=http, item=item, url=None)

                continue

            # Url cleanup
            url = ensure_protocol(url.strip())

            yield FetchWorkerPayload(http=http, item=item, url=url)

    return imap_unordered(payloads(),
                          worker,
                          threads,
                          group=grouper,
                          group_parallelism=domain_parallelism,
                          group_buffer_size=buffer_size,
                          group_throttle=throttle)
Beispiel #24
0
    def test_basics(self):
        """Unordered mapping returns every job exactly once."""
        output = list(imap_unordered(DATA, sleeper, 2))

        assert len(output) == len(DATA)
        assert set(output) == set(DATA)
Beispiel #25
0
    def test_none_iterator(self):
        """None is a legitimate job value and must round-trip intact."""
        jobs = [None] * 3

        assert list(imap_unordered(jobs, identity, 2)) == jobs
Beispiel #26
0
# =============================================================================
# Quenouille Exception Testing
# =============================================================================
#
# Testing what happens when exceptions are thrown.
#
from quenouille import imap_unordered


def crasher(i):
    """Return *i*, but blow up once it exceeds 7."""
    if i <= 7:
        return i

    raise Exception('Die!')


# The worker's exception should propagate out of the consuming loop.
for result in imap_unordered(range(15), crasher, 3):
    print(result)
Beispiel #27
0
import time
from operator import itemgetter
from quenouille import imap_unordered, NamedLocks

# (name, id) jobs; the two 'B' jobs will contend for the same named lock.
DATA = [('A', 1), ('B', 2), ('B', 3)]

locks = NamedLocks()


def worker(t):
    """Hold the lock named after t[0] for three seconds, then return t."""
    named_lock = locks[t[0]]
    print(t, named_lock)

    with named_lock:
        time.sleep(3)
        return t


print('start')
# Five threads, but jobs sharing a name serialize on their named lock.
for i in imap_unordered(DATA, worker, 5):
    print(i)

print(locks)