def first():
    x = q.get()
    # Here, we call '.task_done()' as soon as we've received the
    # value from 'q'. This means that past this point, both 'q' and
    # 'r' will be empty with no pending tasks, which means both
    # calls to '.join()' below can unblock -- even though 'x' is
    # nowhere near having made it through the entire pipeline.
    q.task_done()
    # The sleep simulates some more work that this thread has to
    # perform on 'x' before sending it on to the second thread. It
    # exacerbates the issue caused by the wrong placement of the
    # '.task_done()' call -- it makes it less likely that the
    # outcome will vary depending on how the threads happen to get
    # scheduled on each particular run, making the results more
    # predictable.
    time.sleep(1)
    print_box(f"first: {x = } (potentially after queues have joined)")
    r.put(x)

def second():
    # Similar comments apply as in the first thread: '.task_done()'
    # is called too soon, so the joins can unblock, even though this
    # thread is nowhere near done yet. The sleep again serves to
    # exacerbate the issue.
    x = r.get()
    r.task_done()
    time.sleep(1)
    print_box(f"second: {x = } (potentially after queues have joined)")

def first():
    x = q.get()
    time.sleep(1)
    r.put(x)
    print_box(f"first: {x = } (reliably always before queues join)")
    # And this is the correct alternative. How to think about it
    # intuitively, so that you don't make the mistake in a larger,
    # more complicated program?
    #
    # Conceptually, the "task" here is not only to get a value from
    # 'q', but to also do some stuff with it (well, sleep, in our
    # case) and then *hand it off to 'r'*. So we can't consider the
    # task "done" until the hand-off has actually happened. At that
    # point, we've passed 'x' to the next part of the pipeline, so
    # we can unblock this one, as we know 'r' will take over
    # blocking until that next part is done too.
    #
    # Visually, there needs to be an inter(b)locking (see what I did
    # there?) pattern between the different parts of the pipeline:
    #
    # ┌    join on thread 1 blocks
    # │
    # │ ┐  join on thread 2 blocks
    # │ │
    # └ │  join on thread 1 unblocks
    #   │
    #   ┘  join on thread 2 unblocks
    q.task_done()

def second():
    x = r.get()
    time.sleep(1)
    print_box(f"second: {x = } (reliably always before queues join)")
    r.task_done()
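
# Both pairs of 'first()'/'second()' above come from a larger file whose
# shared setup isn't part of this excerpt. A minimal driver along the
# following lines should make either pair runnable on its own -- the
# queues 'q' and 'r' are taken from the snippets, while 'print_box()'
# and 'main()' here are assumptions of this sketch, not the original
# helpers:

import queue
import threading
import time

q = queue.Queue()
r = queue.Queue()

def print_box(msg):
    # Hypothetical stand-in for the tutorial's fancier helper.
    print(f"| {msg} |")

# ... paste exactly one of the two first()/second() pairs here ...

def main():
    # Non-daemon threads, so that the interpreter waits for them even
    # when the joins below unblock too early (the broken variant).
    threading.Thread(target=first).start()
    threading.Thread(target=second).start()
    q.put(42)
    q.join()
    r.join()
    print_box("queues have joined")

main()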
Example #5
def main():
    print_box("wait for a few jobs to complete, then press Ctrl-C")

    # We only need to hang onto the join handle of the producer thread,
    # as it's the only one we need to join at a specific point in the
    # execution of our program.
    pt = threading.Thread(target=producer)
    pt.start()
    for _ in range(CONSUMER_WORKERS):
        threading.Thread(target=consumer, daemon=True).start()
    threading.Thread(target=logger, daemon=True).start()

    # The main thread's only job now is to wait for a signal that the
    # application should exit. This is why the logging was moved to a
    # different thread: the KeyboardInterrupt would break out of that
    # loop, whereas we actually want the logging to continue as the
    # program is winding down in an orderly fashion (which is
    # orchestrated by the main thread, see below).
    try:
        threading.Event().wait()
    except KeyboardInterrupt:
        print_box("caught keyboard interrupt")

    # Technically, prints in the main thread are now subject to a race
    # condition, because we're also printing from the logger thread at
    # the same time. Stdout is a resource that should be managed by a
    # single thread -- see Hettinger's talk on concurrency. The stdout
    # buffer is not threadsafe, and you can easily get garbled output
    # when multiple threads try printing at the same time, or worse --
    # see <https://stackoverflow.com/q/40356200>.
    #
    # But occasionally seeing that garbled output (the boxes of messages
    # printed with 'print_box()' can get interspersed with lines from
    # the logger thread) is a good reminder that you shouldn't do this,
    # so let's leave it as is.
    #
    # NOTE: In practice, you could use the logging module instead, which
    # is threadsafe.

    # Tell the producer thread to stop accepting/submitting new tasks to
    # the 'INPUTQ'.
    print_box("signaling producer to stop producing")
    EXIT.set()

    # Now, we need to wait for the producer to actually exit. This
    # matters because the next step will be to wait for the queues to
    # empty, which only makes sense if we know there aren't any more
    # tasks incoming.
    #
    # What would happen without this '.join()'? Imagine our producer is
    # (potentially) slow at creating tasks. Maybe they're coming in over
    # the network, which involves wait times. It gets a signal to exit
    # while it's waiting for a task which will eventually become the
    # final one. In the meantime, the consumer and logger process the
    # contents of the queues, emptying them out, and as soon as the
    # queues are empty, the program is poised to exit, but it's actually
    # still blocked by that producer thread (which is non-daemon and
    # still running). Eventually, the final task gets submitted to the
    # 'INPUTQ', but the trouble is that it might not get processed,
    # because the consumer and logger daemon threads might get killed
    # before they get a chance to do so, as there's nothing stopping the
    # program from exiting now.
    #
    # So that's why it's a good idea to check that the producer is done
    # at this point, before waiting on the queues to be emptied out.
    print_box("making sure producer is done producing")
    pt.join()

    # Our trusty daemon threads are continuing to spin, handling tasks
    # as they get them via the two queues. Given enough time, they'll
    # get to the end of our unbounded stream of input (the stream is
    # unbounded in theory, but we've just told the producer to stop
    # producing, so it has an end in practice).
    #
    # Imagine the queues as pipes, the producer as a faucet, stdout as
    # the basin where all the logging messages a.k.a. water is supposed
    # to end up. If you close the faucet, the rest of the water will
    # eventually trickle down to the basin and the pipes will end up
    # empty, unless you obliterate the pipes before it's had a chance to
    # do so.
    #
    # Since our threads are daemon threads, Python is happy to
    # obliterate them whenever, even if they're in the middle of doing
    # something. That's not what we want. Luckily, our pipes have
    # '.join()' methods, which will make our program wait until they're
    # emptied out.
    #
    # It's important to realize that it's the loops in the two
    # daemon threads that ensure the water gets drained out, not the
    # calls to '.join()' themselves. Their purpose is just to say:
    # don't move past this point until both queues are empty -- and we
    # know they'll both empty out eventually.
    #
    # Still, the ordering of those '.join()' calls matters: they should
    # be in the same order as data flows through the pipeline.
    # Intuitively, in order to be sure that a later pipe is empty *for
    # good* (as opposed to just happening to be empty *at the moment*),
    # you need to be sure that there isn't any more stuff incoming
    # through the earlier stages of the pipeline. This is also discussed
    # in more detail in 10-task_done_placement.py, which is even more of
    # a toy example so that you can switch joins around and observe how
    # that affects behavior.
    print_box("waiting for queues to empty")
    INPUTQ.join()
    OUTPUTQ.join()

    # Finally, it's conceivable that you might want the consumer and/or
    # logger threads to perform some more complicated cleanup before
    # exiting. How to achieve that?
    #
    # Well, in such a case, you wouldn't make them daemon threads;
    # they'd be regular threads, and instead of a 'while True' loop,
    # they'd also have an event as their loop condition -- a separate
    # one from the one used by the producer thread, though! And this is
    # the point where you'd '.set()' that event -- after both queues
    # have been emptied, the threads can exit their processing loops
    # and perform any required cleanup.
    #
    # You could then leave it up to Python to join those threads as
    # it's exiting, because nothing else really happens beyond this
    # point, so we don't really care at which exact point the threads
    # terminate. This means we don't have to hang onto their join
    # handles when we create them at the beginning of 'main()', or
    # explicitly call '.join()' on them -- unlike for the producer
    # thread, whose shutdown must happen before waiting for the queues
    # to empty, as explained above. (A sketch of this variant follows
    # below, after 'main()'.)

    print_box("exiting gracefully")
Example #6
def producer():
    job_id = 0
    # Keep submitting jobs until the main thread signals us to stop;
    # the event is re-checked once per iteration.
    while not EXIT.is_set():
        INPUTQ.put((job_id, random.randint(0, 3)))
        job_id += 1
        # Throttle production (think of the network wait times
        # mentioned in 'main()'), so that 'INPUTQ' doesn't grow without
        # bound while we wait for the Ctrl-C; the exact pacing is
        # immaterial.
        time.sleep(0.5)
    print_box("exiting producer")