def test_cancel_waiting_workers(join_worker_pool):
    """
    If we have a small pool and many workers, it is possible for workers to be enqueued
    one after another in one thread.
    We test that if we call `cancel()`, these enqueued workers are cancelled too.
    """
    outcomes, worker = generate_workers(
        [
            (OperatorRule(timeout_min=1, timeout_max=1), 100),
        ],
        seed=123)

    factory = AllAtOnceFactory(list(outcomes))
    pool = WorkerPool(worker, factory, target_successes=10, timeout=10, threadpool_size=10)
    join_worker_pool(pool)

    t_start = time.monotonic()
    pool.start()
    pool.block_until_target_successes()
    pool.cancel()
    pool.join()
    t_end = time.monotonic()

    # We have 10 threads in the pool and 100 workers that are all enqueued at once at the start.
    # If we didn't check for the cancel condition, we would have to wait for 10 seconds.
    # We get 10 successes after 1s and cancel the workers,
    # but the next workers in each thread have already started, so we have to wait for another 1s.
    assert t_end - t_start < 2.5
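# These tests rely on a few helpers that are not shown in this file: `OperatorRule`,
# `generate_workers`, and `AllAtOnceFactory` (one test also uses `WorkerRule`, assumed here
# to be the same rule type under an older name). Below is a minimal sketch of what the tests
# assume about them, reconstructed from how they are called; the real implementations may differ.

import random
import time
from typing import NamedTuple


class OperatorRule(NamedTuple):
    # A rule describes a group of workers with a sleep range and an optional failure.
    timeout_min: float = 0
    timeout_max: float = 0
    fails: bool = False


WorkerRule = OperatorRule  # assumed alias; one test below uses this name


class WorkerOutcome(NamedTuple):
    timeout: float
    fails: bool


def generate_workers(rules, seed=None):
    """Expand (rule, quantity) pairs into a {value: outcome} map and a matching worker callable."""
    rng = random.Random(seed)
    outcomes = {}
    for rule, quantity in rules:
        for _ in range(quantity):
            value = len(outcomes)
            timeout = rng.uniform(rule.timeout_min, rule.timeout_max)
            outcomes[value] = WorkerOutcome(timeout=timeout, fails=rule.fails)

    def worker(value):
        # Sleep for the predetermined time, then either fail or return the value.
        outcome = outcomes[value]
        time.sleep(outcome.timeout)
        if outcome.fails:
            raise Exception(f"Operator for {value} failed")
        return value

    return outcomes, worker


class AllAtOnceFactory:
    """A value factory that hands out all of its values in a single batch, then signals exhaustion."""

    def __init__(self, values):
        self.values = values
        self._produced = False

    def __call__(self, successes):
        if self._produced:
            return None
        self._produced = True
        return self.values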
def test_wait_for_successes_out_of_values(join_worker_pool):
    """
    Checks that if there weren't enough successful workers, `block_until_target_successes()`
    raises an exception when the value factory is exhausted.
    """
    outcomes, worker = generate_workers(
        [
            (WorkerRule(timeout_min=0.5, timeout_max=1.5), 9),
            (WorkerRule(fails=True, timeout_min=0.5, timeout_max=1.5), 20),
        ],
        seed=123)

    factory = AllAtOnceFactory(list(outcomes))
    pool = WorkerPool(worker, factory, target_successes=10, timeout=10, threadpool_size=15)
    join_worker_pool(pool)

    t_start = time.monotonic()
    pool.start()
    with pytest.raises(WorkerPool.OutOfValues):
        successes = pool.block_until_target_successes()
    t_end = time.monotonic()

    # We have roughly 2 workers per thread, so it shouldn't take longer than 1.5s (max timeout) * 2
    assert t_end - t_start < 4
def test_wait_for_successes_timed_out(join_worker_pool):
    """
    Checks that if enough successful workers can't finish before the timeout, we get an exception.
    """
    outcomes, worker = generate_workers(
        [
            (OperatorRule(timeout_min=0, timeout_max=0.5), 9),
            (OperatorRule(timeout_min=1.5, timeout_max=2.5), 1),
            (OperatorRule(fails=True, timeout_min=1.5, timeout_max=2.5), 20),
        ],
        seed=123)

    factory = AllAtOnceFactory(list(outcomes))
    timeout = 1
    pool = WorkerPool(worker, factory, target_successes=10, timeout=timeout, threadpool_size=30)
    join_worker_pool(pool)

    t_start = time.monotonic()
    pool.start()
    with pytest.raises(WorkerPool.TimedOut) as exc_info:
        successes = pool.block_until_target_successes()
    t_end = time.monotonic()

    # Even though the timeout is 1s, there are long-running workers which we can't interrupt.
    assert t_end - t_start < 3

    message = str(exc_info.value)

    # None of the workers actually failed, they just timed out
    assert f"Execution timed out after {timeout}s" == message
def test_join(join_worker_pool):
    """
    Test joining the pool.
    """
    outcomes, worker = generate_workers(
        [
            (OperatorRule(timeout_min=0.5, timeout_max=1.5), 9),
            (OperatorRule(fails=True, timeout_min=0.5, timeout_max=1.5), 20),
        ],
        seed=123)

    factory = AllAtOnceFactory(list(outcomes))
    pool = WorkerPool(worker, factory, target_successes=10, timeout=1, threadpool_size=30)
    join_worker_pool(pool)

    t_start = time.monotonic()
    pool.start()
    pool.join()
    t_end = time.monotonic()

    pool.join()  # should work the second time too

    # Even though the timeout is 1s, there are long-running workers which we can't interrupt.
    assert t_end - t_start < 3
def get_ursulas(self,
                quantity: int,
                duration_periods: int = None,  # optional for federated mode
                exclude_ursulas: Optional[Sequence[ChecksumAddress]] = None,
                include_ursulas: Optional[Sequence[ChecksumAddress]] = None) -> List[UrsulaInfo]:
    reservoir = self._make_staker_reservoir(quantity, duration_periods, exclude_ursulas, include_ursulas)
    value_factory = PrefetchStrategy(reservoir, quantity)

    def get_ursula_info(ursula_address) -> Porter.UrsulaInfo:
        if ursula_address not in self.known_nodes:
            raise ValueError(f"{ursula_address} is not known")

        ursula = self.known_nodes[ursula_address]
        try:
            # verify node is valid
            self.network_middleware.client.verify_and_parse_node_or_host_and_port(
                node_or_sprout=ursula, host=None, port=None)
            return Porter.UrsulaInfo(checksum_address=ursula_address,
                                     uri=f"{ursula.rest_interface.formal_uri}",
                                     encrypting_key=ursula.public_keys(DecryptingPower))
        except Exception as e:
            self.log.debug(f"Unable to obtain Ursula information ({ursula_address}): {str(e)}")
            raise

    self.block_until_number_of_known_nodes_is(quantity,
                                              timeout=self.DEFAULT_EXECUTION_TIMEOUT,
                                              learn_on_this_thread=True,
                                              eager=True)

    worker_pool = WorkerPool(worker=get_ursula_info,
                             value_factory=value_factory,
                             target_successes=quantity,
                             timeout=self.DEFAULT_EXECUTION_TIMEOUT,
                             stagger_timeout=1,
                             threadpool_size=quantity)
    worker_pool.start()
    successes = worker_pool.block_until_target_successes()
    ursulas_info = successes.values()
    return list(ursulas_info)
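# `PrefetchStrategy`, used above and in several snippets below, is a value factory built
# around a sampling reservoir. A minimal sketch of the assumed behaviour: on each call it
# draws just enough fresh addresses to cover the successes that are still missing, and
# returns `None` once the reservoir is exhausted, which lets WorkerPool raise `OutOfValues`.
# The `draw_at_most` method on the reservoir is an assumption inferred from this usage.


class PrefetchStrategy:

    def __init__(self, reservoir, need_successes):
        self.reservoir = reservoir
        self.need_successes = need_successes

    def __call__(self, successes):
        # Ask only for the shortfall, so that slow or failing workers trigger fresh draws.
        batch = self.reservoir.draw_at_most(self.need_successes - successes)
        if not batch:
            return None
        return batch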
def test_wait_for_successes(join_worker_pool):
    """
    Checks that `block_until_target_successes()` returns in time and gives all the successes,
    if there were enough of them.
    """
    outcomes, worker = generate_workers(
        [
            (OperatorRule(timeout_min=0.5, timeout_max=1.5), 10),
            (OperatorRule(fails=True, timeout_min=1, timeout_max=3), 20),
        ],
        seed=123)

    factory = AllAtOnceFactory(list(outcomes))
    pool = WorkerPool(worker, factory, target_successes=10, timeout=10, threadpool_size=30)
    join_worker_pool(pool)

    t_start = time.monotonic()
    pool.start()
    successes = pool.block_until_target_successes()
    t_end = time.monotonic()

    failures = pool.get_failures()
    assert all(outcomes[value].fails for value in failures)

    assert len(successes) == 10

    # We have more threads in the pool than the workers,
    # so all the successful ones should be able to finish right away.
    assert t_end - t_start < 2

    # Should be able to do it several times
    successes = pool.block_until_target_successes()
    assert len(successes) == 10
def test_wait_for_successes_out_of_values(join_worker_pool):
    """
    Checks that if there weren't enough successful workers, `block_until_target_successes()`
    raises an exception when the value factory is exhausted.
    """
    outcomes, worker = generate_workers(
        [
            (OperatorRule(timeout_min=0.5, timeout_max=1.5), 9),
            (OperatorRule(fails=True, timeout_min=0.5, timeout_max=1.5), 20),
        ],
        seed=123)

    factory = AllAtOnceFactory(list(outcomes))
    pool = WorkerPool(worker, factory, target_successes=10, timeout=10, threadpool_size=15)
    join_worker_pool(pool)

    t_start = time.monotonic()
    pool.start()
    with pytest.raises(WorkerPool.OutOfValues) as exc_info:
        successes = pool.block_until_target_successes()
    t_end = time.monotonic()

    # We have roughly 2 workers per thread, so it shouldn't take longer than 1.5s (max timeout) * 2
    assert t_end - t_start < 4

    message = str(exc_info.value)
    assert "Execution stopped before completion - not enough available values" in message

    # We had 20 workers set up to fail
    num_expected_failures = 20
    assert f"{num_expected_failures} failures recorded" in message

    # check tracebacks
    tracebacks = exc_info.value.get_tracebacks()
    assert len(tracebacks) == num_expected_failures
    for value, traceback in tracebacks.items():
        assert 'raise Exception(f"Operator for {value} failed")' in traceback
        assert f'Operator for {value} failed' in traceback

    # This will be the last line in the displayed traceback;
    # that's where the worker actually failed ("Operator for {value} failed").
    assert 'raise Exception(f"Operator for {value} failed")' in message
def test_buggy_factory_raises_on_join():
    """
    Tests that if there is an exception thrown in the value factory,
    it is caught in the first call to `join()`.
    """
    outcomes, worker = generate_workers(
        [(OperatorRule(timeout_min=1, timeout_max=1), 100)],
        seed=123)

    factory = BuggyFactory(list(outcomes))
    pool = WorkerPool(worker, factory, target_successes=10, timeout=10, threadpool_size=10)

    pool.start()
    pool.cancel()
    with pytest.raises(Exception, match="Buggy factory"):
        pool.join()
    with pytest.raises(Exception, match="Buggy factory"):
        pool.join()
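# `BuggyFactory` is not shown in this file. A plausible sketch, consistent with how the
# buggy-factory tests use it: it serves its values on the first request and raises on any
# later one, so the error only surfaces once WorkerPool asks the factory for a second batch.


class BuggyFactory:

    def __init__(self, values):
        self.values = values

    def __call__(self, successes):
        if self.values is not None:
            values = self.values
            self.values = None
            return values
        # Any subsequent request reproduces the factory failure the tests match against.
        raise Exception("Buggy factory")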
def get_ursulas(self,
                quantity: int,
                exclude_ursulas: Optional[Sequence[ChecksumAddress]] = None,
                include_ursulas: Optional[Sequence[ChecksumAddress]] = None) -> List[UrsulaInfo]:
    reservoir = self._make_reservoir(quantity, exclude_ursulas, include_ursulas)
    value_factory = PrefetchStrategy(reservoir, quantity)

    def get_ursula_info(ursula_address) -> Porter.UrsulaInfo:
        if to_checksum_address(ursula_address) not in self.known_nodes:
            raise ValueError(f"{ursula_address} is not known")

        ursula_address = to_checksum_address(ursula_address)
        ursula = self.known_nodes[ursula_address]
        try:
            # ensure node is up and reachable
            self.network_middleware.ping(ursula)
            return Porter.UrsulaInfo(checksum_address=ursula_address,
                                     uri=f"{ursula.rest_interface.formal_uri}",
                                     encrypting_key=ursula.public_keys(DecryptingPower))
        except Exception as e:
            self.log.debug(f"Ursula ({ursula_address}) is unreachable: {str(e)}")
            raise

    self.block_until_number_of_known_nodes_is(quantity,
                                              timeout=self.execution_timeout,
                                              learn_on_this_thread=True,
                                              eager=True)

    worker_pool = WorkerPool(worker=get_ursula_info,
                             value_factory=value_factory,
                             target_successes=quantity,
                             timeout=self.execution_timeout,
                             stagger_timeout=1)
    worker_pool.start()
    try:
        successes = worker_pool.block_until_target_successes()
    finally:
        worker_pool.cancel()
        # don't wait for it to stop by "joining" - too slow...

    ursulas_info = successes.values()
    return list(ursulas_info)
class TreasureMapPublisher:

    log = Logger('TreasureMapPublisher')

    def __init__(self,
                 worker,
                 nodes,
                 percent_to_complete_before_release=5,
                 threadpool_size=120,
                 timeout=20):

        self._total = len(nodes)
        self._block_until_this_many_are_complete = math.ceil(
            len(nodes) * percent_to_complete_before_release / 100)
        self._worker_pool = WorkerPool(worker=worker,
                                       value_factory=AllAtOnceFactory(nodes),
                                       target_successes=self._block_until_this_many_are_complete,
                                       timeout=timeout,
                                       stagger_timeout=0,
                                       threadpool_size=threadpool_size)

    @property
    def completed(self):
        # TODO: lock dict before copying?
        return self._worker_pool.get_successes()

    def start(self):
        self.log.info("TreasureMapPublisher starting")
        self._worker_pool.start()
        if reactor.running:
            reactor.callInThread(self.block_until_complete)

    def block_until_success_is_reasonably_likely(self):
        # Note: `OutOfValues`/`TimedOut` may be raised here, which means we didn't even get to
        # `percent_to_complete_before_release` successes. For now just letting it fire.
        self._worker_pool.block_until_target_successes()
        completed = self.completed
        self.log.debug(f"The minimal amount of nodes ({len(completed)}) was contacted "
                       "while blocking for treasure map publication.")
        return completed

    def block_until_complete(self):
        self._worker_pool.join()
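# A hedged usage sketch for `TreasureMapPublisher`: the caller starts the publisher, blocks only
# until the small initial fraction of nodes has confirmed, and lets the remaining publications
# complete in the background via the reactor thread kicked off in `start()`. The `publish_to_node`
# worker and the `nodes` list below are hypothetical placeholders, not part of the original code.


def publish_treasure_map_example(publish_to_node, nodes):
    publisher = TreasureMapPublisher(worker=publish_to_node,
                                     nodes=nodes,
                                     percent_to_complete_before_release=5,
                                     timeout=20)
    publisher.start()
    # Returns as soon as roughly 5% of the nodes have accepted the treasure map.
    initial_responses = publisher.block_until_success_is_reasonably_likely()
    return initial_responses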
def _enact_arrangements(self,
                        network_middleware: RestMiddleware,
                        arrangements: Dict[Ursula, Arrangement],
                        publication_transaction: Optional[HexBytes] = None,
                        publish_treasure_map: bool = True,
                        timeout: int = 10,
                        ):
    """
    Attempts to distribute kfrags to Ursulas that accepted arrangements earlier.
    """

    def worker(ursula_and_kfrag):
        ursula, kfrag = ursula_and_kfrag
        arrangement = arrangements[ursula]

        # TODO: seems like it would be enough to just encrypt this with Ursula's public key,
        # and not create a whole capsule.
        # Can't change for now since it's node protocol.
        payload = self._make_enactment_payload(publication_transaction, kfrag)
        message_kit, _signature = self.alice.encrypt_for(ursula, payload)

        try:
            # TODO: Concurrency
            response = network_middleware.enact_policy(ursula, arrangement.id, message_kit.to_bytes())
        except network_middleware.UnexpectedResponse as e:
            status = e.status
        else:
            status = response.status_code

        return status

    value_factory = AllAtOnceFactory(list(zip(arrangements, self.kfrags)))
    worker_pool = WorkerPool(worker=worker,
                             value_factory=value_factory,
                             target_successes=self.n,
                             timeout=timeout,
                             threadpool_size=self.n)

    worker_pool.start()

    # Block until everything is complete. We need all the workers to finish.
    worker_pool.join()

    successes = worker_pool.get_successes()

    if len(successes) != self.n:
        raise Policy.EnactmentError()

    # TODO: Enable re-tries?
    statuses = {ursula_and_kfrag[0].checksum_address: status
                for ursula_and_kfrag, status in successes.items()}
    if not all(status == 200 for status in statuses.values()):
        report = "\n".join(f"{address}: {status}" for address, status in statuses.items())
        self.log.debug(f"Policy enactment failed. Request statuses:\n{report}")

        # OK, let's check: if two or more Ursulas claimed we didn't pay,
        # we need to re-evaluate our situation here.
        number_of_claims_of_freeloading = sum(status == 402 for status in statuses.values())

        # TODO: a better exception here?
        if number_of_claims_of_freeloading > 2:
            raise self.alice.NotEnoughNodes

        # otherwise just raise a more generic error
        raise Policy.EnactmentError()
def test_buggy_factory_raises_on_block():
    """
    Tests that if there is an exception thrown in the value factory,
    it is caught in the first call to `block_until_target_successes()`.
    """
    outcomes, worker = generate_workers(
        [(OperatorRule(timeout_min=1, timeout_max=1), 100)],
        seed=123)

    factory = BuggyFactory(list(outcomes))

    # Non-zero stagger timeout to make BuggyFactory raise its error only after 1.5s,
    # so that we get enough successes for `block_until_target_successes()`.
    pool = WorkerPool(worker, factory, target_successes=10, timeout=10,
                      threadpool_size=10, stagger_timeout=1.5)

    pool.start()
    time.sleep(2)  # wait for the stagger timeout to finish
    with pytest.raises(Exception, match="Buggy factory"):
        pool.block_until_target_successes()

    # Further calls to `block_until_target_successes()` or `join()` raise the same error again.
    with pytest.raises(Exception, match="Buggy factory"):
        pool.block_until_target_successes()
    pool.cancel()
    with pytest.raises(Exception, match="Buggy factory"):
        pool.join()
def test_buggy_factory_raises_on_block():
    """
    Tests that if there is an exception thrown in the value factory,
    it is caught in the first call to `block_until_target_successes()`.
    """
    outcomes, worker = generate_workers(
        [(OperatorRule(timeout_min=1, timeout_max=1), 100)],
        seed=123)

    factory = BuggyFactory(list(outcomes))

    # WorkerPool short-circuits once it has sufficient successes, and BuggyFactory only fails
    # when a subsequent batch is requested. Therefore the stagger timeout needs to be less than
    # the worker timeout: once the subsequent batch is requested, BuggyFactory raises its error,
    # causing WorkerPool to fail.
    pool = WorkerPool(worker, factory, target_successes=10, timeout=10,
                      threadpool_size=10, stagger_timeout=0.75)

    pool.start()
    time.sleep(2)  # wait for the stagger timeout to finish
    with pytest.raises(Exception, match="Buggy factory"):
        pool.block_until_target_successes()

    # Further calls to `block_until_target_successes()` or `join()` raise the same error again.
    with pytest.raises(Exception, match="Buggy factory"):
        pool.block_until_target_successes()
    pool.cancel()
    with pytest.raises(Exception, match="Buggy factory"):
        pool.join()
def test_batched_value_generation(join_worker_pool):
    """
    Tests a value factory that gives out value batches in portions.
    """
    outcomes, worker = generate_workers(
        [
            (OperatorRule(timeout_min=0.5, timeout_max=1.5), 80),
            (OperatorRule(fails=True, timeout_min=0.5, timeout_max=1.5), 80),
        ],
        seed=123)

    factory = BatchFactory(list(outcomes))
    pool = WorkerPool(worker, factory, target_successes=10, timeout=10,
                      threadpool_size=10, stagger_timeout=0.5)
    join_worker_pool(pool)

    t_start = time.monotonic()
    pool.start()
    successes = pool.block_until_target_successes()
    pool.cancel()
    pool.join()
    t_end = time.monotonic()

    assert len(successes) == 10

    # Check that batch sizes in the factory were getting progressively smaller
    # as the number of successes grew.
    assert all(factory.batch_sizes[i] >= factory.batch_sizes[i+1]
               for i in range(len(factory.batch_sizes) - 1))

    # Since we canceled the pool, no more workers will be started and we will finish faster
    assert t_end - t_start < 4

    successes_copy = pool.get_successes()
    failures_copy = pool.get_failures()

    assert all(value in successes_copy for value in successes)
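# `BatchFactory` is the test-only value factory exercised above. A minimal sketch consistent
# with the assertions: it records the size of every batch it hands out, and each batch only
# covers the successes still missing, so batch sizes shrink as successes accumulate. The
# hardcoded target of 10 mirrors the test's `target_successes=10` and is an assumption.


class BatchFactory:

    def __init__(self, values):
        self.values = values
        self.batch_sizes = []

    def __call__(self, successes):
        if successes == 10:
            return None
        batch_size = 10 - successes
        if self.values:
            batch = self.values[:batch_size]
            self.batch_sizes.append(len(batch))
            self.values = self.values[batch_size:]
            return batch
        return None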
def _sample(self,
            network_middleware: RestMiddleware,
            ursulas: Optional[Iterable['Ursula']] = None,
            timeout: int = 10,
            ) -> List['Ursula']:
    """Send concurrent requests to the /ping HTTP endpoint of nodes drawn from the reservoir."""
    ursulas = ursulas or []
    handpicked_addresses = [ChecksumAddress(ursula.checksum_address) for ursula in ursulas]

    self.publisher.block_until_number_of_known_nodes_is(self.shares, learn_on_this_thread=True, eager=True)
    reservoir = self._make_reservoir(handpicked_addresses)
    value_factory = PrefetchStrategy(reservoir, self.shares)

    def worker(address) -> 'Ursula':
        return self._ping_node(address, network_middleware)

    worker_pool = WorkerPool(worker=worker,
                             value_factory=value_factory,
                             target_successes=self.shares,
                             timeout=timeout,
                             stagger_timeout=1,
                             threadpool_size=self.shares)

    worker_pool.start()
    try:
        successes = worker_pool.block_until_target_successes()
    except (WorkerPool.OutOfValues, WorkerPool.TimedOut):
        # It's possible to raise some other exceptions here but we will use the logic below.
        successes = worker_pool.get_successes()
    finally:
        worker_pool.cancel()
        worker_pool.join()
    failures = worker_pool.get_failures()

    accepted_addresses = ", ".join(ursula.checksum_address for ursula in successes.values())
    if len(successes) < self.shares:
        rejections = "\n".join(f"{address}: {value}"
                               for address, (type_, value, traceback) in failures.items())
        message = "Failed to contact enough sampled nodes.\n" \
                  f"Selected:\n{accepted_addresses}\n" \
                  f"Unavailable:\n{rejections}"
        self.log.debug(message)
        raise self.NotEnoughUrsulas(message)

    self.log.debug(f"Selected nodes for policy: {accepted_addresses}")
    ursulas = list(successes.values())
    return ursulas
def _make_arrangements(self,
                       network_middleware: RestMiddleware,
                       handpicked_ursulas: Optional[Iterable[Ursula]] = None,
                       timeout: int = 10,
                       ) -> Dict[Ursula, Arrangement]:
    """
    Pick some Ursula addresses and send them arrangement proposals.
    Returns a dictionary of Ursulas to Arrangements if it managed to get `n` responses.
    """
    if handpicked_ursulas is None:
        handpicked_ursulas = []
    handpicked_addresses = [ursula.checksum_address for ursula in handpicked_ursulas]

    reservoir = self._make_reservoir(handpicked_addresses)
    value_factory = PrefetchStrategy(reservoir, self.n)

    def worker(address):
        return self._propose_arrangement(address, network_middleware)

    self.alice.block_until_number_of_known_nodes_is(self.n, learn_on_this_thread=True, eager=True)

    worker_pool = WorkerPool(worker=worker,
                             value_factory=value_factory,
                             target_successes=self.n,
                             timeout=timeout,
                             stagger_timeout=1,
                             threadpool_size=self.n)
    worker_pool.start()
    try:
        successes = worker_pool.block_until_target_successes()
    except (WorkerPool.OutOfValues, WorkerPool.TimedOut):
        # It's possible to raise some other exceptions here,
        # but we will use the logic below.
        successes = worker_pool.get_successes()
    finally:
        worker_pool.cancel()
        worker_pool.join()

    accepted_arrangements = {ursula: arrangement for ursula, arrangement in successes.values()}
    failures = worker_pool.get_failures()

    accepted_addresses = ", ".join(ursula.checksum_address for ursula in accepted_arrangements)

    if len(accepted_arrangements) < self.n:
        rejected_proposals = "\n".join(f"{address}: {value}"
                                       for address, (type_, value, traceback) in failures.items())
        self.log.debug("Could not find enough Ursulas to accept proposals.\n"
                       f"Accepted: {accepted_addresses}\n"
                       f"Rejected:\n{rejected_proposals}")
        raise self._not_enough_ursulas_exception()
    else:
        self.log.debug(f"Finished proposing arrangements; accepted: {accepted_addresses}")

    return accepted_arrangements