def fast_tally_ballots(
    ballots: Sequence[CiphertextBallot],
    pool: Optional[Pool] = None,
) -> TALLY_TYPE:
    """
    Tallies the given ballots, returning a dictionary that maps each selection
    object_id to the ElGamalCiphertext holding the encrypted total for that
    selection.

    When a `Pool` is supplied, the inputs are repeatedly split into shards of
    `BALLOTS_PER_SHARD` and each shard is tallied via `pool.map`, until what
    remains fits in a single shard. Without a pool, everything is tallied
    sequentially in one pass. Progress bars are not currently supported.
    """
    pending: Sequence[TALLY_INPUT_TYPE] = ballots
    round_number = 1

    # Parallel reduction rounds: only while we have a pool and more inputs
    # than a single sequential pass should handle.
    while pool is not None and len(pending) > BALLOTS_PER_SHARD:
        shards = shard_list_uniform(pending, BALLOTS_PER_SHARD)
        log_and_print(
            f"tally iteration {round_number}: {len(pending)} partial tallies --> {len(shards)} shards"
        )
        pending = pool.map(func=sequential_tally, iterable=shards)
        round_number += 1

    # Whatever is left gets folded together locally.
    log_and_print(
        f"tally iteration {round_number} (FINAL): {len(pending)} partial tallies"
    )
    return sequential_tally(pending)
def sequential_tally(
    ptallies: Sequence[Optional[TALLY_INPUT_TYPE]],
) -> TALLY_TYPE:
    """
    Internal function: sequentially tallies all of the ciphertext ballots, or other
    partial tallies, and returns a partial tally.

    Failures propagate: if any input tally is `None` or an empty dict, the result
    is an empty dict, so that a failed shard can never be silently dropped from
    the total.

    :param ptallies: `CiphertextBallot`s and/or partial tallies (dicts from selection
      object_id to an accumulated ciphertext); `None` marks a failed input
    :return: a dict from selection object_id to the `elgamal_add` accumulation of
      every input's value for that selection, or `{}` on failure
    """
    # NOTE: this used to read `num_nones > 0 in ptallies`, which Python parses as the
    # chained comparison `num_nones > 0 and 0 in ptallies`. That was never true, so
    # failed (empty) partial tallies were silently merged into the result.
    num_failed = sum(1 for p in ptallies if p is None or p == {})
    if num_failed > 0:
        log_and_print(
            f"Found {num_failed} failed partial tallies, returning an empty tally"
        )
        return {}

    result: TALLY_TYPE = {}
    for ptally in ptallies:
        # we want do our computation purely in terms of TALLY_TYPE, so we'll convert CiphertextBallots
        if isinstance(ptally, CiphertextBallot):
            ptally = ciphertext_ballot_to_dict(ptally)

        if ptally is None:
            # should never happen, but paranoia to keep the type system happy
            return {}

        for k, counter_partial in ptally.items():
            if k in result:
                result[k] = elgamal_add(result[k], counter_partial)
            else:
                result[k] = counter_partial

    return result
def write_file_with_retries(
    full_file_name: Union[str, PurePath],
    contents: AnyStr,  # bytes or str
    num_retries: int = 1,
) -> None:
    """
    Helper function: given a fully resolved file path, or a path-like object describing
    a file location, writes the given contents to a file of that name, and if it fails,
    tries it again and again (based on the `num_retries` parameter). This works around
    occasional failures that happen, for no good reason, with s3fs-fuse in big clouds.

    :param full_file_name: where to write
    :param contents: `str` contents are written in text mode, anything else in binary mode
    :param num_retries: total number of attempts; values below one are treated as one,
      so the write is always attempted at least once
    :raises Exception: re-raises the exception from the last failed attempt
    """
    write_mode = "w" if isinstance(contents, str) else "wb"

    # A non-positive count used to skip the loop entirely, which silently wrote
    # nothing and raised nothing. Always make at least one attempt.
    attempts = max(1, num_retries)
    prev_exception = None

    for retry_number in range(attempts):
        try:
            with open(full_file_name, write_mode) as f:
                f.write(contents)
            return
        except Exception as e:
            prev_exception = e
            log_and_print(
                f"failed to write {full_file_name} (attempt #{retry_number}): {str(e)}"
            )

    if attempts > 1:
        log_and_print(
            f"giving up writing {full_file_name}: failed {attempts} times")

    if prev_exception is not None:
        raise prev_exception
def mkdir_helper(p: Union[str, Path], num_retries: int = 1) -> None:
    """
    Wrapper around `Path.mkdir` that will work correctly even if the directory already
    exists, creating any missing parent directories along the way.

    :param p: the directory to create, as a string or a `Path`
    :param num_retries: total number of attempts; values below one are treated as one,
      so the directory creation is always attempted at least once
    :raises Exception: re-raises the exception from the last failed attempt
    """
    path = Path(p)

    # A non-positive count used to skip the loop entirely, which silently created
    # nothing and raised nothing. Always make at least one attempt.
    attempts = max(1, num_retries)
    prev_exception = None

    for attempt in range(attempts):
        try:
            path.mkdir(parents=True, exist_ok=True)
            return
        except Exception as e:
            prev_exception = e
            log_and_print(
                f"failed to make directory {p} (attempt {attempt}): {str(e)}")

    if attempts > 1:
        log_and_print(
            f"failed to make directory {p} after {attempts} attempts, failing"
        )

    if prev_exception is not None:
        raise prev_exception
def r_partial_tally(
    progressbar_actor: Optional[ActorHandle],
    *ptallies: Optional[TALLY_TYPE],
) -> Optional[TALLY_TYPE]:  # pragma: no cover
    """
    Remote-callable (via Ray) front-end for `partial_tally`. Any exception raised
    by the underlying call is logged and turned into a `None` result.
    """
    try:
        return partial_tally(progressbar_actor, *ptallies)
    except Exception as err:
        log_and_print(f"Unexpected exception in r_partial_tally: {err}", True)

    return None
def r_decrypt(
    cec: CiphertextElectionContext,
    keypair: ElGamalKeyPair,
    di: DecryptInput,
) -> Optional[DecryptOutput]:  # pragma: no cover
    """
    Remotely decrypts one ElGamalCiphertext (plus its accompanying data -- see
    DecryptInput), yielding the plaintext together with a Chaum-Pedersen proof
    (see DecryptOutput). Any exception is logged and reported as `None`.
    """
    try:
        decryption = decrypt_ciphertext_with_proof(
            di.ciphertext,
            keypair,
            di.seed,
            cec.crypto_extended_base_hash,
        )
        plaintext, proof = decryption
        return DecryptOutput(di.object_id, plaintext, proof)
    except Exception as err:
        log_and_print(f"Unexpected exception in r_decrypt: {err}", True)

    return None
def r_verify_ballot_proofs(
    manifest: Manifest,
    public_key: ElementModP,
    hash_header: ElementModQ,
    progressbar_actor: Optional[ActorHandle],
    *cballot_filenames: str,
) -> Optional[TALLY_TYPE]:  # pragma: no cover
    """
    Loads each named ciphertext ballot from the manifest, verifies its
    Chaum-Pedersen proofs, and re-tallies the lot. The result is `None` if any
    ballot fails to load or verify (or anything raises); otherwise it is a
    partial tally of these ballots (of type `TALLY_TYPE`).
    """
    # Ciphertext ballots never travel through Ray's remote object system; only
    # their filenames do. Each ballot is loaded, verified, and folded into a
    # tally right here. That leans hard on the filesystem, but S3 buckets,
    # Azure blob storage, etc. can handle it.
    try:
        expected = len(cballot_filenames)
        verified = 0
        ballot_tallies: List[TALLY_TYPE] = []

        for filename in cballot_filenames:
            loaded = manifest.load_ciphertext_ballot(filename)
            if loaded is None:
                return None

            if loaded.is_valid_encryption(loaded.description_hash, public_key,
                                          hash_header):
                verified += 1

            if progressbar_actor is not None:
                progressbar_actor.update_completed.remote("Ballots", 1)

            ballot_tallies.append(ciphertext_ballot_to_dict(loaded))

        if verified < expected:
            # log_and_print(f"Only {valid_count} of {num_ballots} ballots are valid.")
            return None

        shard_tally = sequential_tally(ballot_tallies)

        if progressbar_actor is not None:
            progressbar_actor.update_completed.remote("Tallies", expected)

        return shard_tally
    except Exception as err:
        log_and_print(f"Unexpected exception in r_verify_ballot_proofs: {err}",
                      True)
        return None
def equivalent(self, other: "Manifest") -> bool:
    """
    Not exactly checking equality, but does check that the manifests are "equivalent",
    which means we're ignoring the root directories, but checking the rest.

    Two manifests are equivalent when their `bytes_written` and `hashes` compare
    equal. When the hashes differ, every offending key is logged, including keys
    that exist on only one side.

    :param other: the manifest to compare against
    :return: True if both the hashes and the byte counts match
    """
    same_bytes = self.bytes_written == other.bytes_written
    same_hashes = self.hashes == other.hashes

    if not same_hashes:
        # Diagnostics only. A key missing from `other` used to raise KeyError here
        # instead of letting us return False, so look things up defensively.
        missing = object()
        for k, v in self.hashes.items():
            other_v = other.hashes.get(k, missing)
            if other_v is missing:
                log_and_print(f"key {k} is missing from the other manifest")
            elif v != other_v:
                log_and_print(f"different values for key {k}")

        for k in other.hashes:
            if k not in self.hashes:
                log_and_print(f"key {k} is missing from this manifest")

    return same_hashes and same_bytes
def r_verify_tally_selection_proofs(
    public_key: ElementModP,
    hash_header: ElementModQ,
    *selections: SelectionInfo,
) -> bool:  # pragma: no cover
    """
    Checks the internal proof of each given tally selection and reports whether
    all of them are valid. Any exception is logged and reported as `False`.
    """
    try:
        # Every selection is checked, even after a failure has been seen,
        # before the overall verdict is returned.
        everything_valid = True
        for selection in selections:
            if not selection.is_valid_proof(public_key, hash_header):
                everything_valid = False
        return everything_valid
    except Exception as err:
        log_and_print(
            f"Unexpected exception in r_verify_tally_selection_proofs: {err}",
            True)
        return False
def test_end_to_end_publications_ray(self, input: str, check_proofs: bool,
                                     keypair: ElGamalKeyPair) -> None:
    """
    Round trip: tally the CVRs with Ray, write the results to disk, load them
    back, and check that what was read matches what was written.
    """
    # start from a clean slate, in case a prior run left anything behind
    self.removeTree()

    cvrs = read_dominion_csv(StringIO(input))
    self.assertIsNotNone(cvrs)

    _, ballots, _ = cvrs.to_election_description()
    assert len(ballots) > 0, "can't have zero ballots!"

    original = ray_tally_everything(
        cvrs,
        secret_key=keypair.secret_key,
        verbose=True,
        root_dir=TALLY_TESTING_DIR,
    )
    proofs_ok = original.all_proofs_valid()
    self.assertTrue(proofs_ok)

    # write everything out to disk ...
    write_ray_tally(original, TALLY_TESTING_DIR)
    log_and_print(
        "tally_testing written, proceeding to read it back in again")

    # ... and then load it right back in
    reloaded = load_ray_tally(
        TALLY_TESTING_DIR,
        check_proofs=check_proofs,
        verbose=True,
        recheck_ballots_and_tallies=True,
    )
    self.assertIsNotNone(reloaded)
    log_and_print("tally_testing got non-null result!")

    same_ballots = _list_eq(original.encrypted_ballots,
                            reloaded.encrypted_ballots)
    self.assertTrue(same_ballots)

    same_results = original.equivalent(reloaded, keypair)
    self.assertTrue(same_results)

    self.removeTree()  # leave nothing behind
def ray_decrypt_tally(
    tally: TALLY_TYPE,
    cec: ObjectRef,  # ObjectRef[CiphertextElectionContext]
    keypair: ObjectRef,  # ObjectRef[ElGamalKeyPair]
    proof_seed: ElementModQ,
) -> DECRYPT_TALLY_OUTPUT_TYPE:
    """
    Decrypts every entry of a tally, producing a dict from selection object_ids
    to tuples of the decrypted total and a Chaum-Pedersen proof that the total
    corresponds to the ciphertext. If any single decryption fails, the failure
    is logged and an empty dict is returned.

    :param tally: an election tally
    :param cec: a Ray ObjectRef containing a `CiphertextElectionContext`
    :param keypair: a Ray ObjectRef containing an `ElGamalKeyPair`
    :param proof_seed: an ElementModQ
    """
    selection_ids = tally.keys()
    seeds: List[ElementModQ] = Nonces(proof_seed)[0:len(selection_ids)]

    requests: List[DecryptInput] = []
    for seed, selection_id in zip(seeds, selection_ids):
        requests.append(DecryptInput(selection_id, seed, tally[selection_id]))

    # No laziness possible here: every answer has to be in hand before the
    # results can be rearranged into the dictionary that we return.
    pending = [r_decrypt.remote(cec, keypair, request) for request in requests]
    outputs: List[Optional[DecryptOutput]] = ray.get(pending)

    if None in outputs:
        log_and_print(
            "Unexpected failure from in ray_decrypt_tally, returning an empty dict",
            True,
        )
        return {}

    decrypted: DECRYPT_TALLY_OUTPUT_TYPE = {}
    for output in outputs:
        # mypy can't tell that the Nones are already ruled out, so check again
        if output is not None:
            decrypted[output.object_id] = (output.plaintext,
                                           output.decryption_proof)
    return decrypted
def all_proofs_valid(
    self,
    verbose: bool = False,
    recheck_ballots_and_tallies: bool = False,
    use_progressbar: bool = True,
) -> bool:
    """
    Checks all the proofs used in this tally, returns True if everything is good.
    Any errors found will be logged. Normally, this only checks the proofs associated
    with the totals. If you want to also recompute the tally (i.e., tabulate the
    encrypted ballots) and verify every individual ballot proof, then set
    `recheck_ballots_and_tallies` to True.

    :param verbose: passed to `log_and_print` for the progress and timing messages
      of the tally-proof phase
    :param recheck_ballots_and_tallies: if True, also re-verify every ballot named in
      `self.cvr_metadata["BallotId"]` and compare the recomputed tally against
      `self.tally`; this requires `self.manifest` to be present
    :param use_progressbar: if True, a `ProgressBar` is created for the ballot
      recheck phase
    :return: True only if every check that was requested succeeded
    """
    ray_wait_for_workers(min_workers=2)
    log_and_print("Verifying proofs.", verbose)

    # Put the shared inputs into Ray's object store once; the refs are then
    # handed to every remote call below.
    r_public_key = ray.put(self.context.elgamal_public_key)
    r_hash_header = ray.put(self.context.crypto_extended_base_hash)

    start = timer()
    selections = self.tally.map.values()
    sharded_selections: Sequence[
        Sequence[SelectionInfo]] = shard_list_uniform(selections, 2)

    # parallelizing this is overkill, but why not?
    results: List[bool] = ray.get([
        r_verify_tally_selection_proofs.remote(r_public_key, r_hash_header,
                                               *s) for s in sharded_selections
    ])
    end = timer()

    log_and_print(f"Verification time: {end - start: .3f} sec", verbose)
    log_and_print(
        f"Verification rate: {len(self.tally.map.keys()) / (end - start): .3f} selection/sec",
        verbose,
    )

    if False in results:
        return False

    if recheck_ballots_and_tallies:
        if self.manifest is None:
            log_and_print(
                "cannot recheck ballots and tallies without a manifest")
            return False

        # next, check each individual ballot's proofs; in this case, we're going to always
        # show the progress bar, even if verbose is false
        num_ballots = self.num_ballots
        r_manifest = ray.put(self.manifest)

        progressbar = (ProgressBar({
            "Ballots": num_ballots,
            "Tallies": num_ballots,
            "Iterations": 0,
            "Batch": 0,
        }) if use_progressbar else None)
        progressbar_actor = progressbar.actor if progressbar is not None else None

        ballot_start = timer()

        # Two levels of sharding: ballot ids are split into batches of BATCH_SIZE,
        # and each batch is split again into shards of BALLOTS_PER_SHARD, one
        # remote verification call per shard.
        batches: Sequence[Sequence[str]] = shard_list_uniform(
            self.cvr_metadata["BallotId"], BATCH_SIZE)

        # List[ObjectRef[Optional[TALLY_TYPE]]]
        recomputed_tallies: List[ObjectRef] = []

        for batch in batches:
            if progressbar_actor:
                progressbar_actor.update_completed.remote("Batch", 1)

            cballot_manifest_name_shards: Sequence[
                Sequence[str]] = shard_list_uniform(
                    batch, BALLOTS_PER_SHARD)

            # List[ObjectRef[Optional[TALLY_TYPE]]]
            ballot_results: List[ObjectRef] = [
                r_verify_ballot_proofs.remote(
                    r_manifest,
                    r_public_key,
                    r_hash_header,
                    progressbar_actor,
                    *shard,
                ) for shard in cballot_manifest_name_shards
            ]

            # There is deliberately no ray.wait() on ballot_results here: the
            # refs are handed straight to the tally reduction for this batch.
            ptally = ray_tally_ballots(ballot_results,
                                       PARTIAL_TALLIES_PER_SHARD, progressbar)
            recomputed_tallies.append(ptally)

        # Combine the per-batch tallies into one.
        # NOTE(review): if `batches` is empty, `recomputed_tallies[0]` raises
        # IndexError -- confirm callers never reach here with zero ballots.
        if len(recomputed_tallies) > 1:
            recomputed_tally = ray.get(
                ray_tally_ballots(recomputed_tallies,
                                  PARTIAL_TALLIES_PER_SHARD, progressbar))
        else:
            recomputed_tally = ray.get(recomputed_tallies[0])

        if progressbar:
            progressbar.close()

        # A None or empty recomputed tally is treated as a failure.
        if not recomputed_tally:
            return False

        ballot_end = timer()

        log_and_print(
            f"Ballot verification rate: {num_ballots / (ballot_end - ballot_start): .3f} ballot/sec",
            True,
        )

        tally_success = tallies_match(self.tally.to_tally_map(),
                                      recomputed_tally)

        if not tally_success:
            return False

    return True
def ray_reduce_with_rounds(
    inputs: Iterable[ObjectRef],
    shard_size: int,
    reducer_first_arg: Any,
    reducer: Callable,  # Callable[[Any, VarArg(ObjectRef)], ObjectRef]
    progressbar: Optional[ProgressBar] = None,
    progressbar_key: Optional[str] = None,
    verbose: bool = False,
) -> ObjectRef:
    """
    Reduces a collection of Ray object references down to a single reference by
    repeatedly applying a Ray remote reducer, one "round" at a time.

    Unlike `ray_reduce_with_ray_wait`, this version builds a reduction tree.
    It depends on an associative property for the reducer, but not a commutative
    property.

    In each round the current inputs are split with `shard_list_uniform` using
    `shard_size`, and the reducer is called once per shard. As soon as the number
    of inputs is no larger than `shard_size`, a final call to the reducer produces
    the result.

    The `reducer` is a Ray remote method reference that takes a given first argument
    of whatever type and then a varargs sequence of objectrefs, and returns an
    objectref. So, if you had code that looked like:

    ```
    @ray.remote
    def my_reducer(config: Config, *inputs: MyDataType) -> MyDataType:
        ...
    ```

    And let's say you're mapping some remote function to generate those values and
    later want to reduce them. That code might look like this:

    ```
    @ray.remote
    def my_mapper(input: SomethingElse) -> MyDataType:
        ...

    def run_everything(config: Config, inputs: Iterable[SomethingElse]) -> MyDataType:
        map_refs = [my_mapper.remote(i) for i in inputs]
        return ray_reduce_with_rounds(map_refs, 10, config, my_reducer.remote)
    ```

    If your `reducer_first_arg` is some large object that you don't want serialized
    over and over, call `ray.put` on it first and pass the resulting reference.

    Optional feature: integration with the progressbar in `ray_progress`. Pass in the
    ProgressBar along with the `key` string to use. At the start of every round, the
    "Iterations" counter is bumped and the total for `progressbar_key` is increased
    by the number of inputs in that round. Work *completion* is not reported here;
    that is the remote reducer's job, since it can report per element while this
    dispatcher could only ever see whole shards finishing.
    """
    # TODO: generalize this code so the `reducer_first_arg` is wrapped up in the reducer.
    #   This seems like a job for `kwargs`. Deal with that after everything else works.

    assert (progressbar_key and progressbar
            ) or not progressbar, "progress bar requires a key string"
    assert shard_size > 1, "shard_size must be greater than one"

    actor = None if progressbar is None else progressbar.actor

    def announce_round(count: int) -> None:
        # tells the progress bar (if any) that another round with `count` inputs has begun
        if actor is not None:
            actor.update_completed.remote("Iterations", 1)
            actor.update_total.remote(progressbar_key, count)

    pending = list(inputs)
    round_number = 1
    remaining = len(pending)
    announce_round(remaining)

    while remaining > shard_size:
        # Sequence[Sequence[ObjectRef[Optional[TALLY_TYPE]]]]
        shards = shard_list_uniform(pending, shard_size)
        log_and_print(
            f"Reduction {round_number:2d}: {remaining:6d} partial results --> {len(shards)} shards (bps = {shard_size})",
            verbose=verbose,
        )

        # We don't ray.wait() for the round to complete before starting the next
        # one. Waiting would avoid deeply nested tasks (and the warnings about too
        # many tasks that can come with them), but everything works without it.
        pending = [reducer(reducer_first_arg, *shard) for shard in shards]

        round_number += 1
        remaining = len(pending)
        announce_round(remaining)

    log_and_print(f"Reduction (FINAL): {remaining} partial results",
                  verbose=verbose)
    final_ref: Optional[ObjectRef] = reducer(reducer_first_arg, *pending)

    if progressbar:
        progressbar.print_until_done()

    assert final_ref is not None, "while loop shouldn't have broken without setting result"
    return final_ref
def all_proofs_valid(
    self,
    pool: Optional[Pool] = None,
    verbose: bool = True,
    recheck_ballots_and_tallies: bool = False,
) -> bool:
    """
    Verifies the proofs in this tally and returns True only if all of them hold.
    By default, only the proofs attached to the totals are checked. Setting
    `recheck_ballots_and_tallies` to True additionally confirms that all files
    are present, verifies every individual ballot proof, and recomputes the
    tally from the encrypted ballots to compare against the reported one.
    When a `pool` is given, the verification work is mapped across it.
    """
    check_selection = functools.partial(
        verify_tally_selection_proof,
        self.context.elgamal_public_key,
        self.context.crypto_extended_base_hash,
    )

    start = timer()
    selections = self.tally.map.values()
    if verbose:  # pragma: no cover
        selections = tqdm(list(selections), "Tally proof")

    selection_verdicts: List[bool]
    if pool is None:
        selection_verdicts = [check_selection(s) for s in selections]
    else:
        selection_verdicts = pool.map(func=check_selection,
                                      iterable=selections)
    end = timer()

    log_and_print(f"Verification time: {end - start: .3f} sec", verbose)
    log_and_print(
        f"Verification rate: {len(self.tally.map.keys()) / (end - start): .3f} selection/sec",
        verbose,
    )

    if False in selection_verdicts:
        return False

    if not recheck_ballots_and_tallies:
        return True

    # first, try to load all the ballots and make sure there are no hash errors
    if not self.all_files_present():
        return False

    # next, check each individual ballot's proofs; the progress bar is shown
    # here regardless of the verbose setting
    ballots_with_progress = tqdm(self.encrypted_ballots, desc="Ballot proofs")
    check_ballot = functools.partial(verify_ballot_proof, self.context)

    ballot_start = timer()
    ballot_verdicts: List[bool]
    if pool is None:
        ballot_verdicts = [check_ballot(b) for b in ballots_with_progress]
    else:
        ballot_verdicts = pool.map(func=check_ballot,
                                   iterable=ballots_with_progress)
    ballot_end = timer()

    log_and_print(
        f"Ballot verification rate: {len(self.encrypted_ballots) / (ballot_end - ballot_start): .3f} ballot/sec",
        verbose,
    )

    if False in ballot_verdicts:
        return False

    log_and_print("Recomputing tallies:", verbose)
    recomputed = fast_tally_ballots(self.encrypted_ballots, pool)
    reported = self.tally.to_tally_map()

    return bool(tallies_match(reported, recomputed))
def ray_tally_everything(
    cvrs: DominionCSV,
    verbose: bool = True,
    use_progressbar: bool = True,
    date: Optional[datetime] = None,
    seed_hash: Optional[ElementModQ] = None,
    master_nonce: Optional[ElementModQ] = None,
    secret_key: Optional[ElementModQ] = None,
    root_dir: Optional[str] = None,
) -> "RayTallyEverythingResults":
    """
    This top-level function takes a collection of Dominion CVRs and produces everything that
    we might want for arlo-e2e: a list of encrypted ballots, their encrypted and decrypted tally,
    and proofs of the correctness of the whole thing. The election `secret_key` is an optional
    parameter. If absent, a random keypair is generated and used. Similarly, if a `seed_hash` or
    `master_nonce` is not provided, random ones are generated and used.

    For parallelism, Ray is used. Make sure you've called `ray.init()` or `ray_localhost_init()`
    before calling this.

    If `root_dir` is specified, then the tally is written out to the specified directory, and
    the resulting `RayTallyEverythingResults` object will support the methods that allow those
    ballots to be read back in again. Conversely, if `root_dir` is `None`, then nothing is
    written to disk, and the result will not have access to individual ballots.

    :param cvrs: the parsed Dominion CVR data
    :param verbose: passed to `log_and_print` for the final throughput message only; the
      other log messages in this function are emitted with `log_and_print`'s default
    :param use_progressbar: if True, a `ProgressBar` is created and updated while encrypting
    :param date: passed to `to_election_description_ray`; defaults to `datetime.now()`
    :param seed_hash: seed for the encryption; also passed to `ray_decrypt_tally` as the
      seed for the decryption proofs
    :param master_nonce: seed for the `Nonces` sequence handed to the encryption workers
    :param secret_key: the election secret key, if a specific one is desired
    :param root_dir: where to write the results, or `None` to write nothing
    :return: a `RayTallyEverythingResults` with the tally, its proofs, and (if `root_dir`
      was given) the manifest of everything that was written

    Note that the internal consistency checks here are `assert` statements.
    """
    rows, cols = cvrs.data.shape

    ray_wait_for_workers(min_workers=2)

    if date is None:
        date = datetime.now()

    # The manifest aggregator actor only exists when we're writing to disk.
    if root_dir is not None:
        mkdir_helper(root_dir, num_retries=NUM_WRITE_RETRIES)
        r_manifest_aggregator = ManifestAggregatorActor.remote(
            root_dir)  # type: ignore
    else:
        r_manifest_aggregator = None

    r_root_dir = ray.put(root_dir)

    start_time = timer()

    # Performance note: by using to_election_description_ray rather than to_election_description, we're
    # only getting back a list of dictionaries rather than a list of PlaintextBallots. We're pushing that
    # work out into the nodes, where it will run in parallel. The BallotPlaintextFactory wraps up all
    # the (immutable) state necessary to convert from these dicts to PlaintextBallots and is meant to
    # be sent to every node in the cluster.
    ed, bpf, ballot_dicts, id_map = cvrs.to_election_description_ray(date=date)
    setup_time = timer()
    num_ballots = len(ballot_dicts)
    assert num_ballots > 0, "can't have zero ballots!"
    log_and_print(
        f"ElectionGuard setup time: {setup_time - start_time: .3f} sec, {num_ballots / (setup_time - start_time):.3f} ballots/sec"
    )

    keypair = (elgamal_keypair_random() if secret_key is None else
               elgamal_keypair_from_secret(secret_key))
    assert keypair is not None, "unexpected failure with keypair computation"
    secret_key, public_key = keypair

    cec = make_ciphertext_election_context(
        number_of_guardians=1,
        quorum=1,
        elgamal_public_key=public_key,
        description_hash=ed.crypto_hash(),
    )

    # Everything the remote workers need is put into Ray's object store once,
    # and only the refs are passed along with each remote call.
    r_cec = ray.put(cec)

    ied = InternalElectionDescription(ed)
    r_ied = ray.put(ied)

    if seed_hash is None:
        seed_hash = rand_q()
    r_seed_hash = ray.put(seed_hash)
    r_keypair = ray.put(keypair)
    r_ballot_plaintext_factory = ray.put(bpf)

    if master_nonce is None:
        master_nonce = rand_q()

    nonces = Nonces(master_nonce)
    r_nonces = ray.put(nonces)
    nonce_indices = range(num_ballots)

    # Each ballot dict is paired with its index, so the pairing survives sharding.
    # presumably the index selects that ballot's nonce from `nonces` -- confirm in
    # r_encrypt_and_write
    inputs = list(zip(ballot_dicts, nonce_indices))

    batches = shard_list_uniform(inputs, BATCH_SIZE)
    num_batches = len(batches)
    log_and_print(
        f"Launching Ray.io remote encryption! (number of batches: {num_batches})"
    )

    # Restart the clock: the throughput reported at the end excludes the setup above.
    start_time = timer()

    progressbar = (ProgressBar({
        "Ballots": num_ballots,
        "Tallies": num_ballots,
        "Iterations": 0,
        "Batch": 0,
    }) if use_progressbar else None)
    progressbar_actor = progressbar.actor if progressbar is not None else None

    batch_tallies: List[ObjectRef] = []
    for batch in batches:
        if progressbar_actor:
            progressbar_actor.update_completed.remote("Batch", 1)

        # NOTE(review): num_ballots_in_batch and num_shards are never read.
        num_ballots_in_batch = len(batch)
        sharded_inputs = shard_list_uniform(batch, BALLOTS_PER_SHARD)
        num_shards = len(sharded_inputs)

        # One remote call per shard; each returns a ref that is fed into the tally
        # reduction below.
        # presumably right_tuple_list / left_tuple_list separate the (ballot dict,
        # index) pairs back into their two halves -- confirm against their definitions
        partial_tally_refs = [
            r_encrypt_and_write.remote(
                r_ied,
                r_cec,
                r_seed_hash,
                r_root_dir,
                r_manifest_aggregator,
                progressbar_actor,
                r_ballot_plaintext_factory,
                r_nonces,
                right_tuple_list(shard),
                *(left_tuple_list(shard)),
            ) for shard in sharded_inputs
        ]

        btally = ray_tally_ballots(partial_tally_refs, BALLOTS_PER_SHARD,
                                   progressbar)
        batch_tallies.append(btally)

    # Each batch ultimately yields one partial tally; we add these up here at the
    # very end. If we have a million ballots and have batches of 10k ballots, this
    # would mean we'd have only 100 partial tallies. So, what's here works just fine.
    # If we wanted, we could certainly burn some scalar time and keep a running,
    # singular, partial tally. It's probably more important to push onward to the
    # next batch, so we can do as much work in parallel as possible.
    # NOTE(review): the shard size here is a literal 10, where other callers in
    # this file pass a named constant -- confirm that's intentional.
    if len(batch_tallies) > 1:
        tally = ray.get(ray_tally_ballots(batch_tallies, 10, progressbar))
    else:
        tally = ray.get(batch_tallies[0])

    if progressbar:
        progressbar.close()

    assert tally is not None, "tally failed!"

    log_and_print("Tally decryption.")
    decrypted_tally: DECRYPT_TALLY_OUTPUT_TYPE = ray_decrypt_tally(
        tally, r_cec, r_keypair, seed_hash)

    log_and_print("Validating tally.")

    # Sanity-checking logic: make sure we don't have any unexpected keys, and that the decrypted totals
    # match up with the columns in the original plaintext data.
    tally_keys = set(decrypted_tally.keys())
    expected_keys = set(id_map.keys())

    assert tally_keys.issubset(
        expected_keys
    ), f"bad tally keys (actual keys: {sorted(tally_keys)}, expected keys: {sorted(expected_keys)})"

    for obj_id in decrypted_tally.keys():
        cvr_sum = int(cvrs.data[id_map[obj_id]].sum())
        decryption, proof = decrypted_tally[obj_id]
        assert cvr_sum == decryption, f"decryption failed for {obj_id}"

    final_manifest: Optional[Manifest] = None

    if root_dir is not None:
        final_manifest = ray.get(r_manifest_aggregator.result.remote())
        assert isinstance(
            final_manifest,
            Manifest), "type error: bad result from manifest aggregation"

    # Assemble the data structure that we're returning. Having nonces in the ciphertext makes these
    # structures sensitive for writing out to disk, but otherwise they're ready to go.
    log_and_print("Constructing results.")

    # NOTE(review): this indexes decrypted_tally by every key of `tally`. Elsewhere
    # in this file, ray_decrypt_tally returns {} on failure, which passes the checks
    # above vacuously and would then surface here as a KeyError.
    reported_tally: Dict[str, SelectionInfo] = {
        k: SelectionInfo(
            object_id=k,
            encrypted_tally=tally[k],
            # we need to forcibly convert mpz to int here to make serialization work properly
            decrypted_tally=int(decrypted_tally[k][0]),
            proof=decrypted_tally[k][1],
        )
        for k in tally.keys()
    }

    tabulate_time = timer()

    log_and_print(
        f"Encryption and tabulation: {rows} ballots, {rows / (tabulate_time - start_time): .3f} ballot/sec",
        verbose,
    )

    return RayTallyEverythingResults(
        metadata=cvrs.metadata,
        cvr_metadata=cvrs.dataframe_without_selections(),
        election_description=ed,
        num_ballots=rows,
        manifest=final_manifest,
        tally=SelectionTally(reported_tally),
        context=cec,
    )
def fast_tally_everything(
    cvrs: DominionCSV,
    pool: Optional[Pool] = None,
    verbose: bool = True,
    date: Optional[datetime] = None,
    seed_hash: Optional[ElementModQ] = None,
    master_nonce: Optional[ElementModQ] = None,
    secret_key: Optional[ElementModQ] = None,
    use_progressbar: bool = True,
) -> FastTallyEverythingResults:
    """
    This top-level function takes a collection of Dominion CVRs and produces everything that
    we might want for arlo-e2e: a list of encrypted ballots, their encrypted and decrypted tally,
    and proofs of the correctness of the whole thing. The election `secret_key` is an optional
    parameter. If absent, a random keypair is generated and used. Similarly, if a `seed_hash` or
    `master_nonce` is not provided, random ones are generated and used.

    For parallelism, a `multiprocessing.pool.Pool` may be provided, and should result in significant
    speedups on multicore computers. If absent, the computation will proceed sequentially.

    :param cvrs: the parsed Dominion CVR data
    :param pool: optional pool, passed to the encryption, tallying, and decryption steps
    :param verbose: passed to `log_and_print` for the timing and throughput messages
    :param date: passed to `to_election_description`; defaults to `datetime.now()`
    :param seed_hash: seed for the encryption; also passed to `fast_decrypt_tally`
    :param master_nonce: seed for the `Nonces` sequence, one nonce per ballot
    :param secret_key: the election secret key, if a specific one is desired
    :param use_progressbar: passed to `fast_encrypt_ballots`
    :return: a `FastTallyEverythingResults` with the ballots, the tally, and its proofs

    Note that the internal consistency checks here are `assert` statements.
    """
    rows, cols = cvrs.data.shape

    if date is None:
        date = datetime.now()

    # Timing checkpoints are taken after each phase; every message below reports
    # the difference between two of them.
    parse_time = timer()
    log_and_print(f"Rows: {rows}, cols: {cols}", verbose)

    ed, ballots, id_map = cvrs.to_election_description(date=date)
    assert len(ballots) > 0, "can't have zero ballots!"

    keypair = (elgamal_keypair_random() if secret_key is None else
               elgamal_keypair_from_secret(secret_key))
    assert keypair is not None, "unexpected failure with keypair computation"
    secret_key, public_key = keypair

    # This computation exists only to cause side-effects in the DLog engine, so the lame nonce is not an issue.
    # NOTE(review): because the priming happens inside an `assert`, it is skipped
    # entirely when Python runs with -O.
    assert len(ballots) == get_optional(
        elgamal_encrypt(m=len(ballots),
                        nonce=int_to_q_unchecked(3),
                        public_key=public_key)).decrypt(
                            secret_key), "got wrong ElGamal decryption!"

    dlog_prime_time = timer()
    log_and_print(
        f"DLog prime time (n={len(ballots)}): {dlog_prime_time - parse_time: .3f} sec",
        verbose,
    )

    cec = make_ciphertext_election_context(
        number_of_guardians=1,
        quorum=1,
        elgamal_public_key=public_key,
        description_hash=ed.crypto_hash(),
    )

    ied = InternalElectionDescription(ed)

    # REVIEW THIS: is this cryptographically sound? Is the seed_hash properly a secret? Should
    #   it go in the output? The nonces are clearly secret. If you know them, you can decrypt.
    if seed_hash is None:
        seed_hash = rand_q()
    if master_nonce is None:
        master_nonce = rand_q()
    nonces: List[ElementModQ] = Nonces(master_nonce)[0:len(ballots)]

    # even if verbose is false, we still want to see the progress bar for the encryption
    cballots = fast_encrypt_ballots(ballots,
                                    ied,
                                    cec,
                                    seed_hash,
                                    nonces,
                                    pool,
                                    use_progressbar=use_progressbar)
    eg_encrypt_time = timer()

    log_and_print(
        f"Encryption time: {eg_encrypt_time - dlog_prime_time: .3f} sec",
        verbose)
    log_and_print(
        f"Encryption rate: {rows / (eg_encrypt_time - dlog_prime_time): .3f} ballot/sec",
        verbose,
    )

    tally: TALLY_TYPE = fast_tally_ballots(cballots, pool)
    eg_tabulate_time = timer()

    log_and_print(
        f"Tabulation time: {eg_tabulate_time - eg_encrypt_time: .3f} sec",
        verbose)
    log_and_print(
        f"Tabulation rate: {rows / (eg_tabulate_time - eg_encrypt_time): .3f} ballot/sec",
        verbose,
    )
    log_and_print(
        f"Encryption and tabulation: {rows} ballots / {eg_tabulate_time - dlog_prime_time: .3f} sec = {rows / (eg_tabulate_time - dlog_prime_time): .3f} ballot/sec",
        verbose,
    )

    assert tally is not None, "tally failed!"

    if verbose:  # pragma: no cover
        print("Decryption & Proofs: ")
    decrypted_tally: DECRYPT_TALLY_OUTPUT_TYPE = fast_decrypt_tally(
        tally, cec, keypair, seed_hash, pool, verbose)
    eg_decryption_time = timer()
    log_and_print(
        f"Decryption time: {eg_decryption_time - eg_tabulate_time: .3f} sec",
        verbose)
    log_and_print(
        f"Decryption rate: {len(decrypted_tally.keys()) / (eg_decryption_time - eg_tabulate_time): .3f} selection/sec",
        verbose,
    )

    # Sanity-checking logic: make sure we don't have any unexpected keys, and that the decrypted totals
    # match up with the columns in the original plaintext data.
    for obj_id in decrypted_tally.keys():
        assert obj_id in id_map, "object_id in results that we don't know about!"
        cvr_sum = int(cvrs.data[id_map[obj_id]].sum())
        decryption, proof = decrypted_tally[obj_id]
        assert cvr_sum == decryption, f"decryption failed for {obj_id}"

    # Assemble the data structure that we're returning. Having nonces in the ciphertext makes these
    # structures sensitive for writing out to disk, but otherwise they're ready to go.
    # NOTE(review): this indexes decrypted_tally by every key of `tally`, so any
    # selection missing from decrypted_tally surfaces here as a KeyError.
    reported_tally: Dict[str, SelectionInfo] = {
        k: SelectionInfo(
            object_id=k,
            encrypted_tally=tally[k],
            # we need to forcibly convert mpz to int here to make serialization work properly
            decrypted_tally=int(decrypted_tally[k][0]),
            proof=decrypted_tally[k][1],
        )
        for k in tally.keys()
    }

    # strips the ballots of their nonces, which is important because those could allow for decryption
    accepted_ballots = [ciphertext_ballot_to_accepted(x) for x in cballots]

    return FastTallyEverythingResults(
        metadata=cvrs.metadata,
        cvr_metadata=cvrs.dataframe_without_selections(),
        election_description=ed,
        encrypted_ballot_memos={
            ballot.object_id: make_memo_value(ballot)
            for ballot in accepted_ballots
        },
        tally=SelectionTally(reported_tally),
        context=cec,
    )
def read_dominion_csv(file: Union[str, StringIO]) -> Optional[DominionCSV]:
    """
    Given a filename of a Dominion CSV (or a StringIO buffer with the same data), tries to read it.
    If successful, you get back a `DominionCSV` holding the election metadata, the Pandas dataframe
    (with flattened column names plus added "Guid" and "BallotId" columns), and the list of
    per-ballot metadata column names.

    The contest map inside the metadata is a dictionary. The keys are the titles of the contests,
    and the values are sets of `SelectionMetadata`, one per choice in that contest. Each selection's
    `to_string()` is used to look up that choice's column in the dataframe.

    Returns `None` if the file is missing, empty, or unparseable, if any two rows have the same
    "Guid", or if there is no "BallotType" column.
    """
    try:
        df = pd.read_csv(
            file,
            header=[0, 1, 2, 3],
            quoting=csv.QUOTE_MINIMAL,
            sep=",",
            engine="python",
        )
    except (FileNotFoundError, pd.errors.ParserError, pd.errors.EmptyDataError):
        return None

    # TODO: At this point, we know the file is a valid CSV and we're *assuming* it's a valid Dominion file.
    #   We shouldn't make that assumption, but checking for it would be really tricky.

    # Each column label is a 4-level tuple; drop the placeholder levels that Pandas
    # invents for blank header cells.
    filtered_columns = [
        [
            fix_strings(e)
            for e in c
            if not e.startswith("Unnamed:") and e != '""'
        ]
        for c in df.columns
    ]
    election_name = filtered_columns[0][0]

    # The first two columns have the election name and a version number in them, so we have to treat those specially,
    # otherwise, we're looking for columns with only one thing in them, which says that they're not a contest (with
    # choices) but instead they're one of the metadata columns.
    ballot_metadata_fields: List[str] = (
        filtered_columns[0][1:]
        + filtered_columns[1][1:]
        + [x[0] for x in filtered_columns[2:] if len(x) == 1]
    )

    # NOTE(review): recent pandas releases deprecate `applymap` in favor of `DataFrame.map`;
    # left as-is here so older pandas versions keep working.
    df = df.applymap(fix_strings)

    column_names = [
        filtered_columns[0][1:],
        filtered_columns[1][1:],
    ] + filtered_columns[2:]

    max_votes_for_map: Dict[str, int] = {}
    vote_for_n_pattern = re.compile(r"\s*\(Vote For=(\d+)\)$")
    new_column_names: List[str] = []
    contests: List[List[str]] = []
    selection_uid_iter = UidMaker("s")

    # We might have a case where we have two candidates in the same contest which
    # are identical. This particularly occurs when we have "pick k of n" contests
    # where we might have k write-in slots. Our solution, since we want candidate
    # names within a contest to be unique, so we can use them as dictionary keys,
    # elsewhere in our code, is to append "(2)", "(3)", etc.

    # Since write-ins or other repeats (e.g., "FOR" / "AGAINST" in referenda) might
    # happen in multiple races, we only need the names to be unique within a specific
    # contest. Thus, we've got the dictionary below to track everything.

    # Meanwhile, we're also learning the "k" in "k of n" by parsing it straight
    # out of the contest name.

    # Yes, this code is complex, but then so is the file format we're parsing.

    seen_candidate: Dict[Tuple[str, str], int] = {}

    for column in column_names:
        vote_for_n = 1  # until proven otherwise
        column = [str(x) for x in column]  # force everything to be a string
        title = column[0]
        vote_for_n_match: Optional[re.Match] = vote_for_n_pattern.search(title)

        if vote_for_n_match is not None:
            vote_for_n = int(vote_for_n_match.group(1))
            # chop off the "(Vote For=N)" part
            title = title[0:vote_for_n_match.span()[0]]

        max_votes_for_map[title] = vote_for_n
        new_column = [title] + column[1:]

        # Only columns with a second entry (the choice name) are contests;
        # single-entry columns are ballot metadata.
        if len(column) > 1:
            candidate = column[1]
            seen_key = (title, candidate)
            if seen_key in seen_candidate:
                seen_candidate[seen_key] += 1
                new_column = (
                    [title]
                    + [f"{candidate} ({seen_candidate[seen_key]})"]
                    + column[2:]
                )
            else:
                seen_candidate[seen_key] = 1
            contests.append(new_column)

        new_column_names.append(" | ".join(new_column))

    # Now we're going to extract a mapping from contest titles to all the choices.
    all_parties: Set[str] = set()
    contest_map_builder: Dict[str, List[SelectionMetadata]] = {}
    contest_titles: List[str] = []

    for contest in contests:
        title = contest[0]
        candidate = contest[1]
        party = fix_party_string(contest[2]) if len(contest) > 2 else ""

        if party != "":
            all_parties.add(party)

        if title not in contest_map_builder:
            contest_map_builder[title] = []
            contest_titles.append(title)  # remembers first-seen order of the contests

        # goes from ["Representative - District 1"] to a list of SelectionMetadata objects
        uid_int, uid_str = selection_uid_iter.next_int()
        contest_map_builder[title].append(
            SelectionMetadata(
                object_id=uid_str,
                sequence_number=uid_int,
                contest_name=title,
                choice_name=candidate,
                party_name=party,
            )
        )

    df.columns = new_column_names
    df["Guid"] = df.apply(
        lambda r: dominion_row_to_uid(r, election_name, ballot_metadata_fields),
        axis=1,
    )

    # If there are any duplicated metadata rows, something is really wrong with the data.
    # The "Guid" concatenates all the metadata, so it's perfect for this sanity check.
    num_duplicates = df["Guid"].duplicated().sum()
    if num_duplicates > 0:
        log_and_print(f"Error: {num_duplicates} duplicated metadata rows found")
        return None

    # This check has to happen *before* we first touch the column below; otherwise a
    # file without ballot types raises a KeyError instead of yielding None.
    if "BallotType" not in df:
        log_and_print("Error: no BallotType column found")
        return None

    # If the election official put numbers in as their ballot types, that's going to cause type
    # errors, because we really want to deal with them as strings.
    df["BallotType"] = df["BallotType"].apply(str)

    # there's probably an easier way to do this, but it does what we want
    ballot_uid_iter = UidMaker("b")
    df["BallotId"] = df.apply(
        lambda r: ballot_uid_iter.next(),
        axis=1,
    )

    ballotstyle_uids = UidMaker("ballotstyle")
    all_ballot_types = sorted(set(df["BallotType"]))
    ballot_type_to_bsid = {bt: ballotstyle_uids.next() for bt in all_ballot_types}

    contest_map: CONTEST_MAP = {
        title: set(selections) for title, selections in contest_map_builder.items()
    }

    style_map: STYLE_MAP = {}

    # extract a list of dictionaries that have two keys: BallotType and BallotId
    ballot_id_and_types: List[Dict[str, str]] = df[
        ["BallotType", "BallotId"]
    ].to_dict(orient="records")

    # boil this down to a dictionary from BallotId to BallotType
    ballot_id_to_ballot_type: Dict[str, str] = {
        elem["BallotId"]: elem["BallotType"] for elem in ballot_id_and_types
    }

    # We're computing a set-union of all the non-empty contest fields we find, in any ballot
    # sharing a given BallotType setting, i.e., we're inferring which contests are actually
    # a part of each BallotType.

    # Potential degenerate result: in a race with very few ballots cast, it's conceivable that
    # every single ballot will undervote an entire contest. In this specific circumstance,
    # the style map will be "wrong", which would mean that that entire contest would be
    # completely missing from subsequent e2e crypto results. Hopefully, actual Dominion CVRs
    # will have zeros rather than blank cells to represent these undervotes, and then this case
    # will never occur. Otherwise, it's unclear how we'd ever be able to distinguish between
    # a contest that's completely undervoted versus a contest that's not part of a ballot style.

    # For each ballot style:
    # - fetch all the rows of a given ballot type (e.g., b24 = df[df['BallotType'] == "Ballot 24 - Type 24"])
    # - convert to true/false based on whether we have "not a number" and "add" (e.g., totals = b24.notna().sum())
    # - add up the totals for each choice
    # - if that contest_total is non-zero, then it's a contest that's included in the race, otherwise not

    # mapping from the name of a contest to a list of all the columns names that have selections for that contest
    contest_choices: Dict[str, List[str]] = {
        contest: [selection.to_string() for selection in selections]
        for contest, selections in contest_map.items()
    }

    for bt in all_ballot_types:
        # Expanded math, useful for debugging:
        # ballots_of_bt = df[df["BallotType"] == bt]
        # column_trues = ballots_of_bt.notna()
        # column_true_sums = column_trues.sum()

        # All in one line, might run faster with Modin's query optimizer?
        column_true_sums = df[df["BallotType"] == bt].notna().sum()

        sums_per_contest = {
            contest: column_true_sums[choices].sum()
            for contest, choices in contest_choices.items()
        }
        style_map[bt] = {
            contest for contest in contest_choices if sums_per_contest[contest] > 0
        }

    return DominionCSV(
        ElectionMetadata(
            fix_strings(election_name),
            ballot_type_to_bsid,
            ballot_id_to_ballot_type,
            all_parties,
            style_map,
            contest_map,
            max_votes_for_map,
            contest_titles,
        ),
        df,
        ballot_metadata_fields + ["Guid", "BallotId"],
    )
def r_encrypt_and_write(
    ied: InternalElectionDescription,
    cec: CiphertextElectionContext,
    seed_hash: ElementModQ,
    root_dir: Optional[str],
    manifest_aggregator: Optional[ActorHandle],
    progressbar_actor: Optional[ActorHandle],
    bpf: BallotPlaintextFactory,
    nonces: Nonces,
    nonce_indices: List[int],
    *plaintext_ballot_dicts: Dict[str, Any],
) -> Optional[TALLY_TYPE]:  # pragma: no cover
    """
    Remotely encrypts each of the given plaintext ballot rows, using the nonce found at the
    matching entry of `nonce_indices`, and folds the results into a single partial tally.

    If a `root_dir` is specified, every encrypted ballot is written to a fresh manifest
    rooted there, and that manifest is handed to the `manifest_aggregator` actor (when one
    is given) once all the ballots are done. With no `root_dir`, there's no disk activity.
    If a `progressbar_actor` is given, its "Ballots" and "Tallies" counters are each bumped
    once per ballot.

    Returns the partial tally of the encrypted ballots, or `None` if anything at all raised
    an exception along the way (the exception is logged rather than propagated).
    """
    try:
        manifest = None if root_dir is None else make_fresh_manifest(root_dir)

        ballot_count = len(plaintext_ballot_dicts)
        assert (
            len(nonce_indices) == ballot_count
        ), "mismatching numbers of nonces and ballots!"
        assert ballot_count > 0, "need at least one ballot"

        running_tally: Optional[TALLY_TYPE] = None

        # the lengths were just checked to match, so zip pairs each row with its nonce index
        for row, nonce_index in zip(plaintext_ballot_dicts, nonce_indices):
            plaintext = bpf.row_to_plaintext_ballot(row)
            encrypted = encrypt_ballot(
                plaintext,
                ied,
                cec,
                seed_hash,
                nonces[nonce_index],
                should_verify_proofs=False,
            )
            accepted = ciphertext_ballot_to_accepted(get_optional(encrypted))

            if manifest is not None:
                manifest.write_ciphertext_ballot(
                    accepted, num_retries=NUM_WRITE_RETRIES
                )

            if progressbar_actor is not None:
                progressbar_actor.update_completed.remote("Ballots", 1)

            ballot_tally = ciphertext_ballot_to_dict(accepted)

            # NOTE(review): this is a truthiness test, so an empty-dict running tally is
            # treated the same as "no tally yet" and simply replaced -- confirm intended.
            if running_tally:
                running_tally = sequential_tally([running_tally, ballot_tally])
            else:
                running_tally = ballot_tally

            if progressbar_actor is not None:
                progressbar_actor.update_completed.remote("Tallies", 1)

        if manifest is not None and manifest_aggregator is not None:
            manifest_aggregator.add.remote(manifest)

        return running_tally
    except Exception as e:
        log_and_print(f"Unexpected exception in r_encrypt_and_write: {e}", True)
        return None
def test_end_to_end_publications(
    self, input: str, check_proofs: bool, keypair: ElGamalKeyPair
) -> None:
    """
    Round-trip test: tallies the given Dominion CSV text, writes the results to disk,
    loads them back, and checks that the reloaded results match the originals. Then
    decrypts the ballots and exercises the proven-ballot write / exists / load helpers
    on the first one.
    """
    # necessary for coverage testing to work in parallel
    coverage.process_startup()

    # if there's anything leftover from a prior run, get rid of it
    self.removeTree()

    cvrs = read_dominion_csv(StringIO(input))
    self.assertIsNotNone(cvrs)

    _, ballots, _ = cvrs.to_election_description()
    assert len(ballots) > 0, "can't have zero ballots!"

    results = fast_tally_everything(
        cvrs,
        self.pool,
        secret_key=keypair.secret_key,
        verbose=True,
    )
    self.assertTrue(results.all_proofs_valid(self.pool))

    # dump files out to disk
    write_fast_tally(results, TALLY_TESTING_DIR)
    log_and_print("tally_testing written, proceeding to read it back in again")

    # now, read it back again!
    reloaded = load_fast_tally(
        TALLY_TESTING_DIR,
        check_proofs=check_proofs,
        pool=self.pool,
        verbose=True,
        recheck_ballots_and_tallies=True,
    )
    self.assertIsNotNone(reloaded)
    log_and_print("tally_testing got non-null result!")

    ballots_match = _list_eq(results.encrypted_ballots, reloaded.encrypted_ballots)
    self.assertTrue(ballots_match)
    self.assertTrue(results.equivalent(reloaded, keypair, self.pool))

    # Make sure there's an index.html file; throws an exception if it's missing
    index_path = path.join(TALLY_TESTING_DIR, "index.html")
    self.assertIsNotNone(stat(index_path))

    # And lastly, while we're here, we'll use all this machinery to exercise the ballot decryption
    # read/write facilities.
    ied = InternalElectionDescription(results.election_description)

    log_and_print("decrypting one more time")
    extended_base_hash = results.context.crypto_extended_base_hash
    proven_ballots = decrypt_ballots(
        ied,
        extended_base_hash,
        keypair,
        self.pool,
        results.encrypted_ballots,
    )
    self.assertEqual(len(proven_ballots), len(results.encrypted_ballots))
    self.assertNotIn(None, proven_ballots)

    # for speed, we're only going to do this for the first ballot, not all of them
    first_proven = proven_ballots[0]
    first_encrypted = results.encrypted_ballots[0]
    ballot_id = first_proven.ballot.object_id

    proofs_ok = verify_proven_ballot_proofs(
        results.context.crypto_extended_base_hash,
        keypair.public_key,
        first_encrypted,
        first_proven,
    )
    self.assertTrue(proofs_ok)

    write_proven_ballot(first_proven, DECRYPTED_DIR)
    self.assertTrue(exists_proven_ballot(ballot_id, DECRYPTED_DIR))
    # a ballot id that was never written must not be reported as present
    self.assertFalse(exists_proven_ballot(ballot_id + "0", DECRYPTED_DIR))
    self.assertEqual(first_proven, load_proven_ballot(ballot_id, DECRYPTED_DIR))

    self.removeTree()  # clean up our mess