async def run_async(
    self,
    pc_instance: PrivateComputationInstance,
    server_ips: Optional[List[str]] = None,
) -> PrivateComputationInstance:
    """Runs the pcf2.0 based private aggregation stage

    Args:
        pc_instance: the private computation instance to run aggregation stage
        server_ips: only used by the partner role. These are the ip addresses
            of the publisher's containers.

    Returns:
        An updated version of pc_instance that stores an MPCInstance
    """
    # Build the per-container argument dicts for the aggregation game.
    game_args = self._get_compute_metrics_game_args(pc_instance)

    # len(game_args) can differ depending on how it was generated, but on a
    # rerun the number of server ips must always match the container count.
    if server_ips and len(server_ips) != len(game_args):
        raise ValueError(
            f"Unable to rerun MPC pcf2.0 based aggregation because there is a mismatch between the number of server ips given ({len(server_ips)}) and the number of containers ({len(game_args)}) to be spawned."
        )

    # Create and start the MPC instance that performs the compute.
    logging.info("Starting to run MPC instance for pcf2.0 based aggregation stage.")

    stage_data = PrivateComputationServiceData.PCF2_AGGREGATION_STAGE_DATA
    binary_config = self._onedocker_binary_config_map[
        OneDockerBinaryNames.PCF2_AGGREGATION.value
    ]
    # Retry counter is appended so each rerun gets a distinct MPC instance id.
    unique_mpc_instance_id = (
        pc_instance.instance_id
        + "_"
        + GameNames.PCF2_AGGREGATION.value
        + str(pc_instance.retry_counter)
    )

    mpc_instance = await create_and_start_mpc_instance(
        mpc_svc=self._mpc_service,
        instance_id=unique_mpc_instance_id,
        game_name=checked_cast(str, stage_data.game_name),
        mpc_party=map_private_computation_role_to_mpc_party(pc_instance.role),
        num_containers=len(game_args),
        binary_version=binary_config.binary_version,
        server_ips=server_ips,
        game_args=game_args,
        container_timeout=self._container_timeout,
        repository_path=binary_config.repository_path,
    )
    logging.info("MPC instance started running for pcf2.0 based aggregation stage.")

    # Record the new MPC instance on the PC instance so status polling can
    # find it, then hand the updated PC instance back to the caller.
    pc_instance.instances.append(PCSMPCInstance.from_mpc_instance(mpc_instance))
    return pc_instance
def get_updated_pc_status_mpc_game( private_computation_instance: PrivateComputationInstance, mpc_svc: MPCService, ) -> PrivateComputationInstanceStatus: """Updates the MPCInstances and gets latest PrivateComputationInstance status Arguments: private_computation_instance: The PC instance that is being updated mpc_svc: Used to update MPC instances stored on private_computation_instance Returns: The latest status for private_computation_instance """ status = private_computation_instance.status if private_computation_instance.instances: # Only need to update the last stage/instance last_instance = private_computation_instance.instances[-1] if not isinstance(last_instance, MPCInstance): return status # MPC service has to call update_instance to get the newest containers # information in case they are still running private_computation_instance.instances[ -1] = PCSMPCInstance.from_mpc_instance( mpc_svc.update_instance(last_instance.instance_id)) mpc_instance_status = private_computation_instance.instances[-1].status current_stage = private_computation_instance.current_stage if mpc_instance_status is MPCInstanceStatus.STARTED: status = current_stage.started_status elif mpc_instance_status is MPCInstanceStatus.COMPLETED: status = current_stage.completed_status elif mpc_instance_status in ( MPCInstanceStatus.FAILED, MPCInstanceStatus.CANCELED, ): status = current_stage.failed_status return status
def update(self, instance: MPCInstance) -> None: self.repo.update(PCSMPCInstance.from_mpc_instance(instance))
async def run_async(
    self,
    pc_instance: PrivateComputationInstance,
    server_ips: Optional[List[str]] = None,
) -> PrivateComputationInstance:
    """Runs the private computation aggregate metrics stage

    Args:
        pc_instance: the private computation instance to run aggregate metrics with
        server_ips: only used by the partner role. These are the ip addresses
            of the publisher's containers.

    Returns:
        An updated version of pc_instance that stores an MPCInstance
    """
    num_shards = (
        pc_instance.num_mpc_containers * pc_instance.num_files_per_mpc_container
    )

    # TODO T101225989: map aggregation_type from the compute stage to metrics_format_type
    metrics_format_type = (
        "lift"
        if pc_instance.game_type is PrivateComputationGameType.LIFT
        else "ad_object"
    )

    binary_name = OneDockerBinaryNames.SHARD_AGGREGATOR.value
    binary_config = self._onedocker_binary_config_map[binary_name]

    # Get output path of previous stage depending on what stage flow we are using.
    # Using string literals (e.g. "PrivateComputationDecoupledStageFlow") instead of
    # <FlowCls>.get_cls_name() to avoid a circular import error.
    if pc_instance.get_flow_cls_name in [
        "PrivateComputationDecoupledStageFlow",
        "PrivateComputationDecoupledLocalTestStageFlow",
    ]:
        input_stage_path = pc_instance.decoupled_aggregation_stage_output_base_path
    elif pc_instance.get_flow_cls_name in [
        "PrivateComputationPCF2StageFlow",
        "PrivateComputationPCF2LocalTestStageFlow",
    ]:
        input_stage_path = pc_instance.pcf2_aggregation_stage_output_base_path
    elif pc_instance.get_flow_cls_name == "PrivateComputationPCF2LiftStageFlow":
        input_stage_path = pc_instance.pcf2_lift_stage_output_base_path
    else:
        input_stage_path = pc_instance.compute_stage_output_base_path

    if self._log_cost_to_s3:
        run_name = pc_instance.instance_id
        if pc_instance.post_processing_data:
            pc_instance.post_processing_data.s3_cost_export_output_paths.add(
                f"sa-logs/{run_name}_{pc_instance.role.value.title()}.json",
            )
    else:
        run_name = ""

    if self._is_validating:
        # num_containers_real_data is the number of containers processing real data
        # synthetic data is processed by a dedicated extra container, and this
        # container is always the last container, hence
        # synthetic_data_shard_start_index = num_real_data_shards.
        # Each container, processing real or synthetic data, processes the same
        # number of shards due to our resharding mechanism. num_shards is the
        # total number of shards, equal to num_real_data_shards +
        # num_synthetic_data_shards; hence, given num_containers_real_data and
        # num_shards, num_synthetic_data_shards = num_shards / (num_containers_real_data + 1)
        num_containers_real_data = pc_instance.num_pid_containers
        if num_containers_real_data is None:
            raise ValueError("num_containers_real_data is None")
        num_synthetic_data_shards = num_shards // (num_containers_real_data + 1)
        num_real_data_shards = num_shards - num_synthetic_data_shards
        synthetic_data_shard_start_index = num_real_data_shards

        # One container aggregates the real data shards, a second aggregates
        # the synthetic data shards into a separate output path.
        game_args = [
            {
                "input_base_path": input_stage_path,
                "num_shards": num_real_data_shards,
                "metrics_format_type": metrics_format_type,
                "output_path": pc_instance.shard_aggregate_stage_output_path,
                "first_shard_index": 0,
                "threshold": pc_instance.k_anonymity_threshold,
                "run_name": run_name,
                "log_cost": self._log_cost_to_s3,
            },
            {
                "input_base_path": input_stage_path,
                "num_shards": num_synthetic_data_shards,
                "metrics_format_type": metrics_format_type,
                "output_path": pc_instance.shard_aggregate_stage_output_path
                + "_synthetic_data_shards",
                "first_shard_index": synthetic_data_shard_start_index,
                "threshold": pc_instance.k_anonymity_threshold,
                "run_name": run_name,
                "log_cost": self._log_cost_to_s3,
            },
        ]
    else:
        # A single container aggregates all shards.
        game_args = [
            {
                "input_base_path": input_stage_path,
                "metrics_format_type": metrics_format_type,
                "num_shards": num_shards,
                "output_path": pc_instance.shard_aggregate_stage_output_path,
                "threshold": pc_instance.k_anonymity_threshold,
                "run_name": run_name,
                "log_cost": self._log_cost_to_s3,
            },
        ]

    # We should only export visibility to scribe when it's set
    if pc_instance.result_visibility is not ResultVisibility.PUBLIC:
        result_visibility = int(pc_instance.result_visibility)
        for arg in game_args:
            arg["visibility"] = result_visibility

    # Create and start the MPC instance. num_containers equals len(game_args):
    # 2 in the validating case (real + synthetic), 1 otherwise — same values
    # the two previously-duplicated call sites passed explicitly.
    # NOTE(review): repository_path was previously omitted on the validating
    # path only, unlike every sibling call site; it is now passed consistently.
    mpc_instance = await create_and_start_mpc_instance(
        mpc_svc=self._mpc_service,
        instance_id=pc_instance.instance_id
        + "_aggregate_shards"
        + str(pc_instance.retry_counter),
        game_name=GameNames.SHARD_AGGREGATOR.value,
        mpc_party=map_private_computation_role_to_mpc_party(pc_instance.role),
        num_containers=len(game_args),
        binary_version=binary_config.binary_version,
        server_ips=server_ips,
        game_args=game_args,
        container_timeout=self._container_timeout,
        repository_path=binary_config.repository_path,
    )

    # Push MPC instance to PrivateComputationInstance.instances and update
    # PL Instance status.
    pc_instance.instances.append(PCSMPCInstance.from_mpc_instance(mpc_instance))
    return pc_instance