Example #1
    async def run_async(
        self,
        pc_instance: PrivateComputationInstance,
        server_ips: Optional[List[str]] = None,
    ) -> PrivateComputationInstance:
        """Runs the pcf2.0 based private aggregation stage

        Args:
            pc_instance: the private computation instance to run the aggregation stage with
            server_ips: only used by the partner role. These are the IP addresses of the publisher's containers.

        Returns:
            An updated version of pc_instance that stores an MPCInstance
        """

        # Prepare arguments for the aggregation game
        game_args = self._get_compute_metrics_game_args(pc_instance)

        # We do this check here because, depending on how game_args is generated, len(game_args) could differ,
        #   but we always expect len(server_ips) == len(game_args)
        if server_ips and len(server_ips) != len(game_args):
            raise ValueError(
                f"Unable to rerun MPC pcf2.0 based aggregation because there is a mismatch between the number of server ips given ({len(server_ips)}) and the number of containers ({len(game_args)}) to be spawned."
            )

        # Create and start MPC instance to run MPC compute
        logging.info(
            "Starting to run MPC instance for pcf2.0 based aggregation stage.")

        stage_data = PrivateComputationServiceData.PCF2_AGGREGATION_STAGE_DATA
        binary_name = OneDockerBinaryNames.PCF2_AGGREGATION.value
        game_name = checked_cast(str, stage_data.game_name)

        binary_config = self._onedocker_binary_config_map[binary_name]
        retry_counter_str = str(pc_instance.retry_counter)
        mpc_instance = await create_and_start_mpc_instance(
            mpc_svc=self._mpc_service,
            instance_id=pc_instance.instance_id + "_" +
            GameNames.PCF2_AGGREGATION.value + retry_counter_str,
            game_name=game_name,
            mpc_party=map_private_computation_role_to_mpc_party(
                pc_instance.role),
            num_containers=len(game_args),
            binary_version=binary_config.binary_version,
            server_ips=server_ips,
            game_args=game_args,
            container_timeout=self._container_timeout,
            repository_path=binary_config.repository_path,
        )

        logging.info(
            "MPC instance started running for pcf2.0 based aggregation stage.")

        # Push MPC instance to PrivateComputationInstance.instances and update PL Instance status
        pc_instance.instances.append(
            PCSMPCInstance.from_mpc_instance(mpc_instance))
        return pc_instance
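
For reference, a minimal standalone sketch of the rerun invariant enforced above: one container is spawned per game_args entry, so a partner rerun must supply exactly one publisher IP per container. The helper name check_server_ip_count is hypothetical and not part of the service.

from typing import Any, Dict, List, Optional

def check_server_ip_count(
    server_ips: Optional[List[str]],
    game_args: List[Dict[str, Any]],
) -> None:
    # Mirrors the check in run_async(): a mismatch means the rerun cannot
    # pair each spawned container with a publisher IP.
    if server_ips and len(server_ips) != len(game_args):
        raise ValueError(
            f"Mismatch between the number of server ips ({len(server_ips)}) "
            f"and the number of containers ({len(game_args)})."
        )
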
Example #2
    def _get_attribution_game_args(
        self,
        private_computation_instance: PrivateComputationInstance,
        common_compute_game_args: Dict[str, Any],
    ) -> List[Dict[str, Any]]:
        """Gets attribution specific game args to be passed to game binaries by onedocker

        When onedocker spins up containers to run games, it unpacks a dictionary containing the
        arguments required by the game binary being run. This function prepares arguments specific to
        attribution games.

        Args:
            private_computation_instance: the private computation instance to generate game args for

        Returns:
            MPC game args to be used by onedocker
        """
        aggregation_type = checked_cast(
            AggregationType, private_computation_instance.aggregation_type)
        attribution_rule = checked_cast(
            AttributionRule, private_computation_instance.attribution_rule)
        game_args = [{
            **common_compute_game_args,
            **{
                "aggregators":
                aggregation_type.value,
                "attribution_rules":
                attribution_rule.value,
                "file_start_index":
                i * private_computation_instance.num_files_per_mpc_container,
                "use_xor_encryption":
                True,
                "run_name":
                private_computation_instance.instance_id if self._log_cost_to_s3 else "",
                "max_num_touchpoints":
                private_computation_instance.padding_size,
                "max_num_conversions":
                private_computation_instance.padding_size,
            },
        } for i in range(private_computation_instance.num_mpc_containers)]
        return game_args
    def _get_compute_metrics_game_args(
        self,
        private_computation_instance: PrivateComputationInstance,
    ) -> List[Dict[str, Any]]:
        """Gets the game args passed to game binaries by onedocker

        When onedocker spins up containers to run games, it unpacks a dictionary containing the
        arguments required by the game binary being run. This function prepares that dictionary.

        Args:
            private_computation_instance: the private computation instance to generate game args for

        Returns:
            MPC game args to be used by onedocker
        """

        attribution_rule = checked_cast(
            AttributionRule, private_computation_instance.attribution_rule)
        if self._log_cost_to_s3:
            run_name = (private_computation_instance.instance_id + "_" +
                        GameNames.PCF2_ATTRIBUTION.value)
            if private_computation_instance.post_processing_data:
                private_computation_instance.post_processing_data.s3_cost_export_output_paths.add(
                    f"att-logs/{run_name}_{private_computation_instance.role.value.title()}.json"
                )
        else:
            run_name = ""

        common_game_args = {
            "input_base_path":
            private_computation_instance.data_processing_output_path,
            "output_base_path": private_computation_instance.
            pcf2_attribution_stage_output_base_path,
            "num_files":
            private_computation_instance.num_files_per_mpc_container,
            "concurrency": private_computation_instance.concurrency,
            "run_name": run_name,
            "max_num_touchpoints": private_computation_instance.padding_size,
            "max_num_conversions": private_computation_instance.padding_size,
            "log_cost": self._log_cost_to_s3,
            "attribution_rules": attribution_rule.value,
            "use_xor_encryption": True,
            "use_postfix": True,
        }

        game_args = [{
            **common_game_args,
            **{
                "file_start_index":
                i * private_computation_instance.num_files_per_mpc_container,
            },
        } for i in range(private_computation_instance.num_mpc_containers)]

        return game_args
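
Both builders above follow the same fan-out pattern: the common args are copied once per MPC container and only file_start_index changes per copy. A minimal generic sketch of that pattern, with the hypothetical helper build_sharded_game_args standing in for the inline list comprehensions:

from typing import Any, Dict, List

def build_sharded_game_args(
    common_args: Dict[str, Any],
    num_containers: int,
    files_per_container: int,
) -> List[Dict[str, Any]]:
    # Each container processes a contiguous slice of the input files,
    # so only file_start_index differs between the per-container dicts.
    return [
        {**common_args, "file_start_index": i * files_per_container}
        for i in range(num_containers)
    ]

# build_sharded_game_args({"concurrency": 4}, 2, 10)
# -> [{"concurrency": 4, "file_start_index": 0},
#     {"concurrency": 4, "file_start_index": 10}]
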
    async def run(
        self,
        stage_input: PIDStageInput,
        container_timeout: Optional[int] = None,
        wait_for_containers: bool = True,
    ) -> PIDStageStatus:
        self.logger.info(f"[{self}] Called run")
        instance_id = stage_input.instance_id
        timeout = container_timeout or DEFAULT_CONTAINER_TIMEOUT_IN_SEC
        # Make sure status is READY before proceeding
        status = await self._ready(stage_input)
        await self.update_instance_status(instance_id=instance_id,
                                          status=status)
        if status is not PIDStageStatus.READY:
            return status

        # Some invariant checking on the input and output paths
        input_paths = stage_input.input_paths
        output_paths = stage_input.output_paths
        num_shards = (stage_input.num_shards + 1
                      if stage_input.is_validating else stage_input.num_shards)
        if len(input_paths) != 1:
            raise ValueError(f"Expected 1 input path, not {len(input_paths)}")
        if len(output_paths) != 1:
            raise ValueError(
                f"Expected 1 output path, not {len(output_paths)}")

        await self.update_instance_status(instance_id=instance_id,
                                          status=PIDStageStatus.STARTED)
        if stage_input.pid_use_row_numbers:
            self.logger.info("use-row-numbers is enabled for Private ID")
        if self.stage_type is UnionPIDStage.PUBLISHER_RUN_PID:
            # Run publisher commands in container
            self.logger.info("Publisher spinning up containers")
            try:
                binary = OneDockerBinaryNames.PID_SERVER.value
                if self.protocol == PIDProtocol.UNION_PID_MULTIKEY:
                    binary = OneDockerBinaryNames.PID_MULTI_KEY_SERVER.value
                pending_containers = self.onedocker_svc.start_containers(
                    package_name=binary,
                    version=self.onedocker_binary_config.binary_version,
                    cmd_args_list=self._gen_command_args_list(
                        input_path=input_paths[0],
                        output_path=output_paths[0],
                        num_shards=num_shards,
                        use_row_numbers=stage_input.pid_use_row_numbers
                        and (self.protocol != PIDProtocol.UNION_PID_MULTIKEY),
                        disable_metric_logging=False,
                    ),
                    env_vars=self._gen_env_vars(),
                    timeout=timeout,
                )

                containers = await self.onedocker_svc.wait_for_pending_containers(
                    [
                        container.instance_id
                        for container in pending_containers
                    ])
            except Exception as e:
                status = PIDStageStatus.FAILED
                await self.update_instance_status(instance_id=instance_id,
                                                  status=status)
                self.logger.exception(f"Failed to spin up containers: {e}")
                return status

            # Write containers information to PID instance repository
            await self.update_instance_containers(instance_id=instance_id,
                                                  containers=containers)

            # Get ips from containers and write them to pid instance repository
            self.logger.info("Storing servers' IPs")
            ip_addresses = [
                checked_cast(str, container.ip_address)
                for container in containers
            ]
            await self.put_server_ips(instance_id=instance_id,
                                      server_ips=ip_addresses)

            # Wait until the containers are finished
            if wait_for_containers:
                self.logger.info("Waiting for containers to finish")
                containers = await RunBinaryBaseService.wait_for_containers_async(
                    self.onedocker_svc, containers, SLEEP_UPDATE_SECONDS)
                await self.update_instance_containers(instance_id=instance_id,
                                                      containers=containers)
            status = self.get_stage_status_from_containers(containers)
        elif self.stage_type is UnionPIDStage.ADV_RUN_PID:
            server_ips = self.server_ips or []
            if not server_ips:
                self.logger.error("Missing server_ips")
                status = PIDStageStatus.FAILED
                await self.update_instance_status(instance_id=instance_id,
                                                  status=status)
                return status

            hostnames = [f"http://{ip}" for ip in server_ips]

            # Run partner commands in container
            self.logger.info("Partner spinning up containers")
            try:
                binary = OneDockerBinaryNames.PID_CLIENT.value
                if self.protocol == PIDProtocol.UNION_PID_MULTIKEY:
                    binary = OneDockerBinaryNames.PID_MULTI_KEY_CLIENT.value
                pending_containers = self.onedocker_svc.start_containers(
                    package_name=binary,
                    version=self.onedocker_binary_config.binary_version,
                    cmd_args_list=self._gen_command_args_list(
                        input_path=input_paths[0],
                        output_path=output_paths[0],
                        num_shards=num_shards,
                        server_hostnames=hostnames,
                        use_row_numbers=stage_input.pid_use_row_numbers
                        and (self.protocol != PIDProtocol.UNION_PID_MULTIKEY),
                        disable_metric_logging=True,
                    ),
                    env_vars=self._gen_env_vars(),
                    timeout=timeout,
                )

                containers = await self.onedocker_svc.wait_for_pending_containers(
                    [
                        container.instance_id
                        for container in pending_containers
                    ])
            except Exception as e:
                status = PIDStageStatus.FAILED
                await self.update_instance_status(instance_id=instance_id,
                                                  status=status)
                self.logger.exception(f"Failed to spin up containers: {e}")
                return status

            # Write containers information to PID instance repository
            await self.update_instance_containers(instance_id=instance_id,
                                                  containers=containers)

            if wait_for_containers:
                # Wait until the containers are finished
                self.logger.info("Waiting for containers to finish")
                containers = await RunBinaryBaseService.wait_for_containers_async(
                    self.onedocker_svc, containers, SLEEP_UPDATE_SECONDS)
                await self.update_instance_containers(instance_id=instance_id,
                                                      containers=containers)
            status = self.get_stage_status_from_containers(containers)

        self.logger.info(f"PID Run protocol status: {status}")
        await self.update_instance_status(instance_id=instance_id,
                                          status=status)
        return status
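
The publisher and partner branches above differ mainly in which PID binary they launch and whether metric logging is disabled. A hedged, self-contained sketch of the binary selection; the Role enum and the returned strings are simplified placeholders, not the real OneDockerBinaryNames or PIDProtocol values:

from enum import Enum

class Role(Enum):
    PUBLISHER = "publisher"
    PARTNER = "partner"

def select_pid_binary(role: Role, multikey: bool) -> str:
    # Publisher runs the server-side binary, partner the client-side one;
    # the multikey protocol maps to the multi-key variants.
    if role is Role.PUBLISHER:
        return "pid_multi_key_server" if multikey else "pid_server"
    return "pid_multi_key_client" if multikey else "pid_client"
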
Example #5
async def start_combiner_service(
    private_computation_instance: PrivateComputationInstance,
    onedocker_svc: OneDockerService,
    onedocker_binary_config_map: DefaultDict[str, OneDockerBinaryConfig],
    combine_output_path: str,
    log_cost_to_s3: bool = DEFAULT_LOG_COST_TO_S3,
    wait_for_containers: bool = False,
    max_id_column_count: int = 1,
) -> List[ContainerInstance]:
    """Run combiner service and return those container instances

    Args:
        private_computation_instance: The PC instance to run combiner service with
        onedocker_svc: Spins up containers that run binaries in the cloud
        onedocker_binary_config_map: Stores a mapping from mpc game to OneDockerBinaryConfig (binary version and tmp directory)
        combine_output_path: output path for the combined result
        log_cost_to_s3: whether the monetary cost of the computation will be logged to S3
        wait_for_containers: block until containers finish running; defaults to False
        max_id_column_count: maximum number of id columns passed to the combiner, defaults to 1

    Returns:
        A list of container instances running the combiner service
    """
    stage_data = PrivateComputationServiceData.get(
        private_computation_instance.game_type).combiner_stage

    binary_name = stage_data.binary_name
    binary_config = onedocker_binary_config_map[binary_name]

    # TODO: T106159008 Add on attribution specific args
    if private_computation_instance.game_type is PrivateComputationGameType.ATTRIBUTION:
        run_name = private_computation_instance.instance_id if log_cost_to_s3 else ""
        padding_size = checked_cast(int,
                                    private_computation_instance.padding_size)
        log_cost = log_cost_to_s3
    else:
        run_name = None
        padding_size = None
        log_cost = None

    combiner_service = checked_cast(
        IdSpineCombinerService,
        stage_data.service,
    )

    args = combiner_service.build_args(
        spine_path=private_computation_instance.pid_stage_output_spine_path,
        data_path=private_computation_instance.pid_stage_output_data_path,
        output_path=combine_output_path,
        num_shards=private_computation_instance.num_pid_containers +
        1 if private_computation_instance.is_validating else
        private_computation_instance.num_pid_containers,
        tmp_directory=binary_config.tmp_directory,
        max_id_column_cnt=max_id_column_count,
        run_name=run_name,
        padding_size=padding_size,
        log_cost=log_cost,
    )
    env_vars = {ONEDOCKER_REPOSITORY_PATH: binary_config.repository_path}
    return await combiner_service.start_containers(
        cmd_args_list=args,
        onedocker_svc=onedocker_svc,
        binary_version=binary_config.binary_version,
        binary_name=binary_name,
        timeout=None,
        wait_for_containers_to_finish=wait_for_containers,
        env_vars=env_vars,
    )
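
The num_shards expression passed to build_args above adds one shard when the instance is validating. A minimal sketch of that rule, using the hypothetical helper combiner_shard_count:

def combiner_shard_count(num_pid_containers: int, is_validating: bool) -> int:
    # One extra shard is expected when the instance is validating,
    # mirroring the conditional used in start_combiner_service above.
    return num_pid_containers + 1 if is_validating else num_pid_containers
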
    def server_ips(self) -> List[str]:
        return [
            checked_cast(str, container.ip_address)
            for container in self.containers
        ]
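
The server_ips helper above collects container IPs; the partner branch of run() earlier turns such IPs into http:// hostnames before launching the PID client. A standalone sketch of that transformation (to_hostnames is a hypothetical name):

from typing import List

def to_hostnames(server_ips: List[str]) -> List[str]:
    # The PID client is pointed at one http:// hostname per publisher container.
    return [f"http://{ip}" for ip in server_ips]
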
Example #7
    def _get_compute_metrics_game_args(
        self,
        private_computation_instance: PrivateComputationInstance,
    ) -> List[Dict[str, Any]]:
        """Gets the game args passed to game binaries by onedocker

        When onedocker spins up containers to run games, it unpacks a dictionary containing the
        arguments required by the game binary being run. This function prepares that dictionary.

        Args:
            private_computation_instance: the private computation instance to generate game args for

        Returns:
            MPC game args to be used by onedocker
        """

        logging.info(
            f"PATH : {private_computation_instance.pcf2_attribution_stage_output_base_path}"
        )
        aggregation_type = checked_cast(
            AggregationType, private_computation_instance.aggregation_type)
        # The pcf2.0 based aggregation game does not need the attribution rule. We pass it here just for cost logging,
        # so that we know how much the game cost for a given aggregation format and attribution rule.
        attribution_rule = checked_cast(
            AttributionRule, private_computation_instance.attribution_rule)

        if self._log_cost_to_s3:
            run_name = private_computation_instance.instance_id
            if private_computation_instance.post_processing_data:
                private_computation_instance.post_processing_data.s3_cost_export_output_paths.add(
                    f"agg-logs/{run_name}_{private_computation_instance.role.value.title()}.json",
                )
        else:
            run_name = ""

        common_game_args = {
            "input_base_path":
            private_computation_instance.data_processing_output_path,
            "output_base_path": private_computation_instance.
            pcf2_aggregation_stage_output_base_path,
            "num_files":
            private_computation_instance.num_files_per_mpc_container,
            "concurrency": private_computation_instance.concurrency,
            "aggregators": aggregation_type.value,
            "attribution_rules": attribution_rule.value,
            "input_base_path_secret_share": private_computation_instance.
            pcf2_attribution_stage_output_base_path,
            "use_xor_encryption": True,
            "use_postfix": True,
            "run_name": run_name,
            "max_num_touchpoints": private_computation_instance.padding_size,
            "max_num_conversions": private_computation_instance.padding_size,
            "log_cost": self._log_cost_to_s3,
        }

        game_args = [{
            **common_game_args,
            **{
                "file_start_index":
                i * private_computation_instance.num_files_per_mpc_container,
            },
        } for i in range(private_computation_instance.num_mpc_containers)]

        return game_args
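
When cost logging is enabled, the method above records an S3 export path derived from the instance id and the title-cased role. A hedged sketch of that path construction, with build_agg_cost_log_path as a hypothetical helper:

def build_agg_cost_log_path(instance_id: str, role_name: str) -> str:
    # Mirrors the f"agg-logs/{run_name}_{Role}.json" pattern used above,
    # where run_name is the instance id and the role value is title-cased.
    return f"agg-logs/{instance_id}_{role_name.title()}.json"

# build_agg_cost_log_path("pl_instance_123", "PUBLISHER")
# -> "agg-logs/pl_instance_123_Publisher.json"
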