Exemple #1
0
  def test_get_customized_loss(self, loss_function_using_logits, expected_train,
                               expected_test):
    """Checks the losses produced when a custom callable is the loss function."""

    def weighted_sum(x, y):
      # Arbitrary, easy-to-verify stand-in for a real loss function.
      return y + 2 * x

    data = AttackInputData(
        logits_train=np.array([123.]),
        logits_test=np.array([123.]),
        probs_train=np.array([456.]),
        probs_test=np.array([456.]),
        labels_train=np.array([1.]),
        labels_test=np.array([-1.]),
        loss_function=weighted_sum,
        loss_function_using_logits=loss_function_using_logits,
    )

    train_loss = data.get_loss_train()
    np.testing.assert_allclose(train_loss, expected_train)
    test_loss = data.get_loss_test()
    np.testing.assert_allclose(test_loss, expected_test)
def _run_attack(attack_input: AttackInputData,
                attack_type: AttackType,
                balance_attacker_training: bool = True,
                min_num_samples: int = 1):
    """Dispatches one membership inference attack of the requested type.

    Args:
      attack_input: input data for running an attack
      attack_type: the attack to run
      balance_attacker_training: Whether the training and test sets for the
        membership inference attacker should have a balanced (roughly equal)
        number of samples from the training and test sets used to develop the
        model under attack. Only forwarded to trained attacks.
      min_num_samples: minimum number of examples in either training or test
        data.

    Returns:
      The attack result, or None when the smaller of the train/test sets has
      fewer than `min_num_samples` examples.
    """
    attack_input.validate()

    smallest_split = min(attack_input.get_train_size(),
                         attack_input.get_test_size())
    if smallest_split < min_num_samples:
        # Too little data to attack this input.
        return None

    if attack_type.is_trained_attack:
        result = _run_trained_attack(attack_input, attack_type,
                                     balance_attacker_training)
    elif attack_type == AttackType.THRESHOLD_ENTROPY_ATTACK:
        result = _run_threshold_entropy_attack(attack_input)
    else:
        # Every remaining attack type falls back to the loss threshold attack.
        result = _run_threshold_attack(attack_input)
    return result
def run_membership_probability_analysis(
        attack_input: AttackInputData,
        slicing_spec: SlicingSpec = None) -> MembershipProbabilityResults:
    """Computes membership probabilities for every requested slice.

    Args:
      attack_input: input data for compute membership probabilities
      slicing_spec: specifies attack_input slices; when None, only the entire
        dataset is analysed.

    Returns:
      the membership probability results, one entry per slice.
    """
    attack_input.validate()

    if slicing_spec is None:
        slicing_spec = SlicingSpec(entire_dataset=True)

    # The number of classes is only looked up when slicing by class.
    num_classes = attack_input.num_classes if slicing_spec.by_class else None

    per_slice_results = [
        _compute_membership_probability(get_slice(attack_input, spec))
        for spec in get_single_slice_specs(slicing_spec, num_classes)
    ]

    return MembershipProbabilityResults(
        membership_prob_results=per_slice_results)
def _compute_membership_probability(
        attack_input: AttackInputData,
        num_bins: int = 15) -> SingleMembershipProbabilityResult:
    """Computes each individual point's likelihood of being a member.

    This is the quantity denoted as privacy risk score in
    https://arxiv.org/abs/2003.10595.

    For an individual sample, its privacy risk score is computed as the
    posterior probability of being in the training set after observing its
    prediction output by the target machine learning model. The posterior is
    estimated from normalized train/test histograms over log-spaced bins.

    Args:
      attack_input: input data for compute membership probability
      num_bins: the number of bins used to compute the training/test histogram

    Returns:
      membership probability results
    """

    # Prefer explicitly provided losses, then explicitly provided entropies;
    # only compute the loss when neither pair is fully available.
    if attack_input.loss_train is not None and attack_input.loss_test is not None:
        train_values = attack_input.loss_train
        test_values = attack_input.loss_test
    elif attack_input.entropy_train is not None and attack_input.entropy_test is not None:
        train_values = attack_input.entropy_train
        test_values = attack_input.entropy_test
    else:
        train_values = attack_input.get_loss_train()
        test_values = attack_input.get_loss_test()

    # Clamp to a small positive value so the log-scale bin edges are defined.
    small_value = 1e-10
    train_values = np.maximum(train_values, small_value)
    test_values = np.maximum(test_values, small_value)

    min_value = min(train_values.min(), test_values.min())
    max_value = max(train_values.max(), test_values.max())
    bins_hist = np.logspace(np.log10(min_value), np.log10(max_value),
                            num_bins + 1)
    # 10 ** log10(v) does not always round-trip to v exactly, so the outer
    # edges may land just inside the data range. Widen them (never narrow, so
    # the edges stay monotonic) so that every sample is counted by
    # np.histogram and gets a valid bin from np.digitize.
    bins_hist[0] = min(bins_hist[0], min_value)
    bins_hist[-1] = max(bins_hist[-1], max_value)

    def _bin_indices(values):
        # np.digitize is 1-based and returns num_bins + 1 for the right-most
        # edge. Clip on both sides so no sample can get a negative index,
        # which would silently wrap around to the last bin.
        return np.clip(
            np.digitize(values, bins=bins_hist) - 1, 0, num_bins - 1)

    train_hist, _ = np.histogram(train_values, bins=bins_hist)
    train_hist = train_hist / (len(train_values) + 0.0)
    train_hist_indices = _bin_indices(train_values)

    test_hist, _ = np.histogram(test_values, bins=bins_hist)
    test_hist = test_hist / (len(test_values) + 0.0)
    test_hist_indices = _bin_indices(test_values)

    # Posterior of "train" per bin; empty bins get a tiny denominator to avoid
    # division by zero (their numerator is zero as well).
    combined_hist = train_hist + test_hist
    combined_hist[combined_hist == 0] = small_value
    membership_prob_list = train_hist / (combined_hist + 0.0)
    train_membership_probs = membership_prob_list[train_hist_indices]
    test_membership_probs = membership_prob_list[test_hist_indices]

    return SingleMembershipProbabilityResult(
        slice_spec=_get_slice_spec(attack_input),
        train_membership_probs=train_membership_probs,
        test_membership_probs=test_membership_probs)
Exemple #5
0
  def test_get_entropy_explicitly_provided(self):
    """Checks that explicitly supplied entropies are what the getters return."""
    train_entropy = [0.0, 2.0, 1.0]
    test_entropy = [0.5, 3.0, 5.0]
    data = AttackInputData(
        entropy_train=np.array(train_entropy),
        entropy_test=np.array(test_entropy))

    np.testing.assert_equal(data.get_entropy_train().tolist(), train_entropy)
    np.testing.assert_equal(data.get_entropy_test().tolist(), test_entropy)
Exemple #6
0
  def test_get_loss_explicitly_provided(self):
    """Checks that explicitly supplied losses are what the getters return."""
    train_loss = [1.0, 3.0, 6.0]
    test_loss = [1.0, 4.0, 6.0]
    data = AttackInputData(
        loss_train=np.array(train_loss), loss_test=np.array(test_loss))

    np.testing.assert_equal(data.get_loss_train().tolist(), train_loss)
    np.testing.assert_equal(data.get_loss_test().tolist(), test_loss)
Exemple #7
0
  def test_get_probs_sizes(self):
    """Checks the reported sizes when only probabilities and labels are given."""
    train_probs = np.array([[0.1, 0.1, 0.8], [0.8, 0.2, 0]])
    test_probs = np.array([[0, 0.0001, 0.9999]])
    data = AttackInputData(
        probs_train=train_probs,
        probs_test=test_probs,
        labels_train=np.array([1, 0]),
        labels_test=np.array([0]))

    # Two training rows and one test row were supplied above.
    np.testing.assert_equal(data.get_train_size(), 2)
    np.testing.assert_equal(data.get_test_size(), 1)
Exemple #8
0
  def test_get_xe_loss_from_probs(self):
    """Checks the default loss computed from multi-class probabilities."""
    data = AttackInputData(
        probs_train=np.array([[0.1, 0.1, 0.8], [0.8, 0.2, 0]]),
        probs_test=np.array([[0, 0.0001, 0.9999], [0.07, 0.18, 0.75]]),
        labels_train=np.array([1, 0]),
        labels_test=np.array([0, 2]))

    expected_train = [2.30258509, 0.2231436]
    expected_test = [18.42068074, 0.28768207]

    np.testing.assert_allclose(
        data.get_loss_train(), expected_train, atol=1e-7)
    np.testing.assert_allclose(
        data.get_loss_test(), expected_test, atol=1e-7)
Exemple #9
0
  def test_get_xe_loss_from_logits(self):
    """Checks the default loss computed from multi-class logits."""
    data = AttackInputData(
        logits_train=np.array([[-0.3, 1.5, 0.2], [2, 3, 0.5]]),
        logits_test=np.array([[2, 0.3, 0.2], [0.3, -0.5, 0.2]]),
        labels_train=np.array([1, 0]),
        labels_test=np.array([0, 2]))

    expected_train = [0.36313551, 1.37153903]
    expected_test = [0.29860897, 0.95618669]

    np.testing.assert_allclose(
        data.get_loss_train(), expected_train, atol=1e-7)
    np.testing.assert_allclose(
        data.get_loss_test(), expected_test, atol=1e-7)
Exemple #10
0
 def test_get_binary_xe_loss_from_logits(self):
   """Checks the loss computed from 1-D logits with all-zero / all-one labels."""
   data = AttackInputData(
       logits_train=np.array([-10, -5, 0., 5, 10]),
       logits_test=np.array([-10, -5, 0., 5, 10]),
       labels_train=np.zeros((5,)),
       labels_test=np.ones((5,)),
       loss_function_using_logits=True)

   # Expected losses for label 0; the label-1 case is checked against the
   # same values in reverse order.
   loss_for_label0 = np.array(
       [0.000045398, 0.006715348, 0.6931471825, 5, 10])
   loss_for_label1 = loss_for_label0[::-1]

   np.testing.assert_allclose(
       data.get_loss_train(), loss_for_label0, rtol=1e-2)
   np.testing.assert_allclose(
       data.get_loss_test(), loss_for_label1, rtol=1e-2)
Exemple #11
0
def create_attacker_data(attack_input_data: data_structures.AttackInputData,
                         balance: bool = True) -> AttackerData:
    """Builds the feature/label arrays used to train ML attackers.

    Logits (or probabilities) and losses are stacked column-wise into one
    feature matrix, with the training samples first and the test samples after
    them. Training samples are labelled 0 and test samples 1.

    Args:
      attack_input_data: Original AttackInputData
      balance: Whether the training and test sets for the membership inference
        attacker should have a balanced (roughly equal) number of samples from
        the training and test sets used to develop the model under attack.

    Returns:
      AttackerData whose `fold_indices` are the shuffled samples kept for the
      attacker and whose `left_out_indices` are the samples dropped to balance
      the two groups (empty when nothing was dropped).
    """
    train_features = _column_stack(attack_input_data.logits_or_probs_train,
                                   attack_input_data.get_loss_train())
    test_features = _column_stack(attack_input_data.logits_or_probs_test,
                                  attack_input_data.get_loss_test())

    ntrain = train_features.shape[0]
    ntest = test_features.shape[0]
    total = ntrain + ntest

    features_all = np.concatenate((train_features, test_features))
    labels_all = np.concatenate((np.zeros(ntrain), np.ones(ntest)))

    # By default every sample is used and nothing is left out.
    fold_indices = np.arange(total)
    left_out_indices = np.asarray([], dtype=np.int32)

    if balance and ntrain != ntest:
        train_ids = range(ntrain)
        test_ids = range(ntrain, total)
        if ntrain > ntest:
            larger, smaller = train_ids, test_ids
        else:
            larger, smaller = test_ids, train_ids
        surplus = abs(ntrain - ntest)

        # Randomly drop the surplus from the larger group so that both groups
        # contribute the same number of samples.
        shuffled_larger = np.random.permutation(larger)
        left_out_indices = shuffled_larger[:surplus]
        fold_indices = np.concatenate((shuffled_larger[surplus:], smaller))

    # Shuffle indices for the downstream attackers.
    fold_indices = np.random.permutation(fold_indices)

    data_size = data_structures.DataSize(ntrain=ntrain, ntest=ntest)
    return AttackerData(features_all=features_all,
                        labels_all=labels_all,
                        fold_indices=fold_indices,
                        left_out_indices=left_out_indices,
                        data_size=data_size)
Exemple #12
0
 def test_default_loss_function_using_logits(self, logits, probs, expected):
   """Tests for `loss_function_using_logits = None`. Should prefer logits."""
   labels = [1, 0.]
   data = AttackInputData(
       logits_train=logits,
       logits_test=logits,
       probs_train=probs,
       probs_test=probs,
       labels_train=np.array(labels),
       labels_test=np.array(labels),
       loss_function=LossFunction.SQUARED,
   )

   # Train and test share the same inputs, so both must give `expected`.
   train_loss = data.get_loss_train()
   np.testing.assert_allclose(train_loss, expected)
   test_loss = data.get_loss_test()
   np.testing.assert_allclose(test_loss, expected)
Exemple #13
0
 def test_get_squared_loss(self, loss_function_using_logits, expected_train,
                           expected_test):
   """Checks the squared loss for the selected input (logits or probs)."""
   data = AttackInputData(
       logits_train=np.array([0, 0.]),
       logits_test=np.array([0, 0.]),
       probs_train=np.array([1, 1.]),
       probs_test=np.array([1, 1.]),
       labels_train=np.array([1, 0.]),
       labels_test=np.array([0, 2.]),
       loss_function=LossFunction.SQUARED,
       loss_function_using_logits=loss_function_using_logits,
   )

   train_loss = data.get_loss_train()
   np.testing.assert_allclose(train_loss, expected_train)
   test_loss = data.get_loss_test()
   np.testing.assert_allclose(test_loss, expected_test)
def run_attacks(attack_input: AttackInputData,
                slicing_spec: SlicingSpec = None,
                attack_types: Iterable[AttackType] = (
                    AttackType.THRESHOLD_ATTACK, ),
                privacy_report_metadata: PrivacyReportMetadata = None,
                balance_attacker_training: bool = True,
                min_num_samples: int = 1) -> AttackResults:
    """Runs membership inference attacks on a classification model.

    Every attack in `attack_types` is run on every slice of `attack_input`
    described by `slicing_spec`. Attacks that return no result (None) are left
    out of the returned results.

    Args:
      attack_input: input data for running an attack
      slicing_spec: specifies attack_input slices to run attack on; when None,
        only the entire dataset is attacked.
      attack_types: attacks to run
      privacy_report_metadata: the metadata of the model under attack.
      balance_attacker_training: Whether the training and test sets for the
        membership inference attacker should have a balanced (roughly equal)
        number of samples from the training and test sets used to develop the
        model under attack.
      min_num_samples: minimum number of examples in either training or test
        data.

    Returns:
      the attack result.
    """
    attack_input.validate()

    if slicing_spec is None:
        slicing_spec = SlicingSpec(entire_dataset=True)

    # The number of classes is only looked up when slicing by class.
    num_classes = attack_input.num_classes if slicing_spec.by_class else None

    collected = []
    for spec in get_single_slice_specs(slicing_spec, num_classes):
        sliced_input = get_slice(attack_input, spec)
        for attack_type in attack_types:
            outcome = _run_attack(sliced_input, attack_type,
                                  balance_attacker_training, min_num_samples)
            if outcome is None:
                continue
            collected.append(outcome)

    # Metadata is filled in from the full, unsliced input.
    completed_metadata = _compute_missing_privacy_report_metadata(
        privacy_report_metadata, attack_input)

    return AttackResults(single_attack_results=collected,
                         privacy_report_metadata=completed_metadata)
Exemple #15
0
def _slice_by_percentiles(data: AttackInputData, from_percentile: float,
                          to_percentile: float):
    """Keeps the samples whose loss lies between two loss percentiles.

    The percentiles are taken over the pooled train and test losses, and both
    bounds are inclusive.
    """
    train_losses = data.get_loss_train()
    test_losses = data.get_loss_test()

    # Thresholds come from the combined loss distribution.
    pooled_losses = np.concatenate((train_losses, test_losses))
    lower_bound = np.percentile(pooled_losses, from_percentile)
    upper_bound = np.percentile(pooled_losses, to_percentile)

    def _within_bounds(values):
        return (lower_bound <= values) & (values <= upper_bound)

    return _slice_data_by_indices(data, _within_bounds(train_losses),
                                  _within_bounds(test_losses))
Exemple #16
0
    def __init__(self, methodname):
        """Initialize the test class with a fixed 3-class AttackInputData."""
        super().__init__(methodname)

        # Four training and four test samples for a 3 class classification
        # task; every optional field of AttackInputData used here is filled.
        fixture = dict(
            logits_train=np.array([[0, 1, 0], [2, 0, 3], [4, 5, 0],
                                   [6, 7, 0]]),
            logits_test=np.array([[10, 0, 11], [12, 13, 0], [14, 15, 0],
                                  [0, 16, 17]]),
            probs_train=np.array([[0, 1, 0], [0.1, 0, 0.7], [0.4, 0.6, 0],
                                  [0.3, 0.7, 0]]),
            probs_test=np.array([[0.4, 0, 0.6], [0.1, 0.9, 0],
                                 [0.15, 0.85, 0], [0, 0, 1]]),
            labels_train=np.array([1, 0, 1, 2]),
            labels_test=np.array([1, 2, 0, 2]),
            loss_train=np.array([2, 0.25, 4, 3]),
            loss_test=np.array([0.5, 3.5, 7, 4.5]),
            entropy_train=np.array([0.4, 8, 0.6, 10]),
            entropy_test=np.array([15, 10.5, 4.5, 0.3]),
        )

        self.input_data = AttackInputData(**fixture)
Exemple #17
0
def get_test_input(n_train, n_test):
  """Builds a reproducible 5-class AttackInputData with random logits."""
  num_classes = 5
  generator = np.random.RandomState(4)

  def cyclic_labels(count):
    # Labels cycle through 0..num_classes-1.
    return np.array([index % num_classes for index in range(count)])

  # Train logits are drawn before test logits so the seeded stream is
  # consumed in a fixed order.
  train_logits = generator.randn(n_train, num_classes) + 0.2
  test_logits = generator.randn(n_test, num_classes) + 0.2

  return AttackInputData(
      logits_train=train_logits,
      logits_test=test_logits,
      labels_train=cyclic_labels(n_train),
      labels_test=cyclic_labels(n_test))
Exemple #18
0
  def test_run_attack_threshold_entropy_calculates_correct_auc(self):
    """Checks the AUC of the entropy threshold attack on a small fixed input."""
    train_entropy = np.array([0.1, 0.2, 1.3, 0.4, 0.5, 0.6])
    test_entropy = np.array([1.1, 1.2, 1.3, 0.4, 1.5, 1.6])
    entropy_only_input = AttackInputData(
        entropy_train=train_entropy, entropy_test=test_entropy)

    result = mia._run_attack(entropy_only_input,
                             AttackType.THRESHOLD_ENTROPY_ATTACK)

    auc = result.roc_curve.get_auc()
    np.testing.assert_almost_equal(auc, 0.83, decimal=2)
def _compute_missing_privacy_report_metadata(
        metadata: PrivacyReportMetadata,
        attack_input: AttackInputData) -> PrivacyReportMetadata:
    """Fills in accuracy and average-loss fields that are still None.

    Fields that already hold a value are left untouched. A `metadata` object
    passed by the caller is updated in place and returned; when `metadata` is
    None a new PrivacyReportMetadata is created.
    """
    metadata = PrivacyReportMetadata() if metadata is None else metadata

    if metadata.accuracy_train is None:
        metadata.accuracy_train = _get_accuracy(attack_input.logits_train,
                                                attack_input.labels_train)
    if metadata.accuracy_test is None:
        metadata.accuracy_test = _get_accuracy(attack_input.logits_test,
                                               attack_input.labels_test)

    # Both losses are fetched up front, whether or not they end up being used.
    train_losses = attack_input.get_loss_train()
    test_losses = attack_input.get_loss_test()

    train_loss_missing = metadata.loss_train is None
    if train_loss_missing and train_losses is not None:
        metadata.loss_train = np.average(train_losses)

    test_loss_missing = metadata.loss_test is None
    if test_loss_missing and test_losses is not None:
        metadata.loss_test = np.average(test_losses)

    return metadata
Exemple #20
0
  def test_run_compute_membership_probability_correct_probs(self):
    """Checks membership probabilities for losses spread over powers of ten."""
    loss_only_input = AttackInputData(
        loss_train=np.array([1, 1, 1, 10, 100]),
        loss_test=np.array([10, 100, 100, 1000, 10000]))

    result = mia._compute_membership_probability(loss_only_input)

    expected_train_probs = [1, 1, 1, 0.5, 0.33]
    expected_test_probs = [0.5, 0.33, 0.33, 0, 0]
    np.testing.assert_almost_equal(
        result.train_membership_probs, expected_train_probs, decimal=2)
    np.testing.assert_almost_equal(
        result.test_membership_probs, expected_test_probs, decimal=2)
Exemple #21
0
  def test_get_binary_xe_loss_from_probs(self):
    """Checks the loss computed from 1-D probabilities for labels 0 and 1."""
    data = AttackInputData(
        probs_train=np.array([0.2, 0.7, 0.1, 0.99, 0.002, 0.008]),
        probs_test=np.array([0.2, 0.7, 0.1, 0.99, 0.002, 0.008]),
        labels_train=np.zeros((6,)),
        labels_test=np.ones((6,)),
        loss_function_using_logits=False)

    # Training labels are all 0, test labels are all 1.
    loss_for_label0 = np.array([
        0.2231435513, 1.2039728043, 0.1053605157, 4.6051701860, 0.0020020027,
        0.0080321717
    ])
    loss_for_label1 = np.array([
        1.6094379124, 0.3566749439, 2.3025850930, 0.0100503359, 6.2146080984,
        4.8283137373
    ])

    train_loss = data.get_loss_train()
    np.testing.assert_allclose(train_loss, loss_for_label0, atol=1e-7)
    test_loss = data.get_loss_test()
    np.testing.assert_allclose(test_loss, loss_for_label1, atol=1e-7)
Exemple #22
0
 def test_unbalanced_create_attacker_data_loss_and_logits(self):
     """Balancing 3 train vs 2 test samples must leave out one train sample."""
     train_logits = np.array([[1, 2], [5, 6], [8, 9]])
     test_logits = np.array([[10, 11], [14, 15]])
     data = AttackInputData(logits_train=train_logits,
                            logits_test=test_logits,
                            loss_train=np.array([3, 7, 10]),
                            loss_test=np.array([12, 16]))

     attacker_data = models.create_attacker_data(data, balance=True)

     self.assertLen(attacker_data.features_all, 5)
     self.assertLen(attacker_data.fold_indices, 4)
     self.assertLen(attacker_data.left_out_indices, 1)
     # Indices 0..2 are the training rows, so the dropped one is among them.
     dropped_index = attacker_data.left_out_indices[0]
     self.assertIn(dropped_index, [0, 1, 2])
Exemple #23
0
 def test_balanced_create_attacker_data_loss_and_logits(self):
     """With 3 train and 3 test samples nothing should be left out."""
     train_logits = np.array([[1, 2], [5, 6], [8, 9]])
     test_logits = np.array([[10, 11], [14, 15], [17, 18]])
     data = AttackInputData(logits_train=train_logits,
                            logits_test=test_logits,
                            loss_train=np.array([3, 7, 10]),
                            loss_test=np.array([12, 16, 19]))

     attacker_data = models.create_attacker_data(data)

     self.assertLen(attacker_data.features_all, 6)
     self.assertLen(attacker_data.fold_indices, 6)
     self.assertEmpty(attacker_data.left_out_indices)
Exemple #24
0
 def test_loss_wrong_input(self, logits, probs, loss_function_using_logits):
   """Checks that both loss getters raise ValueError for the given inputs."""
   data = AttackInputData(
       logits_train=logits,
       logits_test=logits,
       probs_train=probs,
       probs_test=probs,
       labels_train=np.array([1.]),
       labels_test=np.array([0.]),
       loss_function_using_logits=loss_function_using_logits,
   )

   with self.assertRaises(ValueError):
     data.get_loss_train()
   with self.assertRaises(ValueError):
     data.get_loss_test()
Exemple #25
0
  def test_get_entropy(self):
    """Checks the entropies computed from logits, with and without labels."""
    train_logits = [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]]
    test_logits = [[0.0, 1.0, 0.0], [0.0, 0.0, 1.0]]

    # Case 1: labels are supplied together with the logits.
    labelled = AttackInputData(
        logits_train=np.array(train_logits),
        logits_test=np.array(test_logits),
        labels_train=np.array([0, 2]),
        labels_test=np.array([0, 2]))

    np.testing.assert_equal(labelled.get_entropy_train().tolist(), [0, 0])
    expected_test_entropy = [2 * _log_value(0), 0]
    np.testing.assert_equal(labelled.get_entropy_test().tolist(),
                            expected_test_entropy)

    # Case 2: the same logits without any labels.
    unlabelled = AttackInputData(
        logits_train=np.array(train_logits),
        logits_test=np.array(test_logits))

    np.testing.assert_equal(unlabelled.get_entropy_train().tolist(), [0, 0])
    np.testing.assert_equal(unlabelled.get_entropy_test().tolist(), [0, 0])
Exemple #26
0
def run_attack_on_keras_model(
        model,
        in_train,
        out_train,
        slicing_spec: SlicingSpec = None,
        attack_types: Iterable[AttackType] = (AttackType.THRESHOLD_ATTACK, ),
        is_logit: bool = False,
        batch_size: int = 32):
    """Runs membership inference attacks against a trained model.

    Predictions and losses are computed for both sample sets with
    `calculate_losses` and then handed to `mia.run_attacks`.

    Args:
      model: model to be tested
      in_train: a (in_training samples, in_training labels) tuple
      out_train: a (out_training samples, out_training labels) tuple
      slicing_spec: slicing specification of the attack
      attack_types: a list of attacks, each of type AttackType
      is_logit: whether the result of model.predict is logit or probability
      batch_size: the batch size for model.predict

    Returns:
      Results of the attack
    """
    member_samples, member_labels = in_train
    nonmember_samples, nonmember_labels = out_train

    # Compute predictions and losses, members first.
    member_pred, member_loss = calculate_losses(
        model, member_samples, member_labels, is_logit, batch_size)
    nonmember_pred, nonmember_loss = calculate_losses(
        model, nonmember_samples, nonmember_labels, is_logit, batch_size)

    # NOTE(review): predictions are passed as logits even when `is_logit` is
    # False - confirm this is intended.
    attack_input = AttackInputData(logits_train=member_pred,
                                   logits_test=nonmember_pred,
                                   labels_train=member_labels,
                                   labels_test=nonmember_labels,
                                   loss_train=member_loss,
                                   loss_test=nonmember_loss)

    return mia.run_attacks(attack_input,
                           slicing_spec=slicing_spec,
                           attack_types=attack_types)
def _run_threshold_entropy_attack(attack_input: AttackInputData):
    """Runs a threshold attack on entropy.

    Training samples are labelled 0 and test samples 1 for the ROC curve, and
    the entropies themselves are used as the ROC scores. The negated entropies
    are reported as membership scores.

    Args:
      attack_input: input data for running the attack.

    Returns:
      A SingleAttackResult for AttackType.THRESHOLD_ENTROPY_ATTACK.

    Raises:
      ValueError: if the train or test entropy is not available.
    """
    ntrain, ntest = attack_input.get_train_size(), attack_input.get_test_size()
    # Fetch each entropy once and reuse it for the ROC curve and the scores.
    entropy_train = attack_input.get_entropy_train()
    entropy_test = attack_input.get_entropy_test()
    if entropy_train is None or entropy_test is None:
        # Same explicit failure mode as the loss threshold attack.
        raise ValueError(
            'Not possible to run threshold entropy attack without entropies.')
    fpr, tpr, thresholds = metrics.roc_curve(
        np.concatenate((np.zeros(ntrain), np.ones(ntest))),
        np.concatenate((entropy_train, entropy_test)))

    roc_curve = RocCurve(tpr=tpr, fpr=fpr, thresholds=thresholds)

    return SingleAttackResult(
        slice_spec=_get_slice_spec(attack_input),
        data_size=DataSize(ntrain=ntrain, ntest=ntest),
        attack_type=AttackType.THRESHOLD_ENTROPY_ATTACK,
        membership_scores_train=-entropy_train,
        membership_scores_test=-entropy_test,
        roc_curve=roc_curve)
Exemple #28
0
def run_seq2seq_attack(
        attack_input: Seq2SeqAttackInputData,
        privacy_report_metadata: PrivacyReportMetadata = None,
        balance_attacker_training: bool = True) -> AttackResults:
    """Runs membership inference attacks on a seq2seq model.

    The loss and accuracy fields of `privacy_report_metadata` are always
    overwritten with the values computed here.

    Args:
      attack_input: input data for running an attack
      privacy_report_metadata: the metadata of the model under attack.
      balance_attacker_training: Whether the training and test sets for the
        membership inference attacker should have a balanced (roughly equal)
        number of samples from the training and test sets used to develop the
        model under attack.

    Returns:
      the attack result.
    """
    attack_input.validate()

    features_train, train_loss, train_accuracy = (
        _get_attack_features_and_metadata(attack_input.logits_train,
                                          attack_input.labels_train))
    features_test, test_loss, test_accuracy = (
        _get_attack_features_and_metadata(attack_input.logits_test,
                                          attack_input.labels_test))

    if not privacy_report_metadata:
        privacy_report_metadata = PrivacyReportMetadata()
    privacy_report_metadata.loss_train = train_loss
    privacy_report_metadata.loss_test = test_loss
    privacy_report_metadata.accuracy_train = train_accuracy
    privacy_report_metadata.accuracy_test = test_accuracy

    # `features_train` and `features_test` contain the rank of the
    # ground-truth label in the logit, so a smaller value means an example is
    # more likely a training example. They are fed to the threshold attack in
    # place of losses.
    rank_based_input = AttackInputData(loss_train=features_train,
                                       loss_test=features_test)
    return mia.run_attacks(rank_based_input,
                           attack_types=(AttackType.THRESHOLD_ATTACK, ),
                           privacy_report_metadata=privacy_report_metadata,
                           balance_attacker_training=balance_attacker_training)
Exemple #29
0
  def test_run_attack_trained_sets_membership_scores(self):
    """Checks the membership scores produced by a trained attack."""
    # All training rows are identical, and so are all test rows.
    separable_input = AttackInputData(
        logits_train=np.tile([500., -500.], (100, 1)),
        logits_test=np.tile([0., 0.], (50, 1)))

    result = mia._run_trained_attack(
        separable_input,
        AttackType.LOGISTIC_REGRESSION,
        balance_attacker_training=True)

    train_scores = result.membership_scores_train
    test_scores = result.membership_scores_test
    self.assertLen(train_scores, 100)
    self.assertLen(test_scores, 50)

    # Scores for all training (resp. test) examples should be close
    np.testing.assert_allclose(train_scores, train_scores[0], rtol=1e-3)
    np.testing.assert_allclose(test_scores, test_scores[0], rtol=1e-3)
    # Training score should be smaller than test score
    self.assertLess(train_scores[0], test_scores[0])
def _run_threshold_attack(attack_input: AttackInputData):
    """Runs a threshold attack on loss.

    Training samples are labelled 0 and test samples 1 for the ROC curve, and
    the losses themselves are used as the ROC scores and reported as the
    membership scores.

    Args:
      attack_input: input data for running the attack.

    Returns:
      A SingleAttackResult for AttackType.THRESHOLD_ATTACK.

    Raises:
      ValueError: if the train or test loss is not available.
    """
    ntrain, ntest = attack_input.get_train_size(), attack_input.get_test_size()
    # Fetch each loss once and reuse it for the ROC curve and the scores.
    loss_train = attack_input.get_loss_train()
    loss_test = attack_input.get_loss_test()
    if loss_train is None or loss_test is None:
        raise ValueError(
            'Not possible to run threshold attack without losses.')
    fpr, tpr, thresholds = metrics.roc_curve(
        np.concatenate((np.zeros(ntrain), np.ones(ntest))),
        np.concatenate((loss_train, loss_test)))

    roc_curve = RocCurve(tpr=tpr, fpr=fpr, thresholds=thresholds)

    # NOTE(review): `_run_threshold_entropy_attack` reports negated entropies
    # as membership scores while the raw losses are reported here - confirm
    # which sign convention consumers expect.
    return SingleAttackResult(
        slice_spec=_get_slice_spec(attack_input),
        data_size=DataSize(ntrain=ntrain, ntest=ntest),
        attack_type=AttackType.THRESHOLD_ATTACK,
        membership_scores_train=loss_train,
        membership_scores_test=loss_test,
        roc_curve=roc_curve)