Esempio n. 1
0
def validate(X, y, model, k=5, run_all=True):
    """
    Run k-fold cross-validation and collect log loss per fold.

    input:
        X: feature matrix, indexable by integer index arrays
        y: target variable, indexable by integer index arrays
        model: estimator exposing fit() and predict_proba()
        k: number of folds for k-fold (default: 5)
        run_all (bool): True for running all folds, False for running one fold.
            NOTE(review): when False no fold runs at all and both lists are
            returned empty -- confirm this is the intended behavior.
    output:
        list_logloss: list of log loss values (validation) for each fold
        list_logloss_train: list of log loss values (train set) for each fold
    """
    # Perform k-fold validation
    # NOTE(review): legacy scikit-learn (<0.18) KFold(n, n_folds) signature;
    # the print statements below are Python 2 syntax, so this example
    # targets Python 2.
    kf = KFold(X.shape[0], k)

    list_logloss = []
    list_logloss_train = []
    if run_all:
        for i, (train_index, val_index) in enumerate(kf):
            print "   validation set", i+1, "started."
            X_train = X[train_index]
            y_train = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
            model.fit(X_train, y_train)
            # Probability of the positive class for validation and train rows.
            probs = model.predict_proba(X_val)[:,1]
            probs_train = model.predict_proba(X_train)[:,1]
            # Track both losses so train/validation gap (overfitting) is visible.
            list_logloss.append(log_loss(y_val,  probs))
            list_logloss_train.append(log_loss(y_train,  probs_train))
    return list_logloss, list_logloss_train
Esempio n. 2
0
def validate(X, y, model, k=5, run_all=True):
    """
    K-fold cross-validation helper returning per-fold log losses.

    input:
        X: feature matrix, indexable by integer index arrays
        y: target variable, indexable by integer index arrays
        model: estimator exposing fit() and predict_proba()
        k: number of folds for k-fold (default: 5)
        run_all (bool): True for running all folds, False for running one fold.
            NOTE(review): when False the loop is skipped entirely and both
            result lists come back empty -- verify against the docstring.
    output:
        list_logloss: list of log loss values (validation) for each fold
        list_logloss_train: list of log loss values (train set) for each fold
    """
    # Perform k-fold validation
    # NOTE(review): legacy scikit-learn KFold(n, n_folds) API; Python 2
    # print syntax below.
    kf = KFold(X.shape[0], k)

    list_logloss = []
    list_logloss_train = []
    if run_all:
        for i, (train_index, val_index) in enumerate(kf):
            print "   validation set", i + 1, "started."
            X_train = X[train_index]
            y_train = y[train_index]
            X_val = X[val_index]
            y_val = y[val_index]
            model.fit(X_train, y_train)
            # Positive-class probabilities for validation and train folds.
            probs = model.predict_proba(X_val)[:, 1]
            probs_train = model.predict_proba(X_train)[:, 1]
            # Keep the train-set loss too so overfitting can be diagnosed.
            list_logloss.append(log_loss(y_val, probs))
            list_logloss_train.append(log_loss(y_train, probs_train))
    return list_logloss, list_logloss_train
Esempio n. 3
0
def yeom_attribute_inference(true_x,
                             true_y,
                             classifier,
                             membership,
                             features,
                             train_loss,
                             test_loss=None):
    """Yeom et al.'s attribute inference attack.

    For each feature index in `features`, the feature column is temporarily
    replaced by its "low" and "high" variants, the model is queried on both,
    and the per-instance log losses are compared under Gaussian likelihoods
    N(0, train_loss) (and N(0, test_loss) when available) to predict the
    attribute value; membership is then inferred from whether the predicted
    attribute matches the true one.

    input:
        true_x (ndarray): feature matrix; columns are overwritten in place
            during probing and restored before the function returns.
        true_y: ground-truth labels for true_x.
        classifier: trained tf.estimator classifier used for predictions.
        membership: ground-truth membership labels (used only for reporting).
        features (iterable): column indices to attack.
        train_loss (float): average training loss of the target model.
        test_loss (float, optional): average test loss; when given, a
            per-record membership mask is derived from the likelihood ratio.
    output:
        list of per-feature predicted-membership arrays.
    """
    print('-' * 10 + 'YEOM\'S ATTRIBUTE INFERENCE' + '-' * 10 + '\n')
    pred_membership_all = []
    for feature in features:
        # Save the column so it can be restored after both probes.
        original_attribute = np.copy(true_x[:, feature])
        low_value, high_value, true_attribute_value = get_attribute_variations(
            true_x, feature)

        # Probe with the "low" attribute variant.
        true_x[:, feature] = low_value
        pred_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
            x={'x': true_x}, num_epochs=1, shuffle=False)
        predictions = classifier.predict(input_fn=pred_input_fn)
        _, low_op = get_predictions(predictions)
        low_op = low_op.astype('float32')
        low_op = log_loss(true_y, low_op)

        # Probe with the "high" attribute variant.
        true_x[:, feature] = high_value
        pred_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
            x={'x': true_x}, num_epochs=1, shuffle=False)
        predictions = classifier.predict(input_fn=pred_input_fn)
        _, high_op = get_predictions(predictions)
        high_op = high_op.astype('float32')
        high_op = log_loss(true_y, high_op)

        # Empirical prior over the attribute value.
        high_prob = np.sum(true_attribute_value) / len(true_attribute_value)
        low_prob = 1 - high_prob

        # FIX: identity comparison with None (`is None`), not `== None`.
        if test_loss is None:
            pred_attribute_value = np.where(
                low_prob * stats.norm(0, train_loss).pdf(low_op) >=
                high_prob * stats.norm(0, train_loss).pdf(high_op), 0, 1)
            mask = [1] * len(pred_attribute_value)
        else:
            # Per-variant membership guesses from train-vs-test likelihoods.
            low_mem = np.where(
                stats.norm(0, train_loss).pdf(low_op) >= stats.norm(
                    0, test_loss).pdf(low_op), 1, 0)
            high_mem = np.where(
                stats.norm(0, train_loss).pdf(high_op) >= stats.norm(
                    0, test_loss).pdf(high_op), 1, 0)
            pred_attribute_value = [
                np.argmax([low_prob * a, high_prob * b])
                for a, b in zip(low_mem, high_mem)
            ]
            # Only records flagged as member under either variant count.
            mask = [a | b for a, b in zip(low_mem, high_mem)]

        # For 0/1 values, x ^ y ^ 1 == 1 exactly when x == y, so this marks
        # records where the predicted attribute matches the truth (and the
        # mask allows it) as members.
        pred_membership = mask & (pred_attribute_value ^ true_attribute_value
                                  ^ [1] * len(pred_attribute_value))
        prety_print_result(membership, pred_membership)
        pred_membership_all.append(pred_membership)
        true_x[:, feature] = original_attribute
    return pred_membership_all
Esempio n. 4
0
def proposed_membership_inference(v_dataset, true_x, true_y, classifier,
                                  per_instance_loss, args):
    """Proposed membership inference attack.

    Trains a reference ("v") model on ``v_dataset`` with the same
    hyper-parameters as the target, computes per-instance log losses and
    noise-induced loss-increase counts for both the reference and the
    target model, and returns the raw quantities needed for evaluation.
    """
    print('-' * 10 + 'PROPOSED MEMBERSHIP INFERENCE' + '-' * 10 + '\n')
    v_train_x, v_train_y, v_test_x, v_test_y = v_dataset
    # Stack the reference model's train+test pool the same way as the target's.
    v_true_x = np.vstack([v_train_x, v_test_x])
    v_true_y = np.concatenate([v_train_y, v_test_y])

    # Mirror the target model's training configuration exactly.
    target_config = dict(
        epochs=args.target_epochs,
        batch_size=args.target_batch_size,
        learning_rate=args.target_learning_rate,
        n_hidden=args.target_n_hidden,
        l2_ratio=args.target_l2_ratio,
        model=args.target_model,
        privacy=args.target_privacy,
        dp=args.target_dp,
        epsilon=args.target_epsilon,
        delta=args.target_delta,
        save=args.save_model,
    )
    v_pred_y, v_membership, _, v_classifier, _ = train_target_model(
        dataset=v_dataset, **target_config)

    v_per_instance_loss = np.array(log_loss(v_true_y, v_pred_y))
    noise_params = (args.attack_noise_type, args.attack_noise_coverage,
                    args.attack_noise_magnitude)
    # How often random noise raises each record's loss (reference, then target).
    v_counts = loss_increase_counts(v_true_x, v_true_y, v_classifier,
                                    v_per_instance_loss, noise_params)
    counts = loss_increase_counts(true_x, true_y, classifier,
                                  per_instance_loss, noise_params)
    return (true_y, v_true_y, v_membership, v_per_instance_loss, v_counts,
            counts)
Esempio n. 5
0
def proposed_attribute_inference(true_x, true_y, classifier, membership,
                                 features, args):
    """Proposed attribute inference attack.

    For every feature index in ``features``, probes the classifier with the
    feature column set to its "low" and "high" variants and records, per
    variant, the per-instance log loss and the noise-induced loss-increase
    counts. Columns of ``true_x`` are overwritten during probing and
    restored before moving to the next feature.
    """
    print('-' * 10 + 'PROPOSED ATTRIBUTE INFERENCE' + '-' * 10 + '\n')
    low_per_instance_loss_all, high_per_instance_loss_all = [], []
    low_counts_all, high_counts_all = [], []
    true_attribute_value_all = []

    def _probe(feature, value, noise_params):
        # Overwrite the column, score the model, and count loss increases
        # under noise for this attribute variant.
        true_x[:, feature] = value
        input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
            x={'x': true_x}, num_epochs=1, shuffle=False)
        _, variant_op = get_predictions(classifier.predict(input_fn=input_fn))
        variant_op = log_loss(true_y, variant_op.astype('float32'))
        variant_counts = loss_increase_counts(true_x, true_y, classifier,
                                              variant_op, noise_params)
        return variant_op, variant_counts

    for feature in features:
        saved_column = np.copy(true_x[:, feature])
        low_value, high_value, true_attribute_value = get_attribute_variations(
            true_x, feature)
        noise_params = (args.attack_noise_type, args.attack_noise_coverage,
                        args.attack_noise_magnitude)

        low_op, low_counts = _probe(feature, low_value, noise_params)
        high_op, high_counts = _probe(feature, high_value, noise_params)

        true_attribute_value_all.append(true_attribute_value)
        low_per_instance_loss_all.append(low_op)
        high_per_instance_loss_all.append(high_op)
        low_counts_all.append(low_counts)
        high_counts_all.append(high_counts)
        # Put the original attribute values back.
        true_x[:, feature] = saved_column
    return (true_attribute_value_all, low_per_instance_loss_all,
            high_per_instance_loss_all, low_counts_all, high_counts_all)
Esempio n. 6
0
def loss_increase_counts(true_x, true_y, classifier, per_instance_loss, noise_params, max_t=100):
    """Count, per record, how many of ``max_t`` random-noise trials raise its loss.

    Each trial adds noise (drawn per ``noise_params``) to ``true_x``, scores
    the classifier on the noisy inputs, and increments a record's count when
    its noisy per-instance log loss exceeds its clean ``per_instance_loss``.
    Returns an array of counts aligned with the rows of ``true_x``.
    """
    counts = np.zeros(len(true_x))
    for _ in range(max_t):
        perturbed_x = true_x + generate_noise(true_x.shape, true_x.dtype,
                                              noise_params)
        input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
            x={'x': perturbed_x}, num_epochs=1, shuffle=False)
        _, noisy_pred_y = get_predictions(classifier.predict(input_fn=input_fn))
        noisy_loss = np.array(log_loss(true_y, noisy_pred_y))
        # Boolean comparison cast to int is equivalent to np.where(cond, 1, 0).
        counts += (noisy_loss > per_instance_loss).astype(int)
    return counts
Esempio n. 7
0
def run_experiment(args):
    """Train the target model, run Yeom's and the proposed membership
    inference attacks, and pickle all attack outputs under RESULT_PATH.

    input:
        args: parsed command-line namespace carrying the target model
            hyper-parameters (target_*), attack noise settings and run id.
    side effects:
        Creates RESULT_PATH/<train_dataset>/ if missing and writes a
        pickle named after the privacy configuration and run number.
    """
    print('-' * 10 + 'TRAIN TARGET' + '-' * 10 + '\n')
    dataset = load_data('target_data.npz', args)
    v_dataset = load_data('shadow0_data.npz', args)
    train_x, train_y, test_x, test_y = dataset
    true_x = np.vstack((train_x, test_x))
    true_y = np.append(train_y, test_y)

    pred_y, membership, test_classes, classifier, aux = train_target_model(
        args=args,
        dataset=dataset,
        epochs=args.target_epochs,
        batch_size=args.target_batch_size,
        learning_rate=args.target_learning_rate,
        clipping_threshold=args.target_clipping_threshold,
        n_hidden=args.target_n_hidden,
        l2_ratio=args.target_l2_ratio,
        model=args.target_model,
        privacy=args.target_privacy,
        dp=args.target_dp,
        epsilon=args.target_epsilon,
        delta=args.target_delta,
        save=args.save_model)
    train_loss, train_acc, test_loss, test_acc = aux
    per_instance_loss = np.array(log_loss(true_y, pred_y))

    # Yeom's membership inference attack when only train_loss is known
    yeom_mi_outputs_1 = yeom_membership_inference(per_instance_loss, membership, train_loss)
    # Yeom's membership inference attack when both train_loss and test_loss are known - Adversary 2 of Yeom et al.
    yeom_mi_outputs_2 = yeom_membership_inference(per_instance_loss, membership, train_loss, test_loss)

    # Proposed membership inference attacks
    proposed_mi_outputs = proposed_membership_inference(v_dataset, true_x, true_y, classifier, per_instance_loss, args)
    evaluate_proposed_membership_inference(per_instance_loss, membership, proposed_mi_outputs, fpr_threshold=0.01)
    evaluate_proposed_membership_inference(per_instance_loss, membership, proposed_mi_outputs, fpr_threshold=0.01, per_class_thresh=True)

    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...): os.makedirs(...)` pattern.
    os.makedirs(RESULT_PATH + args.train_dataset, exist_ok=True)

    # The two branches only differed in the middle of the file name; build
    # that part once. Resulting paths are byte-identical to the original.
    if args.target_privacy == 'no_privacy':
        config_tag = str(args.target_l2_ratio)
    else:
        config_tag = args.target_dp + '_' + str(args.target_epsilon)
    result_file = (RESULT_PATH + args.train_dataset + '/' +
                   str(args.target_test_train_ratio) + '_' + args.target_model +
                   '_' + args.target_privacy + '_' + config_tag + '_' +
                   str(args.run) + '.p')
    # FIX: use a context manager -- the original passed an anonymous
    # open(...) to pickle.dump and leaked the file handle.
    with open(result_file, 'wb') as f:
        pickle.dump([aux, membership, per_instance_loss, yeom_mi_outputs_1,
                     yeom_mi_outputs_2, proposed_mi_outputs], f)
Esempio n. 8
0
def run_experiment(args):
    """Train the target model and run Yeom's membership/attribute inference
    and Shokri's shadow-model membership inference, pickling all results.

    input:
        args: parsed command-line namespace carrying the target model
            hyper-parameters (target_*) and run id.
    side effects:
        Creates RESULT_PATH/<train_dataset>/ if missing and writes a
        pickle named after the privacy configuration.
    """
    print('-' * 10 + 'TRAIN TARGET' + '-' * 10 + '\n')
    dataset = load_data('target_data.npz', args)
    train_x, train_y, test_x, test_y = dataset
    true_x = np.vstack((train_x, test_x))
    true_y = np.append(train_y, test_y)

    pred_y, membership, test_classes, classifier, aux = train_target_model(
        args=args,
        dataset=dataset,
        epochs=args.target_epochs,
        batch_size=args.target_batch_size,
        learning_rate=args.target_learning_rate,
        clipping_threshold=args.target_clipping_threshold,
        n_hidden=args.target_n_hidden,
        l2_ratio=args.target_l2_ratio,
        model=args.target_model,
        privacy=args.target_privacy,
        dp=args.target_dp,
        epsilon=args.target_epsilon,
        delta=args.target_delta,
        save=args.save_model)
    train_loss, train_acc, test_loss, test_acc = aux
    per_instance_loss = np.array(log_loss(true_y, pred_y))

    # Attack a random subset of 5 feature columns.
    features = get_random_features(true_x, range(true_x.shape[1]), 5)
    print(features)

    # Yeom's membership inference attack when only train_loss is known
    pred_membership = yeom_membership_inference(per_instance_loss, membership,
                                                train_loss)
    fpr, tpr, thresholds = roc_curve(membership, pred_membership, pos_label=1)
    # Membership advantage = TPR - FPR at the attack's operating point.
    yeom_mem_adv = tpr[1] - fpr[1]

    # Shokri's membership inference attack based on shadow model training
    shokri_mem_adv, shokri_mem_confidence = shokri_membership_inference(
        args, pred_y, membership, test_classes)

    # Yeom's attribute inference attack when train_loss is known - Adversary 4 of Yeom et al.
    pred_membership_all = yeom_attribute_inference(true_x, true_y, classifier,
                                                   membership, features,
                                                   train_loss)
    yeom_attr_adv = []
    for pred_membership in pred_membership_all:
        fpr, tpr, thresholds = roc_curve(membership,
                                         pred_membership,
                                         pos_label=1)
        yeom_attr_adv.append(tpr[1] - fpr[1])

    # exist_ok avoids the check-then-create race of the original pattern.
    os.makedirs(RESULT_PATH + args.train_dataset, exist_ok=True)

    # Identical payload in both branches; build it once.
    results = [
        train_acc, test_acc, train_loss, membership, shokri_mem_adv,
        shokri_mem_confidence, yeom_mem_adv, per_instance_loss,
        yeom_attr_adv, pred_membership_all, features
    ]
    if args.target_privacy == 'no_privacy':
        # FIX: original read args.l2_ratio, but the attribute is named
        # target_l2_ratio everywhere else in this file; args.l2_ratio
        # would raise AttributeError.
        result_file = (RESULT_PATH + args.train_dataset + '/' +
                       args.target_model + '_' + 'no_privacy_' +
                       str(args.target_l2_ratio) + '.p')
    else:
        result_file = (RESULT_PATH + args.train_dataset + '/' +
                       args.target_model + '_' + args.target_privacy + '_' +
                       args.target_dp + '_' + str(args.target_epsilon) + '_' +
                       str(args.run) + '.p')
    # FIX: context manager instead of an anonymous, never-closed open(...).
    with open(result_file, 'wb') as f:
        pickle.dump(results, f)
############################
# test log loss
# Estimate generalization log loss with 4-fold CV before the final fit.
# NOTE(review): cross_validation.KFold is the legacy (<0.18) scikit-learn
# API; `ntrain`, `X`, `y`, `clf`, `utilities` and `ftest` are defined
# earlier in the file (not visible in this chunk).
print('computing log loss')
kf = cross_validation.KFold(ntrain, n_folds=4)

_logloss = 0.0
for trainIndex, testIndex in kf:
    print("TRAIN:", trainIndex, "TEST:", testIndex)
    X_train, X_test = X[trainIndex], X[testIndex]
    y_train, y_test = y[trainIndex], y[testIndex]

    clf.fit(X_train, y_train)
    pred = clf.predict_proba(X_test)

    # Accumulate per-fold loss; averaged over the folds after the loop.
    _logloss += utilities.log_loss(pred, y_test)

print('log loss = ', _logloss/len(kf))
############################

# Refit on the full training data for final predictions.
clf.fit(X, y)

print('training completed')

# Free the training matrices before loading the test set.
del X
del y

# Dimensions for the test set (original comment said "train set", but these
# values size the test data read below).
ntest = 10873
nfeature = 16 ** 2 + 1  # For two_byte_codes, no_que_marks
test, Ids = utilities.read_test(ntest, nfeature, ftest)