def main():
    """Compare a standard NN against a DANN on the inter-twinning moons problem.

    Trains both models on the same source (X, y) / target (Xt) split and
    renders a 2x4 grid of plots: label classification, representation PCA,
    domain classification, and hidden neurons for each model.
    """
    X, y, Xt, yt = make_trans_moons(35, nb=150)
    stacked = np.vstack((X, Xt))
    # Pick the dataset points nearest to four hand-chosen anchor locations;
    # these are highlighted/annotated in the plots.
    anchors = np.array([[-2.4, -1.6], [-1.2, 0.4], [.8, -.5], [2.5, 1.5]])
    marked = stacked[np.argmin(cdist(anchors, stacked), axis=1), :]

    # --- Standard NN: adversarial representation learning disabled ---
    net = DANN(hidden_layer_size=15, maxiter=500, lambda_adapt=6., seed=42,
               adversarial_representation=False)
    net.fit(X, y, Xt)

    pyplot.subplot(2, 4, 1)
    pyplot.title("NN: Label classification")
    draw_trans_data(X, y, Xt, net.predict, special_points=marked,
                    special_xytext=[(40, -15), (-30, -80), (-50, 40), (-70, 0)])
    pyplot.subplot(2, 4, 2)
    pyplot.title("NN: Representation PCA")
    run_pca(X, y, Xt, net, special_points=marked, mult=[-1, -1])
    pyplot.subplot(2, 4, 3)
    pyplot.title("NN: Domain classification")
    draw_trans_data(X, y, Xt, net.predict_domain, colormap_index=1)
    pyplot.subplot(2, 4, 4)
    pyplot.title("NN: Hidden neurons")
    draw_trans_data(X, y, Xt, neurons_to_draw=(net.W, net.b))

    # --- DANN: adversarial representation learning enabled (default) ---
    net = DANN(hidden_layer_size=15, maxiter=500, lambda_adapt=6., seed=42)
    net.fit(X, y, Xt)

    pyplot.subplot(2, 4, 5)
    pyplot.title("DANN: Label classification")
    draw_trans_data(X, y, Xt, net.predict, special_points=marked,
                    special_xytext=[(50, -15), (-20, -90), (-50, 40), (-80, 0)])
    pyplot.subplot(2, 4, 6)
    pyplot.title("DANN: Representation PCA")
    run_pca(X, y, Xt, net, special_points=marked, mult=[-1, 1],
            special_xytext=[(-10, -80), (50, -60), (-40, 50), (-20, 70)])
    pyplot.subplot(2, 4, 7)
    pyplot.title("DANN: Domain classification")
    draw_trans_data(X, y, Xt, net.predict_domain, colormap_index=1)
    pyplot.subplot(2, 4, 8)
    pyplot.title("DANN: Hidden neurons")
    draw_trans_data(X, y, Xt, neurons_to_draw=(net.W, net.b))

    pyplot.show()
def main():
    """Fit a DANN on a tiny hand-crafted dataset and evaluate it in place."""
    # Two positive samples (one-hot [0, 1]) and their mirrored negatives ([1, 0]).
    features = [[1, 2], [3, 4], [-1, -2], [-3, -4]]
    labels = [[0, 1], [0, 1], [1, 0], [1, 0]]

    model = DANN()
    model.fit(features, labels)
    model.evaluate(features, labels)
def main():
    """Train a DANN (or plain NN) on an Amazon reviews domain-adaptation task.

    Loads the source/target data, holds out 10% of the source set for
    validation, fits the model once, reports train/valid/test risks, and
    computes the Proxy A-Distance (PAD) on both the learned representation
    and the original features.
    """
    data_folder = './data/'      # where the datasets are
    source_name = 'dvd'          # source domain: books, dvd, kitchen, or electronics
    target_name = 'electronics'  # target domain: books, dvd, kitchen, or electronics
    adversarial = False          # set to False to learn a standard NN
    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001
    maxiter = 200

    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name, target_name,
                                              data_folder, verbose=True)

    # Hold out the last 10% of the source data for validation.
    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size,
                learning_rate=learning_rate, maxiter=maxiter,
                epsilon_init=None, seed=12342,
                adversarial_representation=adversarial, verbose=True)
    # BUG FIX: the original wrapped this call in two nested
    # `for ... in range(100000)` loops, re-fitting the identical model
    # 10^10 times. A single fit is the intended behavior.
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)
    print('Training Risk = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk = %f' % np.mean(prediction_test != ytest))

    print('==================================================================')
    print('Computing PAD on DANN representation...')
    pad_dann = compute_proxy_distance(algo.hidden_representation(xs),
                                      algo.hidden_representation(xt),
                                      verbose=True)
    print('PAD on DANN representation = %f' % pad_dann)

    print('==================================================================')
    print('Computing PAD on original data...')
    pad_original = compute_proxy_distance(xs, xt, verbose=True)
    print('PAD on original data = %f' % pad_original)
def main():
    """Build a DANN model inside a TensorFlow session and run DA training."""
    # Soft placement lets TF fall back to another device when an op has no
    # kernel for the requested one.
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        model = DANN(sess)
        model.build_model()
        # Training mode: you can choose source, target or dann.
        print('\nDomain adaptation training')
        source_acc, target_acc, d_acc, dann_emb, _ = \
            model.train_and_evaluate('dann')
        print('Source (MNIST) accuracy:', source_acc)
        print('Target (MNIST-M) accuracy:', target_acc)
        print('Domain accuracy:', d_acc)
def use_DANN(key):
    """Train a DANN attacker for ``key`` and return its validation accuracy.

    Builds a class-balanced training set from the two per-label corpora,
    loads target-domain sentences and embeddings for validation, and fits a
    DANN that adapts toward the validation embeddings.

    Parameters
    ----------
    key : str
        Keyword whose presence in a raw sentence defines the positive class.

    Returns
    -------
    Accuracy value returned by ``DANN.fit`` on the validation set.
    """
    X_train, Y_train = [], []
    # One corpus file per label: i == 0 -> negative, i == 1 -> positive.
    for i in [0, 1]:
        # BUG FIX: the handle was opened/closed manually and leaked if
        # embedding() raised; the context manager closes it deterministically.
        with open(DS_PATH.format(key, i) + '.txt', 'r') as f:
            sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, DS_EMB_PATH.format(key, i), ARCH, key)
        # Subsample exactly 110 embeddings per class to keep classes balanced.
        embs = embs[np.random.choice(len(embs), 110, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)

    # Validation data: raw target sentences + their precomputed embeddings.
    # (BUG FIX: `list(open(...))` leaked the file handle.)
    with open(TARGET_PATH, 'r') as fv:
        raw_valid = list(fv)
    X_valid = np.load(TARGET_EMB_PATH + '.' + ARCH + '.npy')
    X_valid_b = X_valid
    if (IS_BALANCED):
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    # Label is True iff the keyword appears in the raw sentence.
    Y_valid = np.array([(key in x) for x in raw_valid])

    clf = DANN(input_size=EMB_DIM_TABLE[ARCH], maxiter=DANN_MAXITER,
               verbose=False, name=key, batch_size=DANN_BATCH_SIZE,
               lambda_adapt=DANN_LAMBDA, hidden_layer_size=DANN_HIDDEN)
    # How to chose X_adapt? X_valid(after/before balanced),
    acc = clf.fit(X_train, Y_train, X_adapt=X_valid,
                  X_valid=X_valid, Y_valid=Y_valid)
    return acc
def DANNA(key, size=2000):
    """Train a DANN attacker for ``key`` with up to ``size`` samples per class.

    Builds the training set from GPT-generated per-label corpora, adapts
    toward the training-domain embeddings, and validates on target-domain
    sentences labeled by keyword presence.

    Parameters
    ----------
    key : str
        Keyword whose presence in a raw sentence defines the positive class.
    size : int
        Upper bound on the number of embeddings sampled per class.

    Returns
    -------
    Accuracy value returned by ``DANN.fit`` on the validation set.
    """
    X, Y = [], []
    for i in [0, 1]:  # while my training data is from gpt
        # BUG FIX: the original never closed this file handle; the context
        # manager releases it deterministically.
        with open(DS_PATH.format(key, i) + '.txt', 'r') as f:
            sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, DS_EMB_PATH.format(key, i), ARCH)
        # Cap each class at `size` samples, drawn without replacement.
        embs = embs[np.random.choice(len(embs), min(size, len(embs)),
                                     replace=False), :]
        X.append(embs)
        Y.extend([i] * embs.shape[0])
    X = np.concatenate(X, axis=0)
    Y = np.array(Y)

    # Unlabeled embeddings from the adaptation (training) domain.
    train_embs = np.load(TRAIN_EMB_PATH + '.' + ARCH + '.npy')

    # Load validation set (let us load gpt2).
    # (BUG FIX: `list(open(...))` leaked the file handle.)
    with open(TARGET_PATH, 'r') as f:
        raw_valid = list(f)
    X_valid = np.load(TARGET_EMB_PATH + '.' + ARCH + '.npy')
    # The potato case is somehow necessary, because it is the case where all
    # the answers should be negative (so the set is not balanced).
    if (key != 'potato'):
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    print(len(raw_valid))
    Y_valid = np.array([(key in x) for x in raw_valid])

    # Learn a transfer.
    clf = DANN(input_size=EMB_DIM_TABLE[ARCH], maxiter=4000, verbose=VERBOSE,
               name=key, batch_size=BATCH_SIZE, lambda_adapt=LAMDA,
               hidden_layer_size=HIDDEN)
    acc = clf.fit(X, Y, X_adapt=train_embs, X_valid=X_valid, Y_valid=Y_valid)
    return acc
# NOTE(review): truncated fragment — the enclosing `def` and the start of the
# first load_representations(...) call are missing from this chunk; the first
# line below is the tail of that call's argument list. Recover the original
# header before using this code.
dataset_name, noise=0.5, suffix='t')
xtest, ytest = load_representations(context_folder, dataset_name,
                                    noise=0.5, suffix='test')
# Map labels from {-1, +1} to {0, 1}.
ys = (ys + 1) / 2
ytest = (ytest + 1) / 2
# Hold out the last 10% of the source data for validation.
nb_valid = int(0.1 * len(ys))
xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]
print("Fit...")
# Hyperparameters (lambda_adapt, hidden_layer_size, ...) come from the
# missing part of this function — presumably set above; verify.
algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size,
            learning_rate=learning_rate, maxiter=maxiter,
            epsilon_init=None, seed=12342)
algo.fit(xs, ys, xt, xv, yv)
print("Predict...")
prediction_train = algo.predict(xs)
prediction_valid = algo.predict(xv)
prediction_test = algo.predict(xtest)
# Risk = misclassification rate on each split.
print('Training Risk = %f' % np.mean(prediction_train != ys))
print('Validation Risk = %f' % np.mean(prediction_valid != yv))
print('Test Risk = %f' % np.mean(prediction_test != ytest))
def train_atk_classifier(key, size=1900):
    """Train an attack (inference) classifier for ``key``.

    Builds a training set from the two per-label corpora, optionally trains a
    DANN (when ``NONLINEAR`` is set) adapting toward ``train_embs``, and
    reports source-domain accuracy when ``VERBOSE``.

    Parameters
    ----------
    key : str
        Keyword whose presence in a raw sentence defines the positive class.
    size : int
        Number of embeddings sampled per class (skipped for 'part' prefix).

    Returns
    -------
    tuple
        (clf, pca, acc) — the fitted classifier, the (unused, always None)
        PCA object, and the DANN validation accuracy (-1 if DANN not run).
    """
    pca = None
    X_train, Y_train = [], []
    for i in [0, 1]:
        # BUG FIX: the original never closed this file handle; the context
        # manager releases it deterministically.
        with open(PATH.format(key, i), 'r') as f:
            sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, EMB_PATH.format(key, i), ARCH)
        if args.prefix != 'part':
            embs = embs[np.random.choice(len(embs), size, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)

    # Unlabeled embeddings from the adaptation (training) domain.
    train_embs = np.load(TRAIN_EMB_PATH)

    # BottleNeck (alternative: train directly on the target-domain dump)
    # X_train = np.load(TRAIN_EMB_PATH)
    # raw_train = list(open(TRAIN_PATH, 'r'))
    # if IS_BALANCED:
    #     raw_train, X_train = balance(key, raw_train, X_train)
    # Y_train = np.array([(key in x) for x in raw_train])

    # Load validation set.
    # (BUG FIX: `list(open(...))` leaked the file handle.)
    with open(TARGET_PATH, 'r') as f:
        raw_valid = list(f)
    X_valid = np.load(TARGET_EMB_PATH)
    # 'potato' is the all-negative control key, so it is never balanced.
    if (key != 'potato' and IS_BALANCED):
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    print(len(raw_valid))
    Y_valid = np.array([(key in x) for x in raw_valid])

    acc = -1
    # Learn a transfer. Alternative (linear) classifiers kept for reference:
    # clf = linear_model.SGDClassifier(max_iter = 1000, verbose = 0)
    # clf = SVC(kernel = 'rbf', gamma = 'scale', verbose = False)
    # clf = KNeighborsClassifier(n_neighbors=1, p = 1)
    if (NONLINEAR):
        # clf = DANN(input_size = EMB_DIM, maxiter = 2000, verbose = False, name = key, batch_size = 128)
        clf = DANN(input_size=EMB_DIM, maxiter=4000, verbose=True, name=key,
                   batch_size=64, lambda_adapt=1.0, hidden_layer_size=25)
        acc = clf.fit(X_train, Y_train, X_adapt=train_embs,
                      X_valid=X_valid, Y_valid=Y_valid)
        print("DANN Acc.: {:.4f}".format(acc))

    if NONLINEAR:
        # Move the model off the GPU before returning it to the caller.
        clf.to(torch.device('cpu'))

    # On current set.
    if (VERBOSE):
        # NOTE(review): `clf` is only bound when NONLINEAR is true — with
        # VERBOSE set and NONLINEAR unset this raises NameError; confirm
        # the intended flag combination.
        print("TRAIN INFERENCE MODEL FROM EXTERNAL SOURCES (# = {})".format(
            len(X_train)))
        correct = np.sum((clf.predict(X_train) == Y_train))
        print("Source Domain Acc.: {:.4f}".format(correct / len(Y_train)))
    return clf, pca, acc
def ATTACK(key, use_dp=False, defense=None, verbose=VERBOSE, size=2000):
    """Run a keyword-inference attack for ``key`` against target embeddings.

    Trains an attack classifier (selected by the module-level ``CLS``:
    'MLP', 'SVM', 'DANN' or 'MLP_SHADOW') on an external corpus and
    evaluates it on target-domain embeddings, optionally re-evaluating on
    ``defense``-protected embeddings when ``use_dp`` is set.

    Returns
    -------
    tuple
        (acc, protected_acc, util_acc, protected_util_acc).
        NOTE(review): util_acc / protected_util_acc computed in the
        'medical' branch are overwritten by the later
        ``util_acc, protected_util_acc = 0.0, 0.0`` reset, so the returned
        utility accuracies are always 0.0 — confirm intended.
    """
    # (X, Y) is from external corpus.
    # X are sentence embeddings. Y are labels.
    # To prepare an external corpus, we substitute the food keywords in Yelp
    # dataset to body keywords.
    ## GET THE TRAINING DATA, NO NEED TO DEFEND
    X, Y = [], []
    mean_direction = []
    for i in [0, 1]:  # while my training data is from gpt
        f = open(DS_PATH.format(key, i) + '.txt', 'r')
        sents = [x[:-1] for x in f if x[:-1] != '']
        print(DS_EMB_PATH.format(key, i))
        embs = embedding(sents, DS_EMB_PATH.format(key, i), ARCH)
        # Cap each class at `size` samples, drawn without replacement.
        embs = embs[np.random.
                    choice(len(embs), min(size, len(embs)), replace=False), :]
        # Per-class mean embedding, used below to estimate a transfer direction.
        mean_direction.append(np.mean(embs, axis=0))
        X.append(embs)
        Y.extend([i] * embs.shape[0])
    X = np.concatenate(X, axis=0)
    Y = np.array(Y)
    # Direction from the negative-class centroid to the positive-class one.
    trans_D = mean_direction[1] - mean_direction[0]
    # print(trans_D)

    # (Target_sents, Target_X) is from target domain.
    # Target_X are sentence embeddings. Target_sents are original sentences.
    f = open(TRAIN_PATH, 'r')
    Target_sents = [x[:-1] for x in f if x[:-1] != '']
    f.close()
    # trunk DEFEND
    Target_X = embedding(Target_sents, TRAIN_EMB_PATH, ARCH)
    rand_idx = np.random.permutation(Target_X.shape[0])
    shuffled_target_X = Target_X[rand_idx, :]
    if (not NO_BALANCE):
        Target_sents, Target_X = balance(key, Target_sents, Target_X)
    else:
        # Shuffle sentences and embeddings with the same permutation.
        Target_X = Target_X[rand_idx, :]
        Target_sents = [Target_sents[i] for i in rand_idx]
        # Keep only the first 1000 target samples.
        Target_X = Target_X[:1000, :]
        Target_sents = Target_sents[:1000]
    # print(Target_sents[0])
    Target_Y = np.array([int(key in x) for x in Target_sents])
    sents = [x.split('\t') for x in Target_sents if x[:-1] != '']
    # print(sents)
    # print(sents[0])
    if (DATASET == 'medical'):
        # Second tab-separated field is the utility label — assumes the
        # medical corpus is "text\tlabel"; TODO confirm.
        target_util_y = np.array([int(s[1]) for s in sents])
    else:
        target_util_y = np.array([0 for s in sents])
    # print("Balanced: {}".format(np.mean(Target_Y)))
    # Now the target Y here is the sensitive label.
    if (use_dp):
        protected_target_X = defense(Target_X, Target_Y)
        # print(Target_X[0, :])
        # print(torch.sum(protected_target_X[0, :]))

    # (X_valid, Y_valid) is from valid set.
    # SVM: This is regarded as shadow corpus of Target domain.
    # DANN or MLP: This is used to early stop.
    # X_valid are sentence embeddings. Y_valid are labels.
    raw_valid, X_valid = list(open(
        TARGET_PATH, 'r')), np.load(TARGET_EMB_PATH + '.' + ARCH + '.npy')
    if (not NO_BALANCE):
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    Y_valid = np.array([int(key in x) for x in raw_valid])

    # Load the utility model.
    if (DATASET == 'medical'):
        util_clf = NonLinearClassifier(key, EMB_DIM_TABLE[ARCH], HIDDEN_DIM,
                                       cls_num=10)
        # util_clf.load_state_dict(torch.load(UTIL_MODEL_PATH + "medical_functional_{}.cpt".format(ARCH)))
        util_clf.cuda()
        preds = util_clf.predict(Target_X)
        util_acc = np.mean(preds == target_util_y)
        # print("Util Acc. {:.4f}".format(acc))
        if (use_dp):
            protected_target_X = torch.FloatTensor(protected_target_X)
            preds = util_clf.predict(protected_target_X)
            protected_util_acc = np.mean(preds == target_util_y)

    if (VERBOSE):
        print("TRAINING SET SIZE: {}".format(len(Y)))
        print("EMBEDDINGS FROM TARGET DOMAIN: {}".format(len(Target_Y)))
        print("TEST SET SIZE: {}".format(len(Y_valid)))

    # Learn a transfer.
    print("TESTING MODEL: {}".format(CLS))
    acc, protected_acc = 0.0, 0.0
    # NOTE(review): this reset clobbers the utility accuracies computed in
    # the 'medical' branch above.
    util_acc, protected_util_acc = 0.0, 0.0
    if CLS == 'MLP':
        print("Histogram of the Target Y: {}".format(np.histogram(Target_Y)))
        clf = NonLinearClassifier(key, EMB_DIM_TABLE[ARCH], HIDDEN_DIM)
        clf.cuda()
        clf.fit(X, Y)
        # Assume the existence of the model.
        acc = clf._evaluate(Target_X, Target_Y)
        if (use_dp):
            protected_target_X = torch.FloatTensor(protected_target_X)
            protected_acc = clf._evaluate(protected_target_X, Target_Y)
    elif CLS == 'SVM':
        # for discussion
        REVERSE = False
        # shadow
        clf = SVC(kernel='{}'.format(SVM_KERNEL), gamma='scale',
                  verbose=VERBOSE, max_iter=5000)
        # print(X_valid)
        # print(Y_valid)
        if (REVERSE):
            clf.fit(Target_X, Target_Y)
        else:
            start = time.time()
            clf.fit(X_valid, Y_valid)
            end = time.time()
            print("Time: {}".format(end - start))
        # if(defense):
        # the common approach
        if (REVERSE):
            preds = clf.predict(X_valid)
            acc = np.mean(preds == Y_valid)
        else:
            preds = clf.predict(Target_X)
            acc = np.mean(preds == Target_Y)
            # print(acc)
        if (use_dp):
            preds = clf.predict(protected_target_X)
            protected_acc = np.mean(preds == Target_Y)
    elif CLS == 'DANN':
        # I have no idea whether the 1000 is.
        DANN_CPT_PATHs = DANN_CPT_PATH + "{}_cracker_{}.cpt".format(key, ARCH)
        clf = DANN(input_size=EMB_DIM_TABLE[ARCH], maxiter=MAXITER,
                   verbose=VERBOSE, name=key, batch_size=BATCH_SIZE,
                   lambda_adapt=LAMDA, hidden_layer_size=HIDDEN,
                   cached=DANN_CACHED, cpt_path=DANN_CPT_PATHs)
        # clf.cuda()
        # Set the size of the Target_X: shift target embeddings by half the
        # inter-centroid direction in both senses and adapt toward the union.
        trans_D = 0.5 * trans_D
        concated_Target_X = np.concatenate(
            [Target_X - trans_D, Target_X + trans_D], axis=0)
        clf.fit(X, Y, X_adapt=concated_Target_X,
                X_valid=Target_X - trans_D, Y_valid=Target_Y)
        Target_X = torch.FloatTensor(Target_X - trans_D)
        acc = clf.validate(Target_X, Target_Y)
        # print(acc)
        if (use_dp):
            protected_target_X = torch.FloatTensor(protected_target_X).cuda()
            protected_acc = clf.validate(protected_target_X, Target_Y)
        # print("Target Domain Inference {} Acc: {:.3f}".format(key, acc))
        # return acc
    elif CLS == 'MLP_SHADOW':
        clf = NonLinearClassifier(key, EMB_DIM_TABLE[ARCH], HIDDEN_DIM)
        clf.cuda()
        clf.fit(X_valid, Y_valid)
        acc = clf._evaluate(Target_X, Target_Y)
    else:
        clf = None
        print('wrong cls\' name')
    return acc, protected_acc, util_acc, protected_util_acc
def main():
    """Amazon domain adaptation with optional mSDA representations.

    Loads source/target data, recomputes mSDA representations, trains a
    DANN (or plain NN), then reports per-class prediction-label ratios,
    train/valid/test risks, and per-class accuracies.
    """
    data_folder = './data/'  # where the datasets are
    source_name = 'books'    # source domain: books, dvd, kitchen, or electronics
    target_name = 'data'     # target domain: books, dvd, kitchen, or electronics
    adversarial = True       # set to False to learn a standard NN
    msda = True              # use mSDA representations instead of raw features
    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001 if not msda else 0.0001
    maxiter = 100

    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name, target_name,
                                              data_folder, verbose=True)

    if msda:
        xs_path, xt_path, xtest_path = [
            '%s/%s.%s_%s_msda.npy' % (data_folder, source_name, target_name, E)
            for E in ('source', 'target', 'test')
        ]
        # Disk caching of the representations is intentionally disabled:
        # try:
        #     xs_msda = np.load(xs_path)
        #     xt_msda = np.load(xt_path)
        #     xtest_msda = np.load(xtest_path)
        #     print('mSDA representations loaded from disk')
        # except:
        print('Computing mSDA representations...')
        xs_msda, xt_msda, xtest_msda = compute_msda_representation(
            xs, xt, xtest)

        # CONSISTENCY FIX: the Python 2 `print expr` statements below were
        # replaced with the print() calls used everywhere else in this file
        # (the py2 form is a syntax error under Python 3).
        ds, ns = np.shape(xs_msda)
        print('shape(xs_msda)')
        print(ds, ns)
        # print(xs_msda)
        dt, nt = np.shape(xt_msda)
        print('shape(xt_msda)')
        print(dt, nt)
        # print(xt_msda)
        dxtest, nxtest = np.shape(xtest_msda)
        print('shape(xtest_msda)')
        print(dxtest, nxtest)
        # np.save(xs_path, xs_msda)
        # np.save(xt_path, xt_msda)
        # np.save(xtest_path, xtest_msda)
        xs, xt, xtest = xs_msda, xt_msda, xtest_msda

    # Hold out the last 10% of the source data for validation.
    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size,
                learning_rate=learning_rate, maxiter=maxiter,
                epsilon_init=None, seed=12342,
                adversarial_representation=adversarial, verbose=True)
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)

    # Per-class predicted-label counts and ratios on the validation set.
    va = count(prediction_valid)
    print('验证集正向的预测标签比率')
    print(va[0], "%.2f%%" % ((float(va[0]) / len(prediction_valid)) * 100))
    print('验证集负向的预测标签比率')
    print(va[1], "%.2f%%" % ((float(va[1]) / len(prediction_valid)) * 100))
    # Same report for the test set.
    te = count(prediction_test)
    print('测试集正向的预测标签比率')
    print(te[0], "%.2f%%" % ((float(te[0]) / len(prediction_test)) * 100))
    print('测试集负向的预测标签比率')
    print(te[1], "%.2f%%" % ((float(te[1]) / len(prediction_test)) * 100))

    print('Training Risk = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk = %f' % np.mean(prediction_test != ytest))

    # Per-class accuracies on validation and test sets.
    print('验证集正向和负向的预测准确率')
    print(compare(yv, prediction_valid))
    print('测试集正向和负向的预测准确率')
    print(compare(ytest, prediction_test))
    print('==================================================================')
def main():
    """Amazon domain adaptation with cached mSDA representations and PAD report.

    Loads source/target data, loads (or computes and caches) mSDA
    representations, trains the model, reports train/valid/test risks, and
    computes the Proxy A-Distance (PAD) on the learned representation and on
    the original features.
    """
    data_folder = './data/'      # where the datasets are
    source_name = 'dvd'          # source domain: books, dvd, kitchen, or electronics
    target_name = 'electronics'  # target domain: books, dvd, kitchen, or electronics
    adversarial = False          # set to False to learn a standard NN
    msda = True                  # use mSDA representations instead of raw features
    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001 if not msda else 0.0001
    maxiter = 200

    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name, target_name,
                                              data_folder, verbose=True)

    if msda:
        xs_path, xt_path, xtest_path = [
            '%s/%s.%s_%s_msda.npy' % (data_folder, source_name, target_name, E)
            for E in ('source', 'target', 'test')
        ]
        try:
            xs_msda = np.load(xs_path)
            xt_msda = np.load(xt_path)
            xtest_msda = np.load(xtest_path)
            print('mSDA representations loaded from disk')
        except (IOError, OSError, ValueError):
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Only a missing or unreadable
            # cache file should trigger recomputation.
            print('Computing mSDA representations...')
            xs_msda, xt_msda, xtest_msda = compute_msda_representation(
                xs, xt, xtest)
            np.save(xs_path, xs_msda)
            np.save(xt_path, xt_msda)
            np.save(xtest_path, xtest_msda)
        xs, xt, xtest = xs_msda, xt_msda, xtest_msda

    # Hold out the last 10% of the source data for validation.
    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size,
                learning_rate=learning_rate, maxiter=maxiter,
                epsilon_init=None, seed=12342,
                adversarial_representation=adversarial, verbose=True)
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)
    print('Training Risk = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk = %f' % np.mean(prediction_test != ytest))

    print('==================================================================')
    print('Computing PAD on DANN representation...')
    pad_dann = compute_proxy_distance(algo.hidden_representation(xs),
                                      algo.hidden_representation(xt),
                                      verbose=True)
    print('PAD on DANN representation = %f' % pad_dann)

    print('==================================================================')
    print('Computing PAD on original data...')
    pad_original = compute_proxy_distance(xs, xt, verbose=True)
    print('PAD on original data = %f' % pad_original)