def main():
    data_folder = './data/'          # where the datasets are
    source_name = 'dvd'              # source domain: books, dvd, kitchen, or electronics
    target_name = 'electronics'      # target domain: books, dvd, kitchen, or electronics
    adversarial = False              # set to False to learn a standard NN
    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001
    maxiter = 200

    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name, target_name, data_folder, verbose=True)

    # Hold out the last 10% of the source set for validation.
    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size,
                learning_rate=learning_rate, maxiter=maxiter, epsilon_init=None,
                seed=12342, adversarial_representation=adversarial, verbose=True)
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)

    print('Training Risk   = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk       = %f' % np.mean(prediction_test != ytest))

    print('==================================================================')
    print('Computing PAD on DANN representation...')
    pad_dann = compute_proxy_distance(algo.hidden_representation(xs),
                                      algo.hidden_representation(xt), verbose=True)
    print('PAD on DANN representation = %f' % pad_dann)

    print('==================================================================')
    print('Computing PAD on original data...')
    pad_original = compute_proxy_distance(xs, xt, verbose=True)
    print('PAD on original data = %f' % pad_original)
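# ------------------------------------------------------------------
# `compute_proxy_distance` is defined elsewhere in the repo. For
# orientation, a minimal sketch of the standard Proxy A-distance
# estimate (Ben-David et al.): train a classifier to separate source
# from target examples and map its test error e to PAD = 2*(1 - 2*e).
# The linear-SVM choice, the 50/50 split, and every name below are
# illustrative assumptions, not the repo's implementation.
# ------------------------------------------------------------------
import numpy as np
from sklearn.svm import LinearSVC

def proxy_a_distance_sketch(source_x, target_x, c=1.0, seed=0):
    x = np.vstack([source_x, target_x])
    y = np.hstack([np.zeros(len(source_x)), np.ones(len(target_x))])  # domain labels

    # Shuffle, then split half/half into domain-classifier train and test sets.
    rng = np.random.RandomState(seed)
    idx = rng.permutation(len(x))
    half = len(x) // 2
    train, test = idx[:half], idx[half:]

    clf = LinearSVC(C=c).fit(x[train], y[train])
    error = np.mean(clf.predict(x[test]) != y[test])
    return 2. * (1. - 2. * error)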
    # NOTE: the beginning of this function (data-folder / hyperparameter
    # setup and the source-domain load into xs, ys) was truncated; the
    # first call below is reconstructed from the matching 'test' call,
    # and the `xt, _` assignment target is an assumption.
    xt, _ = load_representations(context_folder, dataset_name, noise=0.5, suffix='t')
    xtest, ytest = load_representations(context_folder, dataset_name, noise=0.5, suffix='test')

    # Remap labels from {-1, +1} to {0, 1}.
    ys = (ys + 1) / 2
    ytest = (ytest + 1) / 2

    # Hold out the last 10% of the source set for validation.
    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size,
                learning_rate=learning_rate, maxiter=maxiter, epsilon_init=None,
                seed=12342)
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)

    print('Training Risk   = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk       = %f' % np.mean(prediction_test != ytest))
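# ------------------------------------------------------------------
# The `(ys + 1) / 2` remap above converts {-1, +1} labels to {0, 1}.
# A quick check: under Python 3 true division the result is a float
# array, so floor division `// 2` is the safer form if integer labels
# are required downstream.
# ------------------------------------------------------------------
import numpy as np

labels = np.array([-1, 1, 1, -1])
print((labels + 1) / 2)    # [0. 1. 1. 0.]  (float under true division)
print((labels + 1) // 2)   # [0 1 1 0]      (integer labels)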
def train_atk_classifier(key, size=1900):
    pca = None

    # Build the attacker's training set: one file of sentences per class label.
    X_train, Y_train = [], []
    for i in [0, 1]:
        with open(PATH.format(key, i), 'r') as f:
            sents = [x[:-1] for x in f if x[:-1] != '']
        embs = embedding(sents, EMB_PATH.format(key, i), ARCH)
        if args.prefix != 'part':
            embs = embs[np.random.choice(len(embs), size, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)

    # Unlabeled embeddings used as the adaptation (target-domain) set.
    train_embs = np.load(TRAIN_EMB_PATH)
    # BottleNeck
    # X_train = np.load(TRAIN_EMB_PATH)
    # raw_train = list(open(TRAIN_PATH, 'r'))
    # if IS_BALANCED:
    #     raw_train, X_train = balance(key, raw_train, X_train)
    # Y_train = np.array([(key in x) for x in raw_train])

    # Load the validation set; labels are derived from keyword presence.
    with open(TARGET_PATH, 'r') as f:
        raw_valid = list(f)
    X_valid = np.load(TARGET_EMB_PATH)
    if key != 'potato' and IS_BALANCED:
        raw_valid, X_valid = balance(key, raw_valid, X_valid)  # see sketch below
    print(len(raw_valid))
    Y_valid = np.array([(key in x) for x in raw_valid])
    acc = -1

    # Learn a transfer. Alternatives tried previously:
    # clf = linear_model.SGDClassifier(max_iter=1000, verbose=0)
    # clf = SVC(kernel='rbf', gamma='scale', verbose=False)
    # clf = KNeighborsClassifier(n_neighbors=1, p=1)
    if NONLINEAR:  # note: `clf` is only defined on this branch
        # clf = DANN(input_size=EMB_DIM, maxiter=2000, verbose=False, name=key, batch_size=128)
        clf = DANN(input_size=EMB_DIM, maxiter=4000, verbose=True, name=key,
                   batch_size=64, lambda_adapt=1.0, hidden_layer_size=25)
        acc = clf.fit(X_train, Y_train, X_adapt=train_embs,
                      X_valid=X_valid, Y_valid=Y_valid)
        print("DANN Acc.: {:.4f}".format(acc))
    # train_embs = train_embs[np.random.choice(len(train_embs), 2000), :]

    # Apply PCA first (disabled):
    # if DO_PCA:
    #     train_embs = train_embs[np.random.choice(len(train_embs), size=6 * int(len(X_train)), replace=False)]
    #     package = np.concatenate([X_train, train_embs], axis=0)
    #     pca = PCA(n_components=INPUT_DIM)
    #     pca.fit(package)
    #     X_train, train_embs = pca.transform(X_train), pca.transform(train_embs)
    # if NONLINEAR:
    #     clf = NonLinearClassifier(key, ARCH, cls_num=2, pca=pca, use_pca=DO_PCA)
    #     clf.fit(X_train, Y_train)

    if NONLINEAR:
        clf.to(torch.device('cpu'))

    # Sanity check on the source (training) set.
    if VERBOSE:
        print("TRAIN INFERENCE MODEL FROM EXTERNAL SOURCES (# = {})".format(len(X_train)))
        correct = np.sum(clf.predict(X_train) == Y_train)
        print("Source Domain Acc.: {:.4f}".format(correct / len(Y_train)))
    return clf, pca, acc
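# ------------------------------------------------------------------
# The `balance` helper is referenced above but not shown in this file.
# A minimal sketch of what it plausibly does: equalize the positive
# (`key` present) and negative classes by subsampling the majority.
# This is an assumption about its behavior, not the repo's code.
# ------------------------------------------------------------------
import numpy as np

def balance_sketch(key, raw, embs, seed=0):
    """Hypothetical reconstruction: subsample so sentences containing
    `key` and those without occur in equal numbers."""
    labels = np.array([key in sent for sent in raw])
    pos_idx = np.where(labels)[0]
    neg_idx = np.where(~labels)[0]
    n = min(len(pos_idx), len(neg_idx))

    rng = np.random.RandomState(seed)
    keep = np.concatenate([rng.choice(pos_idx, n, replace=False),
                           rng.choice(neg_idx, n, replace=False)])
    rng.shuffle(keep)
    return [raw[i] for i in keep], embs[keep]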
def main():
    data_folder = './data/'          # where the datasets are
    source_name = 'books'            # source domain: books, dvd, kitchen, or electronics
    target_name = 'data'             # target domain: books, dvd, kitchen, or electronics
    adversarial = True               # set to False to learn a standard NN
    msda = True
    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001 if not msda else 0.0001
    maxiter = 100

    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name, target_name, data_folder, verbose=True)

    if msda:
        xs_path, xt_path, xtest_path = [
            '%s/%s.%s_%s_msda.npy' % (data_folder, source_name, target_name, E)
            for E in ('source', 'target', 'test')
        ]
        # try:
        #     xs_msda = np.load(xs_path)
        #     xt_msda = np.load(xt_path)
        #     xtest_msda = np.load(xtest_path)
        #     print('mSDA representations loaded from disk')
        # except:
        print('Computing mSDA representations...')
        xs_msda, xt_msda, xtest_msda = compute_msda_representation(xs, xt, xtest)

        ds, ns = np.shape(xs_msda)
        print('shape(xs_msda)')
        print(ds, ns)
        # print(xs_msda)
        dt, nt = np.shape(xt_msda)
        print('shape(xt_msda)')
        print(dt, nt)
        # print(xt_msda)
        dxtest, nxtest = np.shape(xtest_msda)
        print('shape(xtest_msda)')
        print(dxtest, nxtest)

        # np.save(xs_path, xs_msda)
        # np.save(xt_path, xt_msda)
        # np.save(xtest_path, xtest_msda)
        xs, xt, xtest = xs_msda, xt_msda, xtest_msda

    # Hold out the last 10% of the source set for validation.
    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size,
                learning_rate=learning_rate, maxiter=maxiter, epsilon_init=None,
                seed=12342, adversarial_representation=adversarial, verbose=True)
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)

    # `count` and `compare` are helpers defined elsewhere (see the sketch below).
    va = count(prediction_valid)
    print('Positive prediction rate on the validation set:')
    print(va[0], "%.2f%%" % ((float(va[0]) / len(prediction_valid)) * 100))
    print('Negative prediction rate on the validation set:')
    print(va[1], "%.2f%%" % ((float(va[1]) / len(prediction_valid)) * 100))

    te = count(prediction_test)
    print('Positive prediction rate on the test set:')
    print(te[0], "%.2f%%" % ((float(te[0]) / len(prediction_test)) * 100))
    print('Negative prediction rate on the test set:')
    print(te[1], "%.2f%%" % ((float(te[1]) / len(prediction_test)) * 100))

    print('Training Risk   = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk       = %f' % np.mean(prediction_test != ytest))

    print('Per-class (positive/negative) accuracy on the validation set:')
    print(compare(yv, prediction_valid))
    print('Per-class (positive/negative) accuracy on the test set:')
    print(compare(ytest, prediction_test))
    print('==================================================================')
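# ------------------------------------------------------------------
# `count` and `compare` are called above but not defined in this file.
# A minimal sketch consistent with how their return values are used
# (a pair of per-class prediction counts, and a pair of per-class
# accuracies); this is an assumption about their behavior, not the
# repo's actual code.
# ------------------------------------------------------------------
import numpy as np

def count_sketch(predictions):
    """Assumed: (# predicted positive, # predicted negative)."""
    predictions = np.asarray(predictions)
    return np.sum(predictions == 1), np.sum(predictions != 1)

def compare_sketch(y_true, y_pred):
    """Assumed: (accuracy on true positives, accuracy on true negatives)."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    pos, neg = y_true == 1, y_true != 1
    return np.mean(y_pred[pos] == y_true[pos]), np.mean(y_pred[neg] == y_true[neg])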
def main():
    data_folder = './data/'          # where the datasets are
    source_name = 'dvd'              # source domain: books, dvd, kitchen, or electronics
    target_name = 'electronics'      # target domain: books, dvd, kitchen, or electronics
    adversarial = False              # set to False to learn a standard NN
    msda = True
    hidden_layer_size = 50
    lambda_adapt = 0.1 if adversarial else 0.
    learning_rate = 0.001 if not msda else 0.0001
    maxiter = 200

    print("Loading data...")
    xs, ys, xt, _, xtest, ytest = load_amazon(source_name, target_name, data_folder, verbose=True)

    if msda:
        xs_path, xt_path, xtest_path = [
            '%s/%s.%s_%s_msda.npy' % (data_folder, source_name, target_name, E)
            for E in ('source', 'target', 'test')
        ]
        try:
            xs_msda = np.load(xs_path)
            xt_msda = np.load(xt_path)
            xtest_msda = np.load(xtest_path)
            print('mSDA representations loaded from disk')
        except IOError:  # cache miss: compute the representations and save them
            print('Computing mSDA representations...')
            xs_msda, xt_msda, xtest_msda = compute_msda_representation(xs, xt, xtest)
            np.save(xs_path, xs_msda)
            np.save(xt_path, xt_msda)
            np.save(xtest_path, xtest_msda)
        xs, xt, xtest = xs_msda, xt_msda, xtest_msda

    # Hold out the last 10% of the source set for validation.
    nb_valid = int(0.1 * len(ys))
    xv, yv = xs[-nb_valid:, :], ys[-nb_valid:]
    xs, ys = xs[0:-nb_valid, :], ys[0:-nb_valid]

    print("Fit...")
    algo = DANN(lambda_adapt=lambda_adapt, hidden_layer_size=hidden_layer_size,
                learning_rate=learning_rate, maxiter=maxiter, epsilon_init=None,
                seed=12342, adversarial_representation=adversarial, verbose=True)
    algo.fit(xs, ys, xt, xv, yv)

    print("Predict...")
    prediction_train = algo.predict(xs)
    prediction_valid = algo.predict(xv)
    prediction_test = algo.predict(xtest)

    print('Training Risk   = %f' % np.mean(prediction_train != ys))
    print('Validation Risk = %f' % np.mean(prediction_valid != yv))
    print('Test Risk       = %f' % np.mean(prediction_test != ytest))

    print('==================================================================')
    print('Computing PAD on DANN representation...')
    pad_dann = compute_proxy_distance(algo.hidden_representation(xs),
                                      algo.hidden_representation(xt), verbose=True)
    print('PAD on DANN representation = %f' % pad_dann)

    print('==================================================================')
    print('Computing PAD on original data...')
    pad_original = compute_proxy_distance(xs, xt, verbose=True)
    print('PAD on original data = %f' % pad_original)
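# ------------------------------------------------------------------
# `compute_msda_representation` also lives elsewhere in the repo. As a
# reference for what it builds on, here is a minimal single-layer
# marginalized denoising autoencoder (the mDA building block of mSDA,
# Chen et al. 2012) in closed form. It assumes row-major input
# (n samples x d features) and corruption probability p; the repo's
# actual stacking and any feature-splitting tricks may differ.
# ------------------------------------------------------------------
import numpy as np

def mda_layer_sketch(x, p=0.5, reg=1e-5):
    """One mDA layer: returns (hidden representation, mapping W)."""
    xb = np.hstack([x, np.ones((x.shape[0], 1))]).T   # (d+1, n); bias row last
    d = x.shape[1]

    scatter = xb.dot(xb.T)                            # (d+1, d+1)
    q = np.full(d + 1, 1. - p)                        # per-feature survival probability
    q[-1] = 1.                                        # the bias is never corrupted

    big_q = scatter * np.outer(q, q)                  # E[corrupted corrupted^T], off-diagonal terms
    np.fill_diagonal(big_q, q * np.diag(scatter))     # diagonal keeps a single factor of q_i
    big_p = scatter[:d, :] * q                        # E[clean corrupted^T], shape (d, d+1)

    # Closed-form mapping W = P Q^{-1}, regularized for numerical stability.
    w = np.linalg.solve(big_q.T + reg * np.eye(d + 1), big_p.T).T
    return np.tanh(w.dot(xb)).T, w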