def train_atk_classifier(key, size=1900):
    """Train the DANN attack classifier for ``key`` on external source data.

    For each class label ``i`` in ``{0, 1}`` the sentences stored at
    ``PATH.format(key, i)`` are read, embedded through ``embedding(...)``
    and labelled ``i``.  The resulting source set is used to fit a ``DANN``
    model, with the embeddings from ``TRAIN_EMB_PATH`` passed as
    ``X_adapt`` and the target set (``TARGET_PATH`` / ``TARGET_EMB_PATH``)
    passed as the validation data.

    Args:
        key: String used to format ``PATH`` / ``EMB_PATH``, passed to
            ``DANN`` as ``name``, and used to label target lines (a line is
            positive when ``key`` is a substring of it).
        size: Number of embedding rows sampled per class, without
            replacement, when ``args.prefix != 'part'``.

    Returns:
        A tuple ``(clf, pca, acc)``: the fitted classifier after
        ``clf.to(torch.device('cpu'))``, ``pca`` (always ``None`` because
        the PCA step is disabled), and the value returned by ``clf.fit``.

    Raises:
        NotImplementedError: If ``NONLINEAR`` is false.  No classifier is
            built on that path, which previously surfaced as an
            ``UnboundLocalError`` on ``clf``.
        ValueError: Raised by ``np.random.choice`` when ``size`` exceeds the
            number of available embeddings for a class.
    """
    # The PCA pre-processing step is disabled; the slot is kept in the
    # returned tuple.
    pca = None
    X_train, Y_train = [], []

    for i in [0, 1]:
        # Close the sentence file deterministically instead of leaking the
        # handle.  rstrip('\n') only removes the line terminator, so a final
        # line without a trailing newline keeps its last character.
        with open(PATH.format(key, i), 'r') as f:
            sents = [s for s in (line.rstrip('\n') for line in f) if s != '']
        embs = embedding(sents, EMB_PATH.format(key, i), ARCH)
        if args.prefix != 'part':
            # Random subsample of `size` rows per class.
            picked = np.random.choice(len(embs), size, replace=False)
            embs = embs[picked, :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])

    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)

    # Embeddings handed to DANN.fit as X_adapt.
    # (An earlier variant trained directly on TRAIN_PATH / TRAIN_EMB_PATH
    # with `key in line` labels; it is no longer wired in.)
    train_embs = np.load(TRAIN_EMB_PATH)

    # Load the validation set.
    with open(TARGET_PATH, 'r') as f:
        raw_valid = list(f)
    X_valid = np.load(TARGET_EMB_PATH)
    # NOTE(review): 'potato' is exempted from balancing; the reason is not
    # visible from this function.
    if key != 'potato' and IS_BALANCED:
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    print(len(raw_valid))
    Y_valid = np.array([(key in x) for x in raw_valid])

    # Only the DANN path constructs a classifier.  Alternatives tried
    # previously (SGDClassifier, RBF SVC, 1-NN, NonLinearClassifier) are not
    # wired in, so fail with a clear message rather than an unbound `clf`.
    if not NONLINEAR:
        raise NotImplementedError(
            "train_atk_classifier only supports NONLINEAR=True (DANN); "
            "no classifier is built otherwise")

    clf = DANN(input_size=EMB_DIM, maxiter=4000, verbose=True, name=key,
               batch_size=64, lambda_adapt=1.0, hidden_layer_size=25)
    # presumably the validation accuracy reported by DANN.fit -- TODO confirm
    acc = clf.fit(X_train, Y_train, X_adapt=train_embs,
                  X_valid=X_valid, Y_valid=Y_valid)
    print("DANN Acc.: {:.4f}".format(acc))

    clf.to(torch.device('cpu'))

    # NOTE(review): the source-domain accuracy report is assumed to be gated
    # on VERBOSE as a whole -- confirm against the original layout.
    if VERBOSE:
        print("TRAIN INFERENCE MODEL FROM EXTERNAL SOURCES (# = {})".format(
            len(X_train)))
        correct = np.sum((clf.predict(X_train) == Y_train))
        print("Source Domain Acc.: {:.4f}".format(correct / len(Y_train)))

    return clf, pca, acc