Example 1
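
import numpy as np
import torch

# DANN, embedding(), balance(), the various *_PATH constants, and the ARCH,
# EMB_DIM, NONLINEAR, VERBOSE, IS_BALANCED and args globals are assumed to be
# defined elsewhere in the surrounding script.
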
def train_atk_classifier(key, size=1900):
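    """Train a keyword ("attack") classifier for `key` from external source data.

    Two class-labelled sentence files are embedded to form the source-domain
    training set; when NONLINEAR is set, a DANN model is fit on it while
    adapting towards unlabelled training-domain embeddings, and is validated
    on the target set labelled by keyword presence.
    """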
    clf, pca = None, None
    X_train, Y_train = [], []

    # build the source-domain training set: one sentence file per class label
    for i in [0, 1]:
        with open(PATH.format(key, i), 'r') as f:
            sents = [x.rstrip('\n') for x in f if x.rstrip('\n') != '']
        embs = embedding(sents, EMB_PATH.format(key, i), ARCH)
        if args.prefix != 'part':
            # subsample a fixed number of embeddings per class
            embs = embs[np.random.choice(len(embs), size, replace=False), :]
        X_train.append(embs)
        Y_train.extend([i] * embs.shape[0])
    X_train = np.concatenate(X_train, axis=0)
    Y_train = np.array(Y_train)
    # unlabelled embeddings of the training domain, used as the adaptation target
    train_embs = np.load(TRAIN_EMB_PATH)

    # Bottleneck alternative
    # X_train = np.load(TRAIN_EMB_PATH)
    # raw_train = list(open(TRAIN_PATH, 'r'))
    # if IS_BALANCED:
    #     raw_train, X_train = balance(key, raw_train, X_train)
    # Y_train = np.array([(key in x) for x in raw_train])
    # load validation set

    with open(TARGET_PATH, 'r') as f:
        raw_valid = list(f)
    X_valid = np.load(TARGET_EMB_PATH)
    if key != 'potato' and IS_BALANCED:
        raw_valid, X_valid = balance(key, raw_valid, X_valid)
    print("Validation set size: {}".format(len(raw_valid)))
    # label each validation sentence by whether it contains the keyword
    Y_valid = np.array([(key in x) for x in raw_valid])
    acc = -1
    # learn a transferable classifier

    # Alternative classifiers:
    # clf = linear_model.SGDClassifier(max_iter=1000, verbose=0)
    # clf = SVC(kernel='rbf', gamma='scale', verbose=False)
    # clf = KNeighborsClassifier(n_neighbors=1, p=1)
    if NONLINEAR:
        # domain-adversarial training (DANN): fit on the labelled external data
        # while adapting towards the unlabelled training-domain embeddings (X_adapt)
        # clf = DANN(input_size=EMB_DIM, maxiter=2000, verbose=False, name=key, batch_size=128)
        clf = DANN(input_size=EMB_DIM,
                   maxiter=4000,
                   verbose=True,
                   name=key,
                   batch_size=64,
                   lambda_adapt=1.0,
                   hidden_layer_size=25)
        acc = clf.fit(X_train,
                      Y_train,
                      X_adapt=train_embs,
                      X_valid=X_valid,
                      Y_valid=Y_valid)
        print("DANN Acc.: {:.4f}".format(acc))
    # train_embs = train_embs[np.random.choice(len(train_embs), 2000), :]

    # # apply pca first
    # if DO_PCA:
    #     train_embs = train_embs[np.random.choice(len(train_embs), size=6 * int(len(X_train)), replace=False)]
    #     package = np.concatenate([X_train, train_embs], axis=0)
    #     pca = PCA(n_components=INPUT_DIM)
    #     pca.fit(package)
    #     X_train, train_embs = pca.transform(X_train), pca.transform(train_embs)

    # if NONLINEAR:
    #     clf = NonLinearClassifier(key, ARCH, cls_num=2, pca=pca, use_pca=DO_PCA)

    # clf.fit(X_train, Y_train)
    if NONLINEAR:
        # move the trained model back to CPU for downstream use
        clf.to(torch.device('cpu'))
    # report accuracy on the external source-domain training set
    if VERBOSE and clf is not None:
        print("TRAIN INFERENCE MODEL FROM EXTERNAL SOURCES (# = {})".format(
            len(X_train)))
        correct = np.sum(clf.predict(X_train) == Y_train)
        print("Source Domain Acc.: {:.4f}".format(correct / len(Y_train)))
    return clf, pca, acc
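

# Minimal usage sketch (illustrative assumption, not part of the original
# script): the keyword 'location' is only a placeholder.
if __name__ == '__main__':
    clf, pca, dann_acc = train_atk_classifier('location', size=1900)
    print("DANN validation accuracy: {:.4f}".format(dann_acc))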