Example #1
class TestKnnClassifier(unittest.TestCase):

    def setUp(self):
        algorithm = CoverTreeAlgorithm(euclidian_distance)
        self.classifier = KNNClassifier(algorithm, NEIGHBOURS_COUNT)
        self.classifier.register_feature(TestFeature)

    def build_object_description(self, number):
        features = (TestFeature(number),)
        return ObjectDescription(features)

    def check_fixture(self, fixture):
        train_set = fixture['train_set']
        for number, label in train_set:
            self.classifier.train(number, label)

        label = self.classifier.classify(fixture['query'])
        self.assertEqual(fixture['expected_result'], label)

    def test_classifier(self):
        for fixture in FIXTURES:
            self.check_fixture(fixture)
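The test depends on FIXTURES, TestFeature and NEIGHBOURS_COUNT defined elsewhere in the project. A minimal illustrative sketch of what such definitions could look like, assuming train() and classify() accept raw values and labels; the concrete numbers and labels below are invented:

NEIGHBOURS_COUNT = 3

# Each fixture pairs a labelled training set with a query value and the label
# the classifier is expected to return for it.
FIXTURES = (
    {
        'train_set': ((1.0, 'small'), (2.0, 'small'), (10.0, 'large'), (11.0, 'large')),
        'query': 3.0,
        'expected_result': 'small',
    },
)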
Example #2
        if y_true[i] == pos:
            TP += 1
        else:
            FP += 1
    FPR = FP / num_neg
    TPR = TP / num_pos
    if verbose: print("{},{}".format(FPR, TPR))


if __name__ == "__main__":

    if len(sys.argv) == 1:
        k = 30
        train = "datasets/votes_train.json"
        test = "datasets/votes_test.json"
    else:
        k = int(sys.argv[1])
        train = str(sys.argv[2])
        test = str(sys.argv[3])

    # parse the json files for data
    X_train, y_train, meta_train = parse_json(train)
    X_test, y_test, meta_test = parse_json(test)

    # fit KNN and predict confidence
    knn = KNNClassifier(k=k)
    knn.fit(X_train, y_train, meta_train)
    y_conf = knn.predict(X_test, verbose=False, confidence=True)

    roc_curve(y_test, y_conf, meta_test, verbose=True)
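The truncated fragment at the top of this example is the tail of a helper that sweeps a decision threshold and counts true and false positives. A minimal sketch of such a function, consistent with the TP/FP bookkeeping shown; the name roc_points, the explicit pos_label argument and the returned list are assumptions, since the project's roc_curve also takes a meta argument that is not shown here:

import numpy as np

def roc_points(y_true, y_conf, pos_label, verbose=False):
    # Sort the test points by predicted confidence (highest first) and lower the
    # threshold one point at a time, emitting an (FPR, TPR) pair after each step.
    y_true = np.asarray(y_true)
    order = np.argsort(-np.asarray(y_conf, dtype=float))
    num_pos = int((y_true == pos_label).sum())
    num_neg = len(y_true) - num_pos
    TP = FP = 0
    points = []
    for label in y_true[order]:
        if label == pos_label:
            TP += 1
        else:
            FP += 1
        FPR, TPR = FP / num_neg, TP / num_pos
        points.append((FPR, TPR))
        if verbose:
            print("{},{}".format(FPR, TPR))
    return points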
Example #3
 def setUp(self):
     algorithm = CoverTreeAlgorithm(euclidian_distance)
     self.classifier = KNNClassifier(algorithm, NEIGHBOURS_COUNT)
     self.classifier.register_feature(TestFeature)
Example #4
    # Create the output file
    try:
        file_name = "{}/output_knn.txt".format(args.outdir)
        f_out = open(file_name, 'w')
    except IOError:
        print("Output file {} cannot be created".format(file_name))
        sys.exit(1)

    # Write header for output file
    f_out.write('{}\t{}\t{}\t{}\n'.format('Value of k', 'Accuracy',
                                          'Precision', 'Recall'))

    ############################## KNN algorithm ####################################

    # Create the k-NN object.
    knn = KNNClassifier(train_X[:, 1:], train_y[:, 1:], metric='euclidean')

    # Iterate through all possible values of k:
    for k in range(min_k, max_k + 1):

        knn.set_k(k)

        # 1. Perform KNN training and classify all the test points. In this step, you will
        # obtain a prediction for each test point.

        y_pred = []

        for i in range(test_X.shape[0]):
            result = knn.predict(test_X[i, 1:])
            if result:
                y_pred.append(result)
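The fragment stops before the Accuracy/Precision/Recall row promised by the output header is written. One way to finish each iteration, sketched as a small standalone helper; it assumes binary 0/1 labels and that column 1 of test_y holds the true label, neither of which is shown in the fragment:

def binary_metrics(y_true, y_pred):
    # Accuracy, precision and recall for 0/1 labels.
    tp = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 1)
    fp = sum(1 for t, p in zip(y_true, y_pred) if t == 0 and p == 1)
    fn = sum(1 for t, p in zip(y_true, y_pred) if t == 1 and p == 0)
    accuracy = sum(1 for t, p in zip(y_true, y_pred) if t == p) / len(y_true)
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return accuracy, precision, recall

# Inside the loop over k, once y_pred has been collected:
#     accuracy, precision, recall = binary_metrics(test_y[:, 1].astype(int), y_pred)
#     f_out.write('{}\t{:.4f}\t{:.4f}\t{:.4f}\n'.format(k, accuracy, precision, recall))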
Example #5
 def cf_gzsl(test_x, test_l, split):
     preds = []
     truths = []
     test_l_np = test_l.cpu().numpy()
     test_l_binary = np.array(
         [y in data.unseenclasses for y in test_l])
     if additional_train:
         gen_sx, gen_sl = generate_syn_feature(
             self.netG,
             self.data.seenclasses,
             self.data.attribute,
             100,
             netF=self.netF,
             netDec=self.netDec,
             opt=opt)
         #gen_sx = self.conditional_sample(data.train_feature, data.attribute[data.train_label], deterministic=False)
         #gen_sx2 = self.conditional_sample(data.train_feature, data.attribute[data.train_label], deterministic=False)
         #gen_sx3 = self.conditional_sample(data.train_feature, data.attribute[data.train_label], deterministic=False)
         #gen_sx = torch.cat((gen_sx, gen_sx2, gen_sx3), 0)
         #gen_sl = torch.cat((data.train_label.cuda(), data.train_label.cuda(), data.train_label.cuda()), 0)
     for i in range(test_x.shape[0]):
         gen_x, gen_l = self.generate_syn_feature_cf(
             test_x[i],
             data.unseenclasses,
             deterministic=deterministic)
         if use_train:
             #if additional_train:
             #    train_x = torch.cat((gen_sx, gen_x), 0)
             #    train_y = torch.cat((gen_sl, gen_l), 0)
             #else:
             train_x = torch.cat((data.train_feature, gen_x), 0)
             train_y = torch.cat((data.train_label.cuda(), gen_l),
                                 0)
         else:
             gen_s_x, gen_s_l = self.generate_syn_feature_cf(
                 test_x[i],
                 data.seenclasses,
                 deterministic=deterministic)
             train_x = torch.cat((gen_s_x, gen_x), 0)
             train_y = torch.cat((gen_s_l, gen_l), 0)
         if additional_train:
             train_x = torch.cat((train_x, gen_sx), 0)
             train_y = torch.cat((train_y, gen_sl.cuda()), 0)
         if softmax_clf:
             if not binary:
                 clf = classifier.CLASSIFIER(train_x, train_y, data, self.opt.nclass_all, opt.cuda, opt.classifier_lr, opt.beta1,\
                     self.epoch, opt.syn_num, generalized=True, netDec=self.cls_netDec, dec_size=opt.attSize,
                     dec_hidden_size=4096, x=test_x[i], use_tde=use_tde, alpha=self.alpha)
                 if self.test_logits is None:
                     self.test_logits = clf.logits
                 else:
                     self.test_logits = np.concatenate(
                         (self.test_logits, clf.logits), axis=0)
             else:
                 clf = BINARY_CLASSIFIER(train_x,
                                         train_y,
                                         data,
                                         2,
                                         True,
                                         opt.classifier_lr,
                                         0.5,
                                         self.epoch,
                                         opt.syn_num,
                                         netDec=self.cls_netDec,
                                         dec_size=opt.attSize,
                                         dec_hidden_size=4096,
                                         use_tde=use_tde,
                                         alpha=self.alpha,
                                         x=test_x[i])
             pred = clf.pred
             truths.append(test_l_np[i])
             preds.append(pred.item())
         else:
             clf = KNNClassifier(train_x,
                                 train_y,
                                 test_x[i].unsqueeze(0),
                                 self.cls_netDec,
                                 dec_size=opt.attSize,
                                 dec_hidden_size=4096,
                                 batch_size=100)
             pred = clf.fit()[0]
             preds.append(pred)
             truths.append(test_l_np[i])
         if (i + 1) % 500 == 0:
             if not binary:
                 binary_acc = self.get_binary_acc(truths, preds)
                 print("%s-%dth acc: %.3f, binary acc: %.3f" %
                       (split, i + 1,
                        cal_macc(truth=truths,
                                 pred=preds), binary_acc))
             else:
                 test_l_binary_t = test_l_binary[:len(preds)].astype(int)
                 preds_np = np.array(preds)
                 acc = (preds_np == test_l_binary_t).mean()
                 print("%s-%dth binary acc: %.3f" %
                       (split, i + 1, acc))
         if self.opt.sanity:
             break  # Sanity check
     if not binary:
         acc = cal_macc(truth=truths, pred=preds)
         binary_acc = self.get_binary_acc(truths, preds)
         clf_results = {"truths": truths, "preds": preds}
     else:
         acc = (np.array(preds) == test_l_binary.astype(int)).mean()
         binary_acc = acc
         clf_results = {"truths": test_l_binary, "preds": preds}
     save_file = self.get_save_result_file(split)
     if self.log_to_file:
         with open(save_file, 'wb') as handle:
             pickle.dump(clf_results, handle)
     return acc, binary_acc
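cal_macc is imported from elsewhere in this project; from the way it is called with truth/pred keyword arguments it appears to report macro-averaged (per-class mean) accuracy. A minimal sketch under that assumption, not the project's actual implementation:

import numpy as np

def cal_macc(truth, pred):
    # Mean of the per-class accuracies; only classes present in `truth` contribute.
    truth = np.asarray(truth)
    pred = np.asarray(pred)
    per_class = [(pred[truth == c] == c).mean() for c in np.unique(truth)]
    return float(np.mean(per_class))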
Example #6
    def gzsl(self,
             use_train,
             softmax_clf,
             cf,
             deterministic=False,
             additional_train=False,
             use_tde=False,
             binary=False):
        opt = self.opt
        data = self.data
        if self.siamese:
            clf = SiameseClassifier(data,
                                    opt,
                                    self.netE,
                                    self.netG,
                                    self.netF,
                                    self.cls_netDec,
                                    dec_size=opt.attSize,
                                    cf=cf,
                                    n_epochs=opt.clf_epoch,
                                    distance="l1")
            if self.netS is None:
                clf.train()
            else:
                clf.network = self.netS
                s_acc, u_acc = clf.validate(gzsl=True)
        if not cf:
            with torch.no_grad():
                gen_x, gen_l = generate_syn_feature(self.netG,
                                                    self.data.unseenclasses,
                                                    self.data.attribute,
                                                    opt.syn_num,
                                                    netF=self.netF,
                                                    netDec=self.netDec,
                                                    opt=opt)
            if use_train:
                train_x = torch.cat((data.train_feature, gen_x), 0)
                train_y = torch.cat((data.train_label, gen_l), 0)
            else:
                with torch.no_grad():
                    gen_s_x, gen_s_l = generate_syn_feature(
                        self.netG,
                        self.data.seenclasses,
                        self.data.attribute,
                        opt.syn_num,
                        netF=self.netF,
                        netDec=self.netDec,
                        opt=opt)
                train_x = torch.cat((gen_s_x, gen_x), 0)
                train_y = torch.cat((gen_s_l, gen_l), 0)
            if softmax_clf:
                if not binary:
                    gzsl_cls = classifier.CLASSIFIER(train_x, train_y, \
                                data, data.allclasses.size(0), opt.cuda, opt.classifier_lr, 0.5, self.epoch, opt.syn_num,
                                generalized=True, netDec=self.cls_netDec, dec_size=opt.attSize, dec_hidden_size=4096,
                                use_tde=use_tde, alpha=self.alpha)
                    self.test_logits = gzsl_cls.all_outputs
                else:
                    gzsl_cls = BINARY_CLASSIFIER(train_x,
                                                 train_y,
                                                 data,
                                                 2,
                                                 True,
                                                 opt.classifier_lr,
                                                 0.5,
                                                 self.epoch,
                                                 opt.syn_num,
                                                 netDec=self.cls_netDec,
                                                 dec_size=opt.attSize,
                                                 dec_hidden_size=4096,
                                                 use_tde=use_tde,
                                                 alpha=self.alpha)
                s_acc = gzsl_cls.acc_seen
                u_acc = gzsl_cls.acc_unseen
                h_acc = gzsl_cls.H
                self.s_bacc = gzsl_cls.s_bacc
                self.u_bacc = gzsl_cls.u_bacc
                if not binary:
                    clf_results = {"preds": gzsl_cls.pred_s.cpu().numpy()}
                save_file = self.get_save_result_file("seen")
                if self.log_to_file and not binary:
                    with open(save_file, 'wb') as handle:
                        pickle.dump(clf_results, handle)
                if not binary:
                    clf_results = {"preds": gzsl_cls.pred_u.cpu().numpy()}
                save_file = self.get_save_result_file("unseen")
                if self.log_to_file and not binary:
                    with open(save_file, 'wb') as handle:
                        pickle.dump(clf_results, handle)
            else:
                u_cls = KNNClassifier(train_x,
                                      train_y,
                                      data.test_unseen_feature,
                                      self.cls_netDec,
                                      dec_size=opt.attSize,
                                      dec_hidden_size=4096,
                                      batch_size=100)
                preds = u_cls.fit()
                truths = data.test_unseen_label.cpu().numpy()
                u_acc = cal_macc(truth=truths, pred=preds)

                s_cls = KNNClassifier(train_x,
                                      train_y,
                                      data.test_seen_feature,
                                      self.cls_netDec,
                                      dec_size=opt.attSize,
                                      dec_hidden_size=4096,
                                      batch_size=100)
                preds = s_cls.fit()
                truths = data.test_seen_label.cpu().numpy()
                s_acc = cal_macc(truth=truths, pred=preds)
                h_acc = 2 * u_acc * s_acc / (u_acc + s_acc)
        else:
            self.test_logits = None

            def cf_gzsl(test_x, test_l, split):
                preds = []
                truths = []
                test_l_np = test_l.cpu().numpy()
                test_l_binary = np.array(
                    [y in data.unseenclasses for y in test_l])
                if additional_train:
                    gen_sx, gen_sl = generate_syn_feature(
                        self.netG,
                        self.data.seenclasses,
                        self.data.attribute,
                        100,
                        netF=self.netF,
                        netDec=self.netDec,
                        opt=opt)
                    #gen_sx = self.conditional_sample(data.train_feature, data.attribute[data.train_label], deterministic=False)
                    #gen_sx2 = self.conditional_sample(data.train_feature, data.attribute[data.train_label], deterministic=False)
                    #gen_sx3 = self.conditional_sample(data.train_feature, data.attribute[data.train_label], deterministic=False)
                    #gen_sx = torch.cat((gen_sx, gen_sx2, gen_sx3), 0)
                    #gen_sl = torch.cat((data.train_label.cuda(), data.train_label.cuda(), data.train_label.cuda()), 0)
                for i in range(test_x.shape[0]):
                    gen_x, gen_l = self.generate_syn_feature_cf(
                        test_x[i],
                        data.unseenclasses,
                        deterministic=deterministic)
                    if use_train:
                        #if additional_train:
                        #    train_x = torch.cat((gen_sx, gen_x), 0)
                        #    train_y = torch.cat((gen_sl, gen_l), 0)
                        #else:
                        train_x = torch.cat((data.train_feature, gen_x), 0)
                        train_y = torch.cat((data.train_label.cuda(), gen_l),
                                            0)
                    else:
                        gen_s_x, gen_s_l = self.generate_syn_feature_cf(
                            test_x[i],
                            data.seenclasses,
                            deterministic=deterministic)
                        train_x = torch.cat((gen_s_x, gen_x), 0)
                        train_y = torch.cat((gen_s_l, gen_l), 0)
                    if additional_train:
                        train_x = torch.cat((train_x, gen_sx), 0)
                        train_y = torch.cat((train_y, gen_sl.cuda()), 0)
                    if softmax_clf:
                        if not binary:
                            clf = classifier.CLASSIFIER(train_x, train_y, data, self.opt.nclass_all, opt.cuda, opt.classifier_lr, opt.beta1,\
                                self.epoch, opt.syn_num, generalized=True, netDec=self.cls_netDec, dec_size=opt.attSize,
                                dec_hidden_size=4096, x=test_x[i], use_tde=use_tde, alpha=self.alpha)
                            if self.test_logits is None:
                                self.test_logits = clf.logits
                            else:
                                self.test_logits = np.concatenate(
                                    (self.test_logits, clf.logits), axis=0)
                        else:
                            clf = BINARY_CLASSIFIER(train_x,
                                                    train_y,
                                                    data,
                                                    2,
                                                    True,
                                                    opt.classifier_lr,
                                                    0.5,
                                                    self.epoch,
                                                    opt.syn_num,
                                                    netDec=self.cls_netDec,
                                                    dec_size=opt.attSize,
                                                    dec_hidden_size=4096,
                                                    use_tde=use_tde,
                                                    alpha=self.alpha,
                                                    x=test_x[i])
                        pred = clf.pred
                        truths.append(test_l_np[i])
                        preds.append(pred.item())
                    else:
                        clf = KNNClassifier(train_x,
                                            train_y,
                                            test_x[i].unsqueeze(0),
                                            self.cls_netDec,
                                            dec_size=opt.attSize,
                                            dec_hidden_size=4096,
                                            batch_size=100)
                        pred = clf.fit()[0]
                        preds.append(pred)
                        truths.append(test_l_np[i])
                    if (i + 1) % 500 == 0:
                        if not binary:
                            binary_acc = self.get_binary_acc(truths, preds)
                            print("%s-%dth acc: %.3f, binary acc: %.3f" %
                                  (split, i + 1,
                                   cal_macc(truth=truths,
                                            pred=preds), binary_acc))
                        else:
                            test_l_binary_t = test_l_binary[:len(preds)].astype(int)
                            preds_np = np.array(preds)
                            acc = (preds_np == test_l_binary_t).mean()
                            print("%s-%dth binary acc: %.3f" %
                                  (split, i + 1, acc))
                    if self.opt.sanity:
                        break  # Sanity check
                if not binary:
                    acc = cal_macc(truth=truths, pred=preds)
                    binary_acc = self.get_binary_acc(truths, preds)
                    clf_results = {"truths": truths, "preds": preds}
                else:
                    acc = (np.array(preds) == test_l_binary.astype(int)).mean()
                    binary_acc = acc
                    clf_results = {"truths": test_l_binary, "preds": preds}
                save_file = self.get_save_result_file(split)
                if self.log_to_file:
                    with open(save_file, 'wb') as handle:
                        pickle.dump(clf_results, handle)
                return acc, binary_acc

            s_acc, s_bacc = cf_gzsl(data.test_seen_feature,
                                    data.test_seen_label, "seen")
            u_acc, u_bacc = cf_gzsl(data.test_unseen_feature,
                                    data.test_unseen_label, "unseen")

            # s_acc = 0.3
            if u_acc + s_acc == 0:
                h_acc = 0
            else:
                h_acc = 2 * u_acc * s_acc / (u_acc + s_acc)
            self.s_bacc = s_bacc
            self.u_bacc = u_bacc
        return s_acc, u_acc, h_acc
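A hedged example of how this method might be driven; the model object and the chosen flag values are hypothetical, only the keyword names come from the signature above:

# Evaluate GZSL with a softmax classifier trained on real seen features plus
# synthesised unseen features (cf=False takes the non-counterfactual branch).
s_acc, u_acc, h_acc = model.gzsl(use_train=True,
                                 softmax_clf=True,
                                 cf=False,
                                 deterministic=False)
print("seen: %.3f  unseen: %.3f  H: %.3f" % (s_acc, u_acc, h_acc))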
Example #7
 def zsl(self, softmax_clf, cf, deterministic=False):
     opt = self.opt
     data = self.data
     if not cf:
         with torch.no_grad():
             gen_x, gen_l = generate_syn_feature(self.netG,
                                                 self.data.unseenclasses,
                                                 self.data.attribute,
                                                 opt.syn_num,
                                                 netF=self.netF,
                                                 netDec=self.netDec,
                                                 opt=opt)
         if softmax_clf:
             zsl_cls = classifier.CLASSIFIER(gen_x, util.map_label(gen_l, data.unseenclasses), \
                             data, data.unseenclasses.size(0), opt.cuda, opt.classifier_lr, 0.5, self.epoch, opt.syn_num, \
                             generalized=False, netDec=self.cls_netDec, dec_size=opt.attSize, dec_hidden_size=4096)
             acc = zsl_cls.acc
         else:
             zsl_cls = KNNClassifier(gen_x,
                                     gen_l,
                                     data.test_unseen_feature,
                                     self.cls_netDec,
                                     dec_size=opt.attSize,
                                     dec_hidden_size=4096,
                                     batch_size=100)
             preds = zsl_cls.fit()
             truths = data.test_unseen_label.cpu().numpy()
             acc = cal_macc(truth=truths, pred=preds)
     else:
         preds = []
         truths = []
         test_x = data.test_unseen_feature
         mapped_unseen_l = util.map_label(data.test_unseen_label,
                                          data.unseenclasses)
         unseen_label_np = data.test_unseen_label.cpu().numpy()
         for i in range(test_x.shape[0]):
             gen_x, gen_l = self.generate_syn_feature_cf(
                 test_x[i], data.unseenclasses, deterministic=deterministic)
             gen_l = util.map_label(gen_l, data.unseenclasses)
             if softmax_clf:
                 clf = classifier.CLASSIFIER(gen_x,
                                             gen_l,
                                             data,
                                             data.unseenclasses.size(0),
                                             opt.cuda,
                                             opt.classifier_lr,
                                             0.5,
                                             self.epoch,
                                             opt.syn_num,
                                             generalized=False,
                                             netDec=self.cls_netDec,
                                             dec_size=opt.attSize,
                                             dec_hidden_size=4096,
                                             x=test_x[i])
                 pred = clf.pred
                 truths.append(mapped_unseen_l[i])
                 preds.append(pred)
             else:
                 clf = KNNClassifier(gen_x,
                                     gen_l,
                                     test_x[i].unsqueeze(0),
                                     self.cls_netDec,
                                     dec_size=opt.attSize,
                                     dec_hidden_size=4096,
                                     batch_size=100)
                 pred = clf.fit()[0]
                 preds.append(pred)
                 truths.append(unseen_label_np[i])
             if (i + 1) % 500 == 0:
                 print("%dth acc: %.3f" %
                       (i + 1, cal_macc(truth=truths, pred=preds)))
             if self.opt.sanity:
                 break  # Sanity check
         acc = cal_macc(truth=truths, pred=preds)
     return acc
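util.map_label is defined elsewhere in the project; from its use here (remapping unseen-class ids to contiguous indices before training the softmax classifier) it presumably behaves like the sketch below, which is an assumption rather than the project's implementation:

import torch

def map_label(label, classes):
    # Replace each original class id with its index in `classes`,
    # producing contiguous labels 0 .. len(classes) - 1.
    mapped = torch.zeros_like(label)
    for idx, cls in enumerate(classes):
        mapped[label == cls] = idx
    return mapped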
        test = "datasets/digits_test.json"
    else:
        max_k = int(sys.argv[1])
        train = str(sys.argv[2])
        val = str(sys.argv[3])
        test = str(sys.argv[4])

    # parse the json files for data
    X_train, y_train, meta_train = parse_json(train)
    X_val, y_val, meta_val = parse_json(val)
    X_test, y_test, meta_test = parse_json(test)

    # train classifier on TRAIN, predict on VAL (for k=1,2,...,max_k)
    acc = {}
    for k in range(1, max_k + 1):
        knn = KNNClassifier(k=k)
        knn.fit(X_train, y_train, meta_train)
        y_pred = knn.predict(X_val, verbose=False)

        acc[k] = accuracy_score(y_val, y_pred)
        print("{},{}".format(k, acc[k]))

    # note that 'max' always returns the first key in case of ties
    best_k = max(acc, key=lambda key: acc[key])
    print(best_k)

    # train on TRAIN + VAL, predict on TEST
    knn_best = KNNClassifier(k=best_k)

    X_train_val = pd.concat([X_train, X_val], ignore_index=True)
    y_train_val = pd.concat([y_train, y_val], ignore_index=True)
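The snippet is cut off before the combined model is trained and evaluated. A plausible continuation under the same API, mirroring the fit/predict/accuracy_score calls used earlier; reusing meta_train for the merged data is an assumption:

    # Fit on TRAIN + VAL with the selected k, then score on TEST.
    knn_best.fit(X_train_val, y_train_val, meta_train)
    y_pred_test = knn_best.predict(X_test, verbose=False)
    print("{},{}".format(best_k, accuracy_score(y_test, y_pred_test)))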
Example #9
import pandas as pd

if __name__ == "__main__":

    if len(sys.argv) == 1:
        k = 10
        train = "datasets/votes_train.json"
        test = "datasets/votes_test.json"
    else:
        k = int(sys.argv[1])
        train = str(sys.argv[2])
        test = str(sys.argv[3])

    # parse the json files for data
    X_train, y_train, meta_train = parse_json(train)
    X_test, y_test, meta_test = parse_json(test)

    for i in range(10):
        N = X_train.shape[0]
        ind = math.floor(
            (i + 1) * N / 10 - 1)  # subtract 1 since indexing starts at 0

        knn = KNNClassifier(k=k)
        # use the first ind + 1 rows of the training data (iloc is end-exclusive)
        knn.fit(X_train.iloc[:ind + 1, :], y_train.iloc[:ind + 1], meta_train)
        y_pred = knn.predict(X_test, verbose=False)

        acc = accuracy_score(y_test, y_pred)

        print(X_train.iloc[:ind + 1, :].shape[0], end="")
        print(",{}".format(acc))