def test_gcn_v(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    folder = '{}_gcnv_k_{}_th_{}'.format(cfg.test_name, cfg.knn, cfg.th_sim)
    oprefix = osp.join(cfg.work_dir, folder)
    oname = osp.basename(rm_suffix(cfg.load_from))
    opath_pred_confs = osp.join(oprefix, 'pred_confs', '{}.npz'.format(oname))

    if osp.isfile(opath_pred_confs) and not cfg.force:
        data = np.load(opath_pred_confs)
        pred_confs = data['pred_confs']
        inst_num = data['inst_num']
        if inst_num != dataset.inst_num:
            logger.warn(
                'instance number in {} is different from dataset: {} vs {}'.
                format(opath_pred_confs, inst_num, len(dataset)))
    else:
        pred_confs, gcn_feat = test(model, dataset, cfg, logger)
        inst_num = dataset.inst_num

    logger.info('pred_confs: mean({:.4f}). max({:.4f}), min({:.4f})'.format(
        pred_confs.mean(), pred_confs.max(), pred_confs.min()))

    logger.info('Convert to cluster')
    with Timer('Predition to peaks'):
        pred_dist2peak, pred_peaks = confidence_to_peaks(
            dataset.dists, dataset.nbrs, pred_confs, cfg.max_conn)

    if not dataset.ignore_label and cfg.eval_interim:
        # evaluate the intermediate results
        for i in range(cfg.max_conn):
            num = len(dataset.peaks)
            pred_peaks_i = np.arange(num)
            peaks_i = np.arange(num)
            for j in range(num):
                if len(pred_peaks[j]) > i:
                    pred_peaks_i[j] = pred_peaks[j][i]
                if len(dataset.peaks[j]) > i:
                    peaks_i[j] = dataset.peaks[j][i]
            acc = accuracy(pred_peaks_i, peaks_i)
            logger.info('[{}-th conn] accuracy of peak match: {:.4f}'.format(
                i + 1, acc))
            acc = 0.
            for idx, peak in enumerate(pred_peaks_i):
                acc += int(dataset.idx2lb[peak] == dataset.idx2lb[idx])
            acc /= len(pred_peaks_i)
            logger.info(
                '[{}-th conn] accuracy of peak label match: {:.4f}'.format(
                    i + 1, acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau_0)):
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau_0,
                                      inst_num)

    if cfg.save_output:
        logger.info('save predicted confs to {}'.format(opath_pred_confs))
        mkdir_if_no_exists(opath_pred_confs)
        np.savez_compressed(opath_pred_confs,
                            pred_confs=pred_confs,
                            inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)

        opath_pred_labels = osp.join(
            cfg.work_dir, folder, 'tau_{}_pred_labels.txt'.format(cfg.tau_0))
        logger.info('save predicted labels to {}'.format(opath_pred_labels))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

    if cfg.use_gcn_feat:
        # gcn_feat is saved to disk for GCN-E
        opath_feat = osp.join(oprefix, 'features', '{}.bin'.format(oname))
        if not osp.isfile(opath_feat) or cfg.force:
            mkdir_if_no_exists(opath_feat)
            write_feat(opath_feat, gcn_feat)

        name = rm_suffix(osp.basename(opath_feat))
        prefix = oprefix
        ds = BasicDataset(name=name,
                          prefix=prefix,
                          dim=cfg.model['kwargs']['nhid'],
                          normalize=True)
        ds.info()

        # use top embedding of GCN to rebuild the kNN graph
        with Timer('connect to higher confidence with use_gcn_feat'):
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix,
                              ds.features,
                              cfg.knn_method,
                              cfg.knn,
                              is_rebuild=True)
            dists, nbrs = knns2ordered_nbrs(knns)

            pred_dist2peak, pred_peaks = confidence_to_peaks(
                dists, nbrs, pred_confs, cfg.max_conn)
            pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                          inst_num)

        # save clustering results
        if cfg.save_output:
            oname_meta = '{}_gcn_feat'.format(name)
            opath_pred_labels = osp.join(
                oprefix, oname_meta, 'tau_{}_pred_labels.txt'.format(cfg.tau))
            mkdir_if_no_exists(opath_pred_labels)

            idx2lb = list2dict(pred_labels, ignore_value=-1)
            write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

        # evaluation

        if not dataset.ignore_label:
            print('==> evaluation')
            for metric in cfg.metrics:
                evaluate(dataset.gt_labels, pred_labels, metric)
        import json
        import os
        import pdb
        pdb.set_trace()
        img_labels = json.load(
            open(r'/home/finn/research/data/clustering_data/test_index.json',
                 'r',
                 encoding='utf-8'))
        import shutil
        output = r'/home/finn/research/data/clustering_data/mr_gcn_output'
        for label in set(pred_labels):
            if not os.path.exists(os.path.join(output, f'cluter_{label}')):
                os.mkdir(os.path.join(output, f'cluter_{label}'))
        for image in img_labels:
            shutil.copy2(
                image,
                os.path.join(
                    os.path.join(output,
                                 f'cluter_{pred_labels[img_labels[image]]}'),
                    os.path.split(image)[-1]))
Example #2
0
    def __init__(self, cfg):
        feat_path = cfg['feat_path']
        label_path = cfg.get('label_path', None)
        knn_graph_path = cfg.get('knn_graph_path', None)

        self.k = cfg['k']
        self.feature_dim = cfg['feature_dim']
        self.is_norm_feat = cfg.get('is_norm_feat', True)
        self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

        self.th_sim = cfg.get('th_sim', 0.)
        self.max_conn = cfg.get('max_conn', 1)
        self.conf_metric = cfg.get('conf_metric')

        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.inst_num = len(self.idx2lb)
                self.gt_labels = intdict2ndarray(self.idx2lb)
                self.ignore_label = False
            else:
                self.inst_num = -1
                self.ignore_label = True
            self.features = read_probs(feat_path, self.inst_num,
                                       self.feature_dim)
            if self.is_norm_feat:
                self.features = l2norm(self.features)
            if self.inst_num == -1:
                self.inst_num = self.features.shape[0]
            self.size = 1 # take the entire graph as input

        with Timer('read knn graph'):
            if os.path.isfile(knn_graph_path):
                knns = np.load(knn_graph_path)['data']
            else:
                if knn_graph_path is not None:
                    print('knn_graph_path does not exist: {}'.format(
                        knn_graph_path))
                
                prefix = osp.dirname(feat_path)
                name = rm_suffix(osp.basename(feat_path))
                # find root folder of `features`
                prefix = osp.dirname(prefix)
                knn_prefix = osp.join(prefix, 'knns', name)
                knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                                  cfg.knn)

            adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

            # build symmetric adjacency matrix
            adj = build_symmetric_adj(adj, self_loop=True)
            adj = row_normalize(adj)
            if self.save_decomposed_adj:
                adj = sparse_mx_to_indices_values(adj)
                self.adj_indices, self.adj_values, self.adj_shape = adj
            else:
                self.adj = adj

            # convert knns to (dists, nbrs)
            self.dists, self.nbrs = knns2ordered_nbrs(knns)

        print('feature shape: {}, k: {}, norm_feat: {}'.format(
            self.features.shape, self.k, self.is_norm_feat))

        if not self.ignore_label:
            with Timer('Prepare ground-truth label'):
                self.labels = confidence(feats=self.features,
                                         dists=self.dists,
                                         nbrs=self.nbrs,
                                         metric=self.conf_metric,
                                         idx2lb=self.idx2lb,
                                         lb2idxs=self.lb2idxs)
                if cfg.eval_interim:
                    _, self.peaks = confidence_to_peaks(
                        self.dists, self.nbrs, self.labels, self.max_conn)
    def __init__(self, cfg):
        feat_path = cfg['feat_path']
        label_path = cfg.get('label_path', None)
        knn_graph_path = cfg.get('knn_graph_path', None)

        self.k = cfg['k']
        self.feature_dim = cfg['feature_dim']
        self.is_norm_feat = cfg.get('is_norm_feat', True)
        self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

        self.th_sim = cfg.get('th_sim', 0.)
        self.max_conn = cfg.get('max_conn', 1)
        self.conf_metric = cfg.get('conf_metric')
        self.num_process = cfg.get('num_process',16)

        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.inst_num = len(self.idx2lb)
                self.gt_labels = intdict2ndarray(self.idx2lb)
                self.ignore_label = False
            else:
                self.inst_num = -1
                self.ignore_label = True
            self.features = read_probs(feat_path, self.inst_num,
                                       self.feature_dim)
            if self.is_norm_feat:
                self.features = l2norm(self.features)
            if self.inst_num == -1:
                self.inst_num = self.features.shape[0]
            self.size = 1 # take the entire graph as input

        with Timer('read knn graph'):
            if os.path.isfile(knn_graph_path):
                knns = np.load(knn_graph_path)['data']    # num_imgs*2*k
            else:
                if knn_graph_path is not None:
                    print('knn_graph_path does not exist: {}'.format(
                        knn_graph_path))
                knn_prefix = os.path.join(cfg.prefix, 'knns', cfg.name)
                # 通过faiss实现k近邻搜索,此处作者faiss_gpu版本实现可能有问题,但faiss大规模在cpu上跑还是慢
                # 当然faiss有针内存和计算速度方面的优化,PQ,IVF等,可参考faiss
                knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                                  cfg.knn,self.num_process)
            # 依据k近邻搜索结果构建邻接矩阵
            adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

            # build symmetric adjacency matrix
            adj = build_symmetric_adj(adj, self_loop=True)
            adj = row_normalize(adj)
            if self.save_decomposed_adj:
                adj = sparse_mx_to_indices_values(adj)
                self.adj_indices, self.adj_values, self.adj_shape = adj
            else:
                self.adj = adj

            # convert knns to (dists, nbrs)
            self.dists, self.nbrs = knns2ordered_nbrs(knns)  # num_imgs*k

        print('feature shape: {}, k: {}, norm_feat: {}'.format(
            self.features.shape, self.k, self.is_norm_feat))

        if not self.ignore_label:
            with Timer('Prepare ground-truth label'):
                self.labels = confidence(feats=self.features,
                                         dists=self.dists,
                                         nbrs=self.nbrs,
                                         metric=self.conf_metric,
                                         idx2lb=self.idx2lb,
                                         lb2idxs=self.lb2idxs)
                if cfg.eval_interim:
                    _, self.peaks = confidence_to_peaks(
                        self.dists, self.nbrs, self.labels, self.max_conn)