def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)

    # set cuda
    cfg.cuda = not args.no_cuda and torch.cuda.is_available()

    # set cudnn_benchmark & cudnn_deterministic
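    # benchmark autotunes convolution kernels (faster when input sizes are
    # fixed); deterministic trades speed for reproducible results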
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    if cfg.get('cudnn_deterministic', False):
        torch.backends.cudnn.deterministic = True

    # update configs according to args
    if not hasattr(cfg, 'work_dir'):
        if args.work_dir is not None:
            cfg.work_dir = args.work_dir
        else:
            cfg_name = rm_suffix(os.path.basename(args.config))
            cfg.work_dir = os.path.join('./data/work_dir', cfg_name)
    mkdir_if_no_exists(cfg.work_dir, is_folder=True)

    cfg.load_from = args.load_from
    cfg.resume_from = args.resume_from

    cfg.gpus = args.gpus
    cfg.distributed = args.distributed

    cfg.random_conns = args.random_conns
    cfg.eval_interim = args.eval_interim
    cfg.save_output = args.save_output
    cfg.force = args.force

    for data in ['train_data', 'test_data']:
        if not hasattr(cfg, data):
            continue
        cfg[data].eval_interim = cfg.eval_interim
        if not hasattr(cfg[data], 'knn_graph_path') or not os.path.isfile(
                cfg[data].knn_graph_path):
            cfg[data].prefix = cfg.prefix
            cfg[data].knn = cfg.knn
            cfg[data].knn_method = cfg.knn_method
            name = 'train_name' if data == 'train_data' else 'test_name'
            cfg[data].name = cfg[name]

    logger = create_logger()

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    model = build_model(cfg.model['type'], **cfg.model['kwargs'])
    handler = build_handler(args.phase, cfg.model['type'])

    handler(model, cfg, logger)
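
# A minimal sketch of the argparse interface this entry point assumes; the
# flag names are inferred from the `args.*` accesses above, and the real
# parser likely defines more options and different defaults.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='GCN-V entry point (sketch)')
    parser.add_argument('--config', required=True, help='path to config file')
    parser.add_argument('--phase', default='test', choices=['train', 'test'])
    parser.add_argument('--work_dir', default=None)
    parser.add_argument('--load_from', default=None)
    parser.add_argument('--resume_from', default=None)
    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--gpus', type=int, default=1)
    parser.add_argument('--distributed', action='store_true')
    parser.add_argument('--random_conns', action='store_true')
    parser.add_argument('--eval_interim', action='store_true')
    parser.add_argument('--save_output', action='store_true')
    parser.add_argument('--force', action='store_true')
    parser.add_argument('--no_cuda', action='store_true')
    return parser.parse_args()
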
# Example 2
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)
    # set cuda
    cfg.cuda = not args.no_cuda and torch.cuda.is_available()
    # set cudnn_benchmark & cudnn_deterministic
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    if cfg.get('cudnn_deterministic', False):
        torch.backends.cudnn.deterministic = True
    # update configs according to args
    if not hasattr(cfg, 'work_dir'):
        if args.work_dir is not None:
            cfg.work_dir = args.work_dir
        else:
            cfg_name = rm_suffix(os.path.basename(args.config))
            cfg.work_dir = os.path.join('./data/work_dir', cfg_name)
    mkdir_if_no_exists(cfg.work_dir, is_folder=True)
    if not hasattr(cfg, 'stage'):
        cfg.stage = args.stage

    cfg.load_from1 = args.load_from1
    cfg.load_from2 = args.load_from2
    cfg.load_from3 = args.load_from3
    cfg.resume_from = args.resume_from

    #cfg.gpus = args.gpus
    cfg.distributed = args.distributed
    cfg.save_output = args.save_output
    cfg.phase = args.phase
    logger = create_logger()

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    model = [build_model(cfg.model1['type'], **cfg.model1['kwargs']),
             build_model(cfg.model2['type'], **cfg.model2['kwargs']),
             build_model(cfg.model3['type'], **cfg.model3['kwargs'])]
    if cfg.phase == 'train':
        # load pretrained weights for each sub-model that has a checkpoint
        if cfg.load_from1:
            model[0].load_state_dict(torch.load(cfg.load_from1))
        if cfg.load_from2:
            model[1].load_state_dict(torch.load(cfg.load_from2))
        if cfg.load_from3:
            model[2].load_state_dict(torch.load(cfg.load_from3))
    handler = build_handler(args.phase, args.stage)

    handler(model, cfg, logger)
# Example 3
def main():
    args = parse_args()
    cfg = Config.fromfile(args.config)

    # set cuda
    cfg.cuda = not args.no_cuda and torch.cuda.is_available()

    # set cudnn_benchmark & cudnn_deterministic
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True
    if cfg.get('cudnn_deterministic', False):
        torch.backends.cudnn.deterministic = True

    # update configs according to args
    if not hasattr(cfg, 'work_dir'):
        if args.work_dir is not None:
            cfg.work_dir = args.work_dir
        else:
            cfg_name = rm_suffix(os.path.basename(args.config))
            cfg.work_dir = os.path.join('./data/work_dir', cfg_name)
    mkdir_if_no_exists(cfg.work_dir, is_folder=True)
    if not hasattr(cfg, 'stage'):
        cfg.stage = args.stage

    if not hasattr(cfg, 'test_batch_size_per_gpu'):
        cfg.test_batch_size_per_gpu = cfg.batch_size_per_gpu

    cfg.load_from = args.load_from
    cfg.resume_from = args.resume_from

    cfg.pred_iou_score = args.pred_iou_score
    cfg.pred_iop_score = args.pred_iop_score

    cfg.gpus = args.gpus
    cfg.det_label = args.det_label
    cfg.distributed = args.distributed
    cfg.save_output = args.save_output

    logger = create_logger()

    # set random seeds
    if args.seed is not None:
        logger.info('Set random seed to {}'.format(args.seed))
        set_random_seed(args.seed)

    model = build_model(cfg.model['type'], **cfg.model['kwargs'])
    handler = build_handler(args.phase, args.stage)

    handler(model, cfg, logger)
# Example 4
    def __init__(self, cfg):
        feat_path = cfg['feat_path']
        label_path = cfg.get('label_path', None)
        knn_graph_path = cfg.get('knn_graph_path', None)

        self.k = cfg['k']
        self.feature_dim = cfg['feature_dim']
        self.is_norm_feat = cfg.get('is_norm_feat', True)

        self.th_sim = cfg.get('th_sim', 0.)
        self.max_conn = cfg.get('max_conn', 1)

        self.ignore_ratio = cfg.get('ignore_ratio', 0.8)
        self.ignore_small_confs = cfg.get('ignore_small_confs', True)
        self.use_candidate_set = cfg.get('use_candidate_set', True)

        self.nproc = cfg.get('nproc', 1)
        self.max_qsize = cfg.get('max_qsize', int(1e5))
        # `radius` is only needed for the density-based confidence fallback
        self.radius = cfg.get('radius', None)

        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.inst_num = len(self.idx2lb)
                self.gt_labels = intdict2ndarray(self.idx2lb)
                self.ignore_label = False
            else:
                self.inst_num = -1
                self.ignore_label = True
            self.features = read_probs(feat_path, self.inst_num,
                                       self.feature_dim)
            if self.is_norm_feat:
                self.features = l2norm(self.features)
            if self.inst_num == -1:
                self.inst_num = self.features.shape[0]
            self.size = self.inst_num
            assert self.size == self.features.shape[0]

        print('feature shape: {}, k: {}, norm_feat: {}'.format(
            self.features.shape, self.k, self.is_norm_feat))

        with Timer('read knn graph'):
            if knn_graph_path is not None:
                knns = np.load(knn_graph_path)['data']
            else:
                prefix = osp.dirname(feat_path)
                name = rm_suffix(osp.basename(feat_path))
                # find root folder of `features`
                prefix = osp.dirname(prefix)
                knn_prefix = osp.join(prefix, 'knns', name)
                knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                                  cfg.knn)
            assert self.inst_num == len(knns), "{} vs {}".format(
                self.inst_num, len(knns))

            adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

            # build symmetric adjacency matrix
            adj = build_symmetric_adj(adj, self_loop=True)
            self.adj = row_normalize(adj)

            # convert knns to (dists, nbrs)
            self.dists, self.nbrs = knns2ordered_nbrs(knns, sort=True)
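            # with sort=True each row of `dists`/`nbrs` is ordered by
            # ascending distance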

            if cfg.pred_confs != '':
                print('read estimated confidence from {}'.format(
                    cfg.pred_confs))
                self.confs = np.load(cfg.pred_confs)['pred_confs']
            else:
                print('use unsupervised density as confidence')
                assert self.radius, 'cfg.radius is required to compute density'
                from vegcn.confidence import density
                self.confs = density(self.dists, radius=self.radius)

            assert 0 <= self.ignore_ratio <= 1
            if self.ignore_ratio == 1:
                self.ignore_set = set(np.arange(len(self.confs)))
            else:
                num = int(len(self.confs) * self.ignore_ratio)
                confs = self.confs
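                # argpartition below selects the `num` smallest confidences;
                # the negation flips that to the `num` largest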
                if not self.ignore_small_confs:
                    confs = -confs
                self.ignore_set = set(np.argpartition(confs, num)[:num])

        print(
            'ignore_ratio: {}, ignore_small_confs: {}, use_candidate_set: {}'.
            format(self.ignore_ratio, self.ignore_small_confs,
                   self.use_candidate_set))
        print('#ignore_set: {} / {} = {:.3f}'.format(
            len(self.ignore_set), self.inst_num,
            1. * len(self.ignore_set) / self.inst_num))

        with Timer('Prepare sub-graphs'):
            # construct subgraphs with larger confidence
            self.peaks = {i: [] for i in range(self.inst_num)}
            self.dist2peak = {i: [] for i in range(self.inst_num)}

            if self.nproc > 1:
                # multi-process
                import multiprocessing as mp
                pool = mp.Pool(self.nproc)
                results = []
                num = int(self.inst_num / self.max_qsize) + 1
                for i in tqdm(range(num)):
                    beg = int(i * self.max_qsize)
                    end = min(beg + self.max_qsize, self.inst_num)
                    lst = list(range(beg, end))
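                    # pool.map returns only after the whole chunk finishes,
                    # so the inner tqdm completes at once; pool.imap would
                    # stream per-item progress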
                    results.extend(
                        list(
                            tqdm(pool.map(self.get_subgraph, lst),
                                 total=len(lst))))
                pool.close()
                pool.join()
            else:
                results = [
                    self.get_subgraph(i) for i in tqdm(range(self.inst_num))
                ]

            self.adj_lst = []
            self.feat_lst = []
            self.lb_lst = []
            self.subset_gt_labels = []
            self.subset_idxs = []
            self.subset_nbrs = []
            self.subset_dists = []
            for result in results:
                if result is None:
                    continue
                elif len(result) == 3:
                    i, nbr, dist = result
                    self.peaks[i].extend(nbr)
                    self.dist2peak[i].extend(dist)
                    continue
                i, nbr, dist, feat, adj, lb = result
                self.subset_idxs.append(i)
                self.subset_nbrs.append(nbr)
                self.subset_dists.append(dist)
                self.feat_lst.append(feat)
                self.adj_lst.append(adj)
                if not self.ignore_label:
                    self.subset_gt_labels.append(self.idx2lb[i])
                    self.lb_lst.append(lb)
            self.subset_gt_labels = np.array(self.subset_gt_labels)

            self.size = len(self.feat_lst)
            assert self.size == len(self.adj_lst)
            if not self.ignore_label:
                assert self.size == len(self.lb_lst)
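
# A hedged example of the cfg this dataset consumes: the keys mirror the
# cfg[...] / cfg.get(...) accesses above, the values are illustrative only,
# and in practice cfg is an mmcv-style Config that also allows attribute
# access (cfg.knn_method, cfg.pred_confs).
subgraph_cfg = dict(
    feat_path='./data/features/part1_test.bin',  # hypothetical path
    label_path='./data/labels/part1_test.meta',  # optional; omit at inference
    k=80,
    feature_dim=256,
    is_norm_feat=True,
    th_sim=0.,
    max_conn=1,
    ignore_ratio=0.8,
    ignore_small_confs=True,
    use_candidate_set=True,
    nproc=1,
    max_qsize=int(1e5),
    pred_confs='',  # '' falls back to density-based confidence
    radius=0.3,     # required by the density fallback
    knn_method='faiss',
    knn=80,
)
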
# Example 5
test_name = 'part1_test'
knn = 160
knn_method = 'faiss'
th_sim = 0.  # cut edges with similarity smaller than th_sim

# testing args
max_conn = 1
tau = 0.8

metrics = ['pairwise', 'bcubed', 'nmi']

# gcn_v configs
_work_dir = 'work_dir'
ckpt_name = 'latest'  # epoch_80000
gcnv_cfg = './vegcn/configs/cfg_train_gcnv_ms1m.py'
gcnv_cfg_name = rm_suffix(osp.basename(gcnv_cfg))
gcnv_cfg = Config.fromfile(gcnv_cfg)
gcnv_cfg.load_from = '{}/{}/{}/{}.pth'.format(prefix, _work_dir, gcnv_cfg_name,
                                              ckpt_name)

use_gcn_feat = True
feat_paths = []
pred_conf_paths = []
gcnv_nhid = gcnv_cfg.model.kwargs.nhid
for name in [train_name, test_name]:
    gcnv_prefix = '{}/{}/{}/{}_gcnv_k_{}_th_{}'.format(prefix, _work_dir,
                                                       gcnv_cfg_name, name,
                                                       gcnv_cfg.knn,
                                                       gcnv_cfg.th_sim)
    feat_paths.append(
        osp.join(gcnv_prefix, 'features', '{}.bin'.format(ckpt_name)))
# Example 6
    def __init__(self, cfg):
        feat_path = cfg['feat_path']
        label_path = cfg.get('label_path', None)
        knn_graph_path = cfg.get('knn_graph_path', None)

        self.k = cfg['k']
        self.feature_dim = cfg['feature_dim']
        self.is_norm_feat = cfg.get('is_norm_feat', True)
        self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

        self.th_sim = cfg.get('th_sim', 0.)
        self.max_conn = cfg.get('max_conn', 1)
        self.conf_metric = cfg.get('conf_metric')

        with Timer('read meta and feature'):
            if label_path is not None:
                self.lb2idxs, self.idx2lb = read_meta(label_path)
                self.inst_num = len(self.idx2lb)
                self.gt_labels = intdict2ndarray(self.idx2lb)
                self.ignore_label = False
            else:
                self.inst_num = -1
                self.ignore_label = True
            self.features = read_probs(feat_path, self.inst_num,
                                       self.feature_dim)
            if self.is_norm_feat:
                self.features = l2norm(self.features)
            if self.inst_num == -1:
                self.inst_num = self.features.shape[0]
            self.size = 1  # take the entire graph as input

        with Timer('read knn graph'):
            if knn_graph_path is not None and os.path.isfile(knn_graph_path):
                knns = np.load(knn_graph_path)['data']
            else:
                if knn_graph_path is not None:
                    print('knn_graph_path does not exist: {}'.format(
                        knn_graph_path))
                
                prefix = osp.dirname(feat_path)
                name = rm_suffix(osp.basename(feat_path))
                # find root folder of `features`
                prefix = osp.dirname(prefix)
                knn_prefix = osp.join(prefix, 'knns', name)
                knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                                  cfg.knn)

            adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

            # build symmetric adjacency matrix
            adj = build_symmetric_adj(adj, self_loop=True)
            adj = row_normalize(adj)
            if self.save_decomposed_adj:
                adj = sparse_mx_to_indices_values(adj)
                self.adj_indices, self.adj_values, self.adj_shape = adj
            else:
                self.adj = adj

            # convert knns to (dists, nbrs)
            self.dists, self.nbrs = knns2ordered_nbrs(knns)

        print('feature shape: {}, k: {}, norm_feat: {}'.format(
            self.features.shape, self.k, self.is_norm_feat))

        if not self.ignore_label:
            with Timer('Prepare ground-truth label'):
                self.labels = confidence(feats=self.features,
                                         dists=self.dists,
                                         nbrs=self.nbrs,
                                         metric=self.conf_metric,
                                         idx2lb=self.idx2lb,
                                         lb2idxs=self.lb2idxs)
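                # `confidence` computes the per-node ground-truth confidence
                # that GCN-V regresses against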
                if cfg.eval_interim:
                    _, self.peaks = confidence_to_peaks(
                        self.dists, self.nbrs, self.labels, self.max_conn)
# Example 7
def test_gcn_v(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    folder = '{}_gcnv_k_{}_th_{}'.format(cfg.test_name, cfg.knn, cfg.th_sim)
    oprefix = osp.join(cfg.work_dir, folder)
    oname = osp.basename(rm_suffix(cfg.load_from))
    opath_pred_confs = osp.join(oprefix, 'pred_confs', '{}.npz'.format(oname))

    if osp.isfile(opath_pred_confs) and not cfg.force:
        data = np.load(opath_pred_confs)
        pred_confs = data['pred_confs']
        inst_num = data['inst_num']
        gcn_feat = None  # cached confidences carry no GCN features
        if inst_num != dataset.inst_num:
            logger.warning(
                'instance number in {} is different from dataset: {} vs {}'.
                format(opath_pred_confs, inst_num, dataset.inst_num))
    else:
        pred_confs, gcn_feat = test(model, dataset, cfg, logger)
        inst_num = dataset.inst_num

    logger.info('pred_confs: mean({:.4f}). max({:.4f}), min({:.4f})'.format(
        pred_confs.mean(), pred_confs.max(), pred_confs.min()))

    logger.info('Convert to cluster')
    with Timer('Prediction to peaks'):
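        # each node links to at most `max_conn` neighbors with higher
        # confidence within its kNN; these links define the "peaks"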
        pred_dist2peak, pred_peaks = confidence_to_peaks(
            dataset.dists, dataset.nbrs, pred_confs, cfg.max_conn)

    if not dataset.ignore_label and cfg.eval_interim:
        # evaluate the intermediate results
        for i in range(cfg.max_conn):
            num = len(dataset.peaks)
            pred_peaks_i = np.arange(num)
            peaks_i = np.arange(num)
            for j in range(num):
                if len(pred_peaks[j]) > i:
                    pred_peaks_i[j] = pred_peaks[j][i]
                if len(dataset.peaks[j]) > i:
                    peaks_i[j] = dataset.peaks[j][i]
            acc = accuracy(pred_peaks_i, peaks_i)
            logger.info('[{}-th conn] accuracy of peak match: {:.4f}'.format(
                i + 1, acc))
            acc = 0.
            for idx, peak in enumerate(pred_peaks_i):
                acc += int(dataset.idx2lb[peak] == dataset.idx2lb[idx])
            acc /= len(pred_peaks_i)
            logger.info(
                '[{}-th conn] accuracy of peak label match: {:.4f}'.format(
                    i + 1, acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau_0)):
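        # peak edges are pruned by the tau_0 cut before connected
        # components are taken as clusters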
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau_0,
                                      inst_num)

    if cfg.save_output:
        logger.info('save predicted confs to {}'.format(opath_pred_confs))
        mkdir_if_no_exists(opath_pred_confs)
        np.savez_compressed(opath_pred_confs,
                            pred_confs=pred_confs,
                            inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)

        opath_pred_labels = osp.join(
            cfg.work_dir, folder, 'tau_{}_pred_labels.txt'.format(cfg.tau_0))
        logger.info('save predicted labels to {}'.format(opath_pred_labels))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

    if cfg.use_gcn_feat:
        # gcn_feat is saved to disk for GCN-E
        opath_feat = osp.join(oprefix, 'features', '{}.bin'.format(oname))
        if not osp.isfile(opath_feat) or cfg.force:
            assert gcn_feat is not None, \
                'GCN features were not computed; rerun with cfg.force'
            mkdir_if_no_exists(opath_feat)
            write_feat(opath_feat, gcn_feat)

        name = rm_suffix(osp.basename(opath_feat))
        prefix = oprefix
        ds = BasicDataset(name=name,
                          prefix=prefix,
                          dim=cfg.model['kwargs']['nhid'],
                          normalize=True)
        ds.info()

        # use top embedding of GCN to rebuild the kNN graph
        with Timer('connect to higher confidence with use_gcn_feat'):
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix,
                              ds.features,
                              cfg.knn_method,
                              cfg.knn,
                              is_rebuild=True)
            dists, nbrs = knns2ordered_nbrs(knns)

            pred_dist2peak, pred_peaks = confidence_to_peaks(
                dists, nbrs, pred_confs, cfg.max_conn)
            pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                          inst_num)

        # save clustering results
        if cfg.save_output:
            oname_meta = '{}_gcn_feat'.format(name)
            opath_pred_labels = osp.join(
                oprefix, oname_meta, 'tau_{}_pred_labels.txt'.format(cfg.tau))
            mkdir_if_no_exists(opath_pred_labels)

            idx2lb = list2dict(pred_labels, ignore_value=-1)
            write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

        # evaluation
        if not dataset.ignore_label:
            print('==> evaluation')
            for metric in cfg.metrics:
                evaluate(dataset.gt_labels, pred_labels, metric)
        # inspection helper: copy every image into a folder named after its
        # predicted cluster (paths below are user-specific)
        import json
        import os
        import shutil
        img_labels = json.load(
            open(r'/home/finn/research/data/clustering_data/test_index.json',
                 'r',
                 encoding='utf-8'))
        output = r'/home/finn/research/data/clustering_data/mr_gcn_output'
        for label in set(pred_labels):
            cluster_dir = os.path.join(output, f'cluster_{label}')
            if not os.path.exists(cluster_dir):
                os.mkdir(cluster_dir)
        for image in img_labels:
            label = pred_labels[img_labels[image]]
            shutil.copy2(
                image,
                os.path.join(output, f'cluster_{label}',
                             os.path.split(image)[-1]))