def test_lgcn(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.test_data)

    ofn_pred = os.path.join(cfg.work_dir, 'pred_edges_scores.npz')
    if os.path.isfile(ofn_pred) and not cfg.force:
        data = np.load(ofn_pred)
        edges = data['edges']
        scores = data['scores']
        inst_num = data['inst_num']
        if inst_num != len(dataset):
        logger.warning(
                'instance number in {} is different from dataset: {} vs {}'.
                format(ofn_pred, inst_num, len(dataset)))
    else:
        edges, scores, inst_num = test(model, dataset, cfg, logger)

    # produce predicted labels
    clusters = graph_clustering_dynamic_th(edges,
                                           scores,
                                           max_sz=cfg.max_sz,
                                           step=cfg.step,
                                           pool=cfg.pool)
    pred_idx2lb = clusters2labels(clusters)
    pred_labels = intdict2ndarray(pred_idx2lb)

    if cfg.save_output:
        print('save predicted edges and scores to {}'.format(ofn_pred))
        np.savez_compressed(ofn_pred,
                            edges=edges,
                            scores=scores,
                            inst_num=inst_num)
        ofn_meta = os.path.join(cfg.work_dir, 'pred_labels.txt')
        write_meta(ofn_meta, pred_idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        gt_labels = dataset.labels
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels, metric)

        single_cluster_idxs = get_cluster_idxs(clusters, size=1)
        print('==> evaluation (removing {} single clusters)'.format(
            len(single_cluster_idxs)))
        remain_idxs = np.setdiff1d(np.arange(len(dataset)),
                                   np.array(single_cluster_idxs))
        remain_idxs = np.array(remain_idxs)
        for metric in cfg.metrics:
            evaluate(gt_labels[remain_idxs], pred_labels[remain_idxs], metric)
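
# A minimal sketch of the two label helpers used above. These are
# hypothetical re-implementations for illustration; the repo's own
# clusters2labels/intdict2ndarray are assumed to behave this way.
import numpy as np

def _clusters2labels_sketch(clusters):
    # map every instance index to the id of the cluster that contains it
    return {idx: lb for lb, cluster in enumerate(clusters) for idx in cluster}

def _intdict2ndarray_sketch(idx2lb, default=-1):
    # densify {index: label} into an array; unassigned slots keep `default`
    labels = np.full(max(idx2lb) + 1, default, dtype=np.int64)
    for idx, lb in idx2lb.items():
        labels[idx] = lb
    return labels

# e.g. [[0, 2], [1, 3]] -> {0: 0, 2: 0, 1: 1, 3: 1} -> array([0, 1, 0, 1])
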
def generate_basic_proposals(oprefix,
                             knn_prefix,
                             feats,
                             feat_dim=256,
                             knn_method='faiss',
                             k=80,
                             th_knn=0.6,
                             th_step=0.05,
                             minsz=3,
                             maxsz=300,
                             is_rebuild=False,
                             is_save_proposals=True,
                             force=False,
                             **kwargs):
    print('k={}, th_knn={}, th_step={}, maxsz={}, is_rebuild={}'.format(
        k, th_knn, th_step, maxsz, is_rebuild))

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = osp.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, minsz, maxsz))
    ofn_pred_labels = osp.join(ofolder, 'pred_labels.txt')
    if not osp.exists(ofolder):
        os.makedirs(ofolder)
    if not osp.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, maxsz)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz)

    # output cluster proposals
    ofolder_proposals = osp.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        if not osp.exists(ofolder_proposals):
            os.makedirs(ofolder_proposals)
        save_proposals(clusters, knns, ofolder=ofolder_proposals, force=force)

    return ofolder_proposals, ofn_pred_labels
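
# Hedged usage sketch for generate_basic_proposals. The feature file, its
# dtype/shape, and the output prefixes below are placeholders (assumptions),
# not values taken from this repo:
#
#   import numpy as np
#   feats = np.fromfile('features.bin', dtype=np.float32).reshape(-1, 256)
#   proposal_folder, label_file = generate_basic_proposals(
#       oprefix='./data/cluster_proposals',
#       knn_prefix='./data/knns',
#       feats=feats,
#       k=80, th_knn=0.6, th_step=0.05, minsz=3, maxsz=300)
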
def deoverlap(scores,
              proposals,
              tot_inst_num,
              th_pos=-1,
              th_iou=1,
              pred_label_fn=None,
              outlier_scores=None,
              th_outlier=0.5,
              keep_outlier=False):
    print('avg_score(mean: {:.2f}, max: {:.2f}, min: {:.2f})'.format(
        scores.mean(), scores.max(), scores.min()))

    assert len(proposals) == len(scores), '{} vs {}'.format(
        len(proposals), len(scores))
    assert (outlier_scores is None) or isinstance(outlier_scores, dict)

    pos_lst = []
    for idx, prob in enumerate(scores):
        if prob < th_pos:
            continue
        pos_lst.append([idx, prob])
    pos_lst = sorted(pos_lst, key=lambda x: x[1], reverse=True)

    # get all clusters
    clusters = []
    if keep_outlier:
        o_clusters = []
    for idx, _ in tqdm(pos_lst):
        fn_node = proposals[idx]
        cluster = load_data(fn_node)
        cluster, o_cluster = filter_outlier(cluster, fn_node, outlier_scores,
                                            th_outlier)
        clusters.append(cluster)
        if keep_outlier and len(o_cluster) > 0:
            o_clusters.append(o_cluster)

    if keep_outlier:
        print('#outlier_clusters: {}'.format(len(o_clusters)))
        clusters.extend(o_clusters)

    idx2lb, idx2lbs = nms(clusters, th_iou)

    # output stats
    multi_lb_num = 0
    for _, lbs in idx2lbs.items():
        if len(lbs) > 1:
            multi_lb_num += 1
    inst_num = len(idx2lb)
    cls_num = len(set(idx2lb.values()))

    print('#inst: {}, #class: {}, #multi-label: {}'.format(
        inst_num, cls_num, multi_lb_num))
    print('#inst-coverage: {:.2f}'.format(1. * inst_num / tot_inst_num))

    # save to file
    pred_labels = write_meta(pred_label_fn, idx2lb, inst_num=tot_inst_num)

    return pred_labels
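
# De-overlapping depends on nms(clusters, th_iou). A hypothetical greedy
# sketch of the assumed contract (illustration only, not the repo's code):
def _nms_sketch(clusters, th_iou):
    # clusters arrive sorted by score (highest first); a proposal is dropped
    # when the fraction of its members already claimed by higher-scoring
    # proposals exceeds th_iou, and survivors label their unclaimed members
    idx2lb = {}
    for lb, cluster in enumerate(clusters):
        cluster = set(cluster)
        covered = len(cluster & idx2lb.keys()) / len(cluster)
        if covered > th_iou:
            continue
        for idx in cluster:
            idx2lb.setdefault(idx, lb)
    return idx2lb
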
def generate_proposals(oprefix,
                       knn_prefix,
                       feats,
                       feat_dim=256,
                       knn_method='faiss',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.format(
        k, th_knn, th_step, max_size, is_rebuild))

    # build knns
    # knns: for each node, its top-k nearest neighbors and their distances
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, min_size, max_size))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # output cluster proposals
    if is_save_proposals:
        ofolder = os.path.join(ofolder, 'proposals')
        print('saving cluster proposals to {}'.format(ofolder))
        if not os.path.exists(ofolder):
            os.makedirs(ofolder)
        save_proposals(clusters, knns, ofolder=ofolder, force=True)
def deoverlap(probs,
              proposals,
              tot_inst_num,
              th_pos=-1,
              th_iou=1,
              pred_label_fn=None):
    print('avg_score(mean: {:.2f}, max: {:.2f}, min: {:.2f})'.format(
        probs.mean(), probs.max(), probs.min()))

    assert len(proposals) == len(probs), '{} vs {}'.format(
        len(proposals), len(probs))

    pos_lst = []
    for idx, prob in enumerate(probs):
        if prob < th_pos:
            continue
        pos_lst.append([idx, prob])
    pos_lst = sorted(pos_lst, key=lambda x: x[1], reverse=True)

    # get all clusters
    clusters = []
    for idx, _ in tqdm(pos_lst):
        cluster = load_data(proposals[idx])
        clusters.append(cluster)

    idx2lb, idx2lbs = nms(clusters, th_iou)

    # output stats
    multi_lb_num = 0
    for _, lbs in idx2lbs.items():
        if len(lbs) > 1:
            multi_lb_num += 1
    inst_num = len(idx2lb)
    cls_num = len(set(idx2lb.values()))

    print('#inst: {}, #class: {}, #multi-label: {}'.format(
        inst_num, cls_num, multi_lb_num))
    print('#inst-coverage: {:.2f}'.format(1. * inst_num / tot_inst_num))

    # save to file
    pred_labels = write_meta(pred_label_fn, idx2lb, inst_num=tot_inst_num)

    return pred_labels
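
# The positive-list construction in deoverlap can be expressed with numpy;
# a small equivalence sketch (same keep-and-sort semantics assumed):
import numpy as np
probs = np.array([0.9, 0.2, 0.7])
th_pos = 0.5
keep = np.where(probs >= th_pos)[0]       # drop proposals below th_pos
order = keep[np.argsort(-probs[keep])]    # remaining indices, best first
# order -> array([0, 2]), matching the sorted [idx, prob] pairs above
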
def test_cluster_seg(model, cfg, logger):
    assert osp.isfile(cfg.pred_iou_score)

    if cfg.load_from:
        logger.info('load pretrained model from: {}'.format(cfg.load_from))
        load_checkpoint(model, cfg.load_from, strict=True, logger=logger)

    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)

    setattr(cfg.test_data, 'pred_iop_score', cfg.pred_iop_score)

    dataset = build_dataset(cfg.test_data)
    processor = build_processor(cfg.stage)

    inst_num = dataset.inst_num

    # read pred_scores from file and do sanity check
    d = np.load(cfg.pred_iou_score, allow_pickle=True)
    pred_scores = d['data']
    meta = d['meta'].item()
    assert inst_num == meta['tot_inst_num'], '{} vs {}'.format(
        inst_num, meta['tot_inst_num'])

    proposals = [fn_node for fn_node, _ in dataset.tot_lst]
    _proposals = []
    fn_node_pattern = '*_node.npz'
    for proposal_folder in meta['proposal_folders']:
        fn_clusters = sorted(
            glob.glob(osp.join(proposal_folder, fn_node_pattern)))
        _proposals.extend(fn_clusters)
    assert proposals == _proposals, '{} vs {}'.format(len(proposals),
                                                      len(_proposals))

    losses = []
    pred_outlier_scores = []
    stats = {'mean': []}

    if cfg.gpus == 1:
        data_loader = build_dataloader(dataset,
                                       processor,
                                       cfg.test_batch_size_per_gpu,
                                       cfg.workers_per_gpu,
                                       train=False)

        model = MMDataParallel(model, device_ids=range(cfg.gpus))
        if cfg.cuda:
            model.cuda()

        model.eval()
        for i, data in enumerate(data_loader):
            with torch.no_grad():
                output, loss = model(data, return_loss=True)
                losses += [loss.item()]
                if i % cfg.log_config.interval == 0:
                    if dataset.ignore_label:
                        logger.info('[Test] Iter {}/{}'.format(
                            i, len(data_loader)))
                    else:
                        logger.info('[Test] Iter {}/{}: Loss {:.4f}'.format(
                            i, len(data_loader), loss))
                if cfg.save_output:
                    output = F.softmax(output, dim=1)
                    output = output[:, 1, :]
                    scores = output.data.cpu().numpy()
                    pred_outlier_scores.extend(list(scores))
                    stats['mean'] += [scores.mean()]
    else:
        raise NotImplementedError

    if not dataset.ignore_label:
        avg_loss = sum(losses) / len(losses)
        logger.info('[Test] Overall Loss {:.4f}'.format(avg_loss))

    # stats['mean'] is only populated when cfg.save_output is set
    if len(stats['mean']) > 0:
        scores_mean = 1. * sum(stats['mean']) / len(stats['mean'])
        logger.info('mean of pred_outlier_scores: {:.4f}'.format(scores_mean))

    # save predicted scores
    if cfg.save_output:
        if cfg.load_from:
            fn = osp.basename(cfg.load_from)
        else:
            fn = 'random'
        opath = osp.join(cfg.work_dir, fn[:fn.rfind('.pth')] + '.npz')
        meta = {
            'tot_inst_num': inst_num,
            'proposal_folders': cfg.test_data.proposal_folders,
        }
        logger.info('dump pred_outlier_scores ({}) to {}'.format(
            len(pred_outlier_scores), opath))
        np.savez_compressed(opath, data=pred_outlier_scores, meta=meta)

    # post-process
    outlier_scores = {
        fn_node: outlier_score
        for (fn_node,
             _), outlier_score in zip(dataset.lst, pred_outlier_scores)
    }

    # de-overlap (w gcn-s)
    pred_labels_w_seg = deoverlap(pred_scores,
                                  proposals,
                                  inst_num,
                                  cfg.th_pos,
                                  cfg.th_iou,
                                  outlier_scores=outlier_scores,
                                  th_outlier=cfg.th_outlier,
                                  keep_outlier=cfg.keep_outlier)

    # de-overlap (wo gcn-s)
    pred_labels_wo_seg = deoverlap(pred_scores, proposals, inst_num,
                                   cfg.th_pos, cfg.th_iou)

    # save predicted labels
    if cfg.save_output:
        ofn_meta_w_seg = osp.join(cfg.work_dir, 'pred_labels_w_seg.txt')
        ofn_meta_wo_seg = osp.join(cfg.work_dir, 'pred_labels_wo_seg.txt')
        print('save predicted labels to {} and {}'.format(
            ofn_meta_w_seg, ofn_meta_wo_seg))
        pred_idx2lb_w_seg = list2dict(pred_labels_w_seg, ignore_value=-1)
        pred_idx2lb_wo_seg = list2dict(pred_labels_wo_seg, ignore_value=-1)
        write_meta(ofn_meta_w_seg, pred_idx2lb_w_seg, inst_num=inst_num)
        write_meta(ofn_meta_wo_seg, pred_idx2lb_wo_seg, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        gt_labels = dataset.labels
        print('==> evaluation (with gcn-s)')
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels_w_seg, metric)
        print('==> evaluation (without gcn-s)')
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels_wo_seg, metric)
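
# The per-node outlier score in test_cluster_seg is the softmax mass on
# class 1. A shape-only sketch with dummy sizes (batch and node counts are
# assumptions):
import torch
import torch.nn.functional as F
logits = torch.randn(2, 2, 5)              # (batch, 2 classes, 5 nodes)
probs = F.softmax(logits, dim=1)[:, 1, :]  # (2, 5): P(outlier) per node
assert probs.shape == (2, 5)
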
def test_gcn_e(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    pred_peaks = dataset.peaks
    pred_dist2peak = dataset.dist2peak

    ofn_pred = osp.join(cfg.work_dir, 'pred_conns.npz')
    if osp.isfile(ofn_pred) and not cfg.force:
        data = np.load(ofn_pred)
        pred_conns = data['pred_conns']
        inst_num = data['inst_num']
        if inst_num != dataset.inst_num:
            logger.warning(
                'instance number in {} is different from dataset: {} vs {}'.
                format(ofn_pred, inst_num, dataset.inst_num))
    else:
        if cfg.random_conns:
            pred_conns = []
            for nbr, dist, idx in zip(dataset.subset_nbrs,
                                      dataset.subset_dists,
                                      dataset.subset_idxs):
                for _ in range(cfg.max_conn):
                    pred_rel_nbr = np.random.choice(np.arange(len(nbr)))
                    pred_abs_nbr = nbr[pred_rel_nbr]
                    pred_peaks[idx].append(pred_abs_nbr)
                    pred_dist2peak[idx].append(dist[pred_rel_nbr])
                    pred_conns.append(pred_rel_nbr)
            pred_conns = np.array(pred_conns)
        else:
            pred_conns = test(model, dataset, cfg, logger)
            for pred_rel_nbr, nbr, dist, idx in zip(pred_conns,
                                                    dataset.subset_nbrs,
                                                    dataset.subset_dists,
                                                    dataset.subset_idxs):
                pred_abs_nbr = nbr[pred_rel_nbr]
                pred_peaks[idx].extend(pred_abs_nbr)
                pred_dist2peak[idx].extend(dist[pred_rel_nbr])
        inst_num = dataset.inst_num

    if len(pred_conns) > 0:
        logger.info(
            'pred_conns (nbr order): mean({:.1f}), max({}), min({})'.format(
                pred_conns.mean(), pred_conns.max(), pred_conns.min()))

    if not dataset.ignore_label and cfg.eval_interim:
        subset_gt_labels = dataset.subset_gt_labels
        for i in range(cfg.max_conn):
            pred_peaks_labels = np.array([
                dataset.idx2lb[pred_peaks[idx][i]]
                for idx in dataset.subset_idxs
            ])

            acc = accuracy(pred_peaks_labels, subset_gt_labels)
            logger.info(
                '[{}-th] accuracy of pred_peaks labels ({}): {:.4f}'.format(
                    i, len(pred_peaks_labels), acc))

            # the nearest-nbr rule below is valid only when nbrs are sorted by distance
            nearest_idxs = np.where(pred_conns[:, i] == 0)[0]
            acc = accuracy(pred_peaks_labels[nearest_idxs],
                           subset_gt_labels[nearest_idxs])
            logger.info(
                '[{}-th] accuracy of pred labels (nearest: {}): {:.4f}'.format(
                    i, len(nearest_idxs), acc))

            not_nearest_idxs = np.where(pred_conns[:, i] > 0)[0]
            acc = accuracy(pred_peaks_labels[not_nearest_idxs],
                           subset_gt_labels[not_nearest_idxs])
            logger.info(
                '[{}-th] accuracy of pred labels (not nearest: {}): {:.4f}'.
                format(i, len(not_nearest_idxs), acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau)):
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                      inst_num)

    if cfg.save_output:
        logger.info(
            'save predicted connectivity and labels to {}'.format(ofn_pred))
        if not osp.isfile(ofn_pred) or cfg.force:
            np.savez_compressed(ofn_pred,
                                pred_conns=pred_conns,
                                inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)

        folder = '{}_gcne_k_{}_th_{}_ig_{}'.format(cfg.test_name, cfg.knn,
                                                   cfg.th_sim,
                                                   cfg.test_data.ignore_ratio)
        opath_pred_labels = osp.join(cfg.work_dir, folder,
                                     'tau_{}_pred_labels.txt'.format(cfg.tau))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

        # H and C-scores
        gt_dict = {}
        pred_dict = {}
        for i in range(len(dataset.gt_labels)):
            gt_dict[str(i)] = dataset.gt_labels[i]
            pred_dict[str(i)] = pred_labels[i]
        bm = ClusteringBenchmark(gt_dict)
        scores = bm.evaluate_vmeasure(pred_dict)
        # fmi_scores = bm.evaluate_fowlkes_mallows_score(pred_dict)
        print(scores)
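
# The H/C scores above come from ClusteringBenchmark. If that helper is
# unavailable, scikit-learn exposes an equivalent V-measure (assumed
# interchangeable for this evaluation):
from sklearn.metrics import homogeneity_completeness_v_measure
gt = [0, 0, 1, 1]
pred = [1, 1, 0, 0]                # same partition, permuted label ids
h, c, v = homogeneity_completeness_v_measure(gt, pred)
assert h == c == v == 1.0          # V-measure ignores label permutation
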
    return opath


if __name__ == '__main__':
    args = parse_args()

    cluster_func = baseline.__dict__[args.method]

    ds = BasicDataset(name=args.name,
                      prefix=args.prefix,
                      dim=args.dim,
                      normalize=not args.no_normalize)
    ds.info()
    feats = ds.features

    opath = get_output_path(args)

    with Timer('{}'.format(args.method)):
        pred_labels = cluster_func(feats, **args.__dict__)

    # save clustering results
    idx2lb = {}
    for idx, lb in enumerate(pred_labels):
        if lb == -1:
            continue
        idx2lb[idx] = lb
    inst_num = len(pred_labels)
    print('coverage: {} / {} = {:.4f}'.format(len(idx2lb), inst_num,
                                              1. * len(idx2lb) / inst_num))
    write_meta(opath, idx2lb, inst_num=inst_num)
            idxs = lb2idxs[lb]
            iou = compute_iou(cluster, idxs)
        ious.append(iou)
    ious = np.array(ious)

    # rank by iou
    pos_g_labels = np.where(ious > args.th_pos)[0]
    clusters = [[clusters[i], ious[i]] for i in pos_g_labels]
    clusters = sorted(clusters, key=lambda x: x[1], reverse=True)
    clusters = [n for n, _ in clusters]

    inst_num = len(idx2lb)
    pos_idx_set = set()
    for c in clusters:
        pos_idx_set |= set(c)
    print('inst-coverage before nms: {}'.format(1. * len(pos_idx_set) /
                                                inst_num))

    # nms
    idx2lb, _ = nms(clusters, args.th_iou)

    # output stats
    inst_num = len(idx2lb)
    cls_num = len(set(idx2lb.values()))

    print('#inst: {}, #class: {}'.format(inst_num, cls_num))
    print('#inst-coverage: {:.2f}'.format(1. * inst_num / tot_inst_num))

    # save to file
    write_meta(pred_label_fn, idx2lb, inst_num=tot_inst_num)
def test_cluster_det(model, cfg, logger):
    if cfg.load_from:
        logger.info('load pretrained model from: {}'.format(cfg.load_from))
        load_checkpoint(model, cfg.load_from, strict=True, logger=logger)

    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.test_data)
    processor = build_processor(cfg.stage)

    losses = []
    pred_scores = []

    if cfg.gpus == 1:
        data_loader = build_dataloader(dataset,
                                       processor,
                                       cfg.test_batch_size_per_gpu,
                                       cfg.workers_per_gpu,
                                       train=False)

        model = MMDataParallel(model, device_ids=range(cfg.gpus))
        if cfg.cuda:
            model.cuda()

        model.eval()
        for i, data in enumerate(data_loader):
            with torch.no_grad():
                output, loss = model(data, return_loss=True)
                losses += [loss.item()]
                if i % cfg.log_config.interval == 0:
                    if dataset.ignore_label:
                        logger.info('[Test] Iter {}/{}'.format(
                            i, len(data_loader)))
                    else:
                        logger.info('[Test] Iter {}/{}: Loss {:.4f}'.format(
                            i, len(data_loader), loss))
                if cfg.save_output:
                    output = output.view(-1)
                    prob = output.data.cpu().numpy()
                    pred_scores.append(prob)
    else:
        raise NotImplementedError

    if not dataset.ignore_label:
        avg_loss = sum(losses) / len(losses)
        logger.info('[Test] Overall Loss {:.4f}'.format(avg_loss))

    # save predicted scores
    if cfg.save_output:
        if cfg.load_from:
            fn = os.path.basename(cfg.load_from)
        else:
            fn = 'random'
        opath = os.path.join(cfg.work_dir, fn[:fn.rfind('.pth')] + '.npz')
        meta = {
            'tot_inst_num': dataset.inst_num,
            'proposal_folders': cfg.test_data.proposal_folders,
        }
        print('dump pred_score to {}'.format(opath))
        pred_scores = np.concatenate(pred_scores).ravel()
        np.savez_compressed(opath, data=pred_scores, meta=meta)

    # de-overlap
    proposals = [fn_node for fn_node, _ in dataset.lst]
    pred_labels = deoverlap(pred_scores, proposals, dataset.inst_num,
                            cfg.th_pos, cfg.th_iou)

    # save predicted labels
    if cfg.save_output:
        ofn_meta = os.path.join(cfg.work_dir, 'pred_labels.txt')
        print('save predicted labels to {}'.format(ofn_meta))
        pred_idx2lb = list2dict(pred_labels, ignore_value=-1)
        write_meta(ofn_meta, pred_idx2lb, inst_num=dataset.inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        gt_labels = dataset.labels
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels, metric)
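
# The .npz written above stores a python dict under 'meta', so reading it
# back needs allow_pickle=True and .item(), as test_cluster_seg does. A
# round-trip sketch with placeholder values and a throwaway path:
import numpy as np
np.savez_compressed('/tmp/pred_score_demo.npz',
                    data=np.array([0.1, 0.9]),
                    meta={'tot_inst_num': 2, 'proposal_folders': []})
d = np.load('/tmp/pred_score_demo.npz', allow_pickle=True)
scores, meta = d['data'], d['meta'].item()
assert meta['tot_inst_num'] == 2
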
def generate_iter_proposals(oprefix,
                            knn_prefix,
                            feats,
                            feat_dim=256,
                            knn_method='faiss',
                            k=80,
                            th_knn=0.6,
                            th_step=0.05,
                            minsz=3,
                            maxsz=300,
                            sv_minsz=2,
                            sv_maxsz=5,
                            sv_labels=None,
                            sv_knn_prefix=None,
                            is_rebuild=False,
                            is_save_proposals=True,
                            force=False,
                            **kwargs):

    assert sv_minsz >= 2, "sv_minsz >= 2 to avoid duplicated proposals"
    print('k={}, th_knn={}, th_step={}, minsz={}, maxsz={}, '
          'sv_minsz={}, sv_maxsz={}, is_rebuild={}'.format(
              k, th_knn, th_step, minsz, maxsz, sv_minsz, sv_maxsz,
              is_rebuild))

    if not os.path.exists(sv_labels):
        raise FileNotFoundError('{} not found.'.format(sv_labels))

    if sv_knn_prefix is None:
        sv_knn_prefix = knn_prefix

    # get iter and knns from super vertex path
    _iter = get_iter_from_path(sv_labels) + 1
    knns_inst = get_knns_from_path(sv_labels, sv_knn_prefix, feats)
    print('read sv_clusters from {}'.format(sv_labels))
    sv_lb2idxs, sv_idx2lb = read_meta(sv_labels)
    inst_num = len(sv_idx2lb)
    sv_clusters = labels2clusters(sv_lb2idxs)
    # sv_clusters = filter_clusters(sv_clusters, minsz)
    feats = np.array([feats[c, :].mean(axis=0) for c in sv_clusters])
    print('average feature of super vertices:', feats.shape)

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix,
        '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_sv_minsz_{}_maxsz_{}_iter_{}'.
        format(knn_method, k, th_knn, th_step, minsz, maxsz, sv_minsz,
               sv_maxsz, _iter))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices (iter={})'.format(_iter)):
            clusters = super_vertex(knns, k, th_knn, th_step, sv_maxsz)
            clusters = filter_clusters(clusters, sv_minsz)
            clusters = [[x for c in cluster for x in sv_clusters[c]]
                        for cluster in clusters]
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels, inst_num=inst_num)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz, maxsz)

    # output cluster proposals
    ofolder_proposals = os.path.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        if not os.path.exists(ofolder_proposals):
            os.makedirs(ofolder_proposals)
        save_proposals(clusters,
                       knns_inst,
                       ofolder=ofolder_proposals,
                       force=force)

    return ofolder_proposals, ofn_pred_labels
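
# Each iteration collapses a super vertex to the mean of its member features
# before rebuilding the kNN graph. A minimal sketch of that step with dummy
# shapes (sizes are assumptions):
import numpy as np
feats = np.random.rand(6, 4).astype(np.float32)  # 6 instances, dim 4
sv_clusters = [[0, 1], [2, 3, 4], [5]]           # 3 super vertices
sv_feats = np.array([feats[c].mean(axis=0) for c in sv_clusters])
assert sv_feats.shape == (3, 4)                  # one row per super vertex
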
def test_gcn_v(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    folder = '{}_gcnv_k_{}_th_{}'.format(cfg.test_name, cfg.knn, cfg.th_sim)
    oprefix = osp.join(cfg.work_dir, folder)
    oname = osp.basename(rm_suffix(cfg.load_from))
    opath_pred_confs = osp.join(oprefix, 'pred_confs', '{}.npz'.format(oname))

    if osp.isfile(opath_pred_confs) and not cfg.force:
        data = np.load(opath_pred_confs)
        pred_confs = data['pred_confs']
        inst_num = data['inst_num']
        if inst_num != dataset.inst_num:
            logger.warning(
                'instance number in {} is different from dataset: {} vs {}'.
                format(opath_pred_confs, inst_num, dataset.inst_num))
    else:
        pred_confs, gcn_feat = test(model, dataset, cfg, logger)
        inst_num = dataset.inst_num

    logger.info('pred_confs: mean({:.4f}). max({:.4f}), min({:.4f})'.format(
        pred_confs.mean(), pred_confs.max(), pred_confs.min()))

    logger.info('Convert to clusters')
    with Timer('Prediction to peaks'):
        pred_dist2peak, pred_peaks = confidence_to_peaks(
            dataset.dists, dataset.nbrs, pred_confs, cfg.max_conn)

    if not dataset.ignore_label and cfg.eval_interim:
        # evaluate the intermediate results
        for i in range(cfg.max_conn):
            num = len(dataset.peaks)
            pred_peaks_i = np.arange(num)
            peaks_i = np.arange(num)
            for j in range(num):
                if len(pred_peaks[j]) > i:
                    pred_peaks_i[j] = pred_peaks[j][i]
                if len(dataset.peaks[j]) > i:
                    peaks_i[j] = dataset.peaks[j][i]
            acc = accuracy(pred_peaks_i, peaks_i)
            logger.info('[{}-th conn] accuracy of peak match: {:.4f}'.format(
                i + 1, acc))
            acc = 0.
            for idx, peak in enumerate(pred_peaks_i):
                acc += int(dataset.idx2lb[peak] == dataset.idx2lb[idx])
            acc /= len(pred_peaks_i)
            logger.info(
                '[{}-th conn] accuracy of peak label match: {:.4f}'.format(
                    i + 1, acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau_0)):
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau_0,
                                      inst_num)

    if cfg.save_output:
        logger.info('save predicted confs to {}'.format(opath_pred_confs))
        mkdir_if_no_exists(opath_pred_confs)
        np.savez_compressed(opath_pred_confs,
                            pred_confs=pred_confs,
                            inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)

        opath_pred_labels = osp.join(
            cfg.work_dir, folder, 'tau_{}_pred_labels.txt'.format(cfg.tau_0))
        logger.info('save predicted labels to {}'.format(opath_pred_labels))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

    if cfg.use_gcn_feat:
        # gcn_feat is saved to disk for GCN-E
        opath_feat = osp.join(oprefix, 'features', '{}.bin'.format(oname))
        if not osp.isfile(opath_feat) or cfg.force:
            mkdir_if_no_exists(opath_feat)
            write_feat(opath_feat, gcn_feat)

        name = rm_suffix(osp.basename(opath_feat))
        prefix = oprefix
        ds = BasicDataset(name=name,
                          prefix=prefix,
                          dim=cfg.model['kwargs']['nhid'],
                          normalize=True)
        ds.info()

        # use top embedding of GCN to rebuild the kNN graph
        with Timer('connect to higher confidence with use_gcn_feat'):
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix,
                              ds.features,
                              cfg.knn_method,
                              cfg.knn,
                              is_rebuild=True)
            dists, nbrs = knns2ordered_nbrs(knns)

            pred_dist2peak, pred_peaks = confidence_to_peaks(
                dists, nbrs, pred_confs, cfg.max_conn)
            pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                          inst_num)

        # save clustering results
        if cfg.save_output:
            oname_meta = '{}_gcn_feat'.format(name)
            opath_pred_labels = osp.join(
                oprefix, oname_meta, 'tau_{}_pred_labels.txt'.format(cfg.tau))
            mkdir_if_no_exists(opath_pred_labels)

            idx2lb = list2dict(pred_labels, ignore_value=-1)
            write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

        # evaluation

        if not dataset.ignore_label:
            print('==> evaluation')
            for metric in cfg.metrics:
                evaluate(dataset.gt_labels, pred_labels, metric)
        # copy each test image into a folder named after its predicted cluster
        import json
        import os
        import shutil
        img_labels = json.load(
            open(r'/home/finn/research/data/clustering_data/test_index.json',
                 'r',
                 encoding='utf-8'))
        output = r'/home/finn/research/data/clustering_data/mr_gcn_output'
        for label in set(pred_labels):
            os.makedirs(os.path.join(output, f'cluster_{label}'),
                        exist_ok=True)
        for image in img_labels:
            label = pred_labels[img_labels[image]]
            shutil.copy2(
                image,
                os.path.join(output, f'cluster_{label}',
                             os.path.split(image)[-1]))
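
# peaks_to_labels is assumed to keep a link from a node to a predicted peak
# only when the two are close enough, then read clusters off the connected
# components. A hypothetical sketch of the edge cut (treating 1 - dist as
# similarity is an assumption of this illustration):
def _peaks_to_edges_sketch(pred_peaks, pred_dist2peak, tau):
    # pred_peaks / pred_dist2peak: {idx: [peak, ...]} / {idx: [dist, ...]}
    edges = []
    for idx, nbrs in pred_peaks.items():
        for peak, dist in zip(nbrs, pred_dist2peak[idx]):
            if dist < 1 - tau:  # keep edges with similarity above tau
                edges.append((idx, peak))
    return edges
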
def generate_proposals(oprefix,
                       feats,
                       feat_dim=256,
                       knn_method='hnsw',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.format(
        k, th_knn, th_step, max_size, is_rebuild))

    ## knn retrieval
    oprefix = os.path.join(oprefix, '{}_k_{}'.format(knn_method, k))
    knn_fn = oprefix + '.npz'
    if not os.path.isfile(knn_fn) or is_rebuild:
        index_fn = oprefix + '.index'
        with Timer('build index'):
            if knn_method == 'hnsw':
                from proposals import knn_hnsw
                index = knn_hnsw(feats, k, index_fn)
            elif knn_method == 'faiss':
                from proposals import knn_faiss
                index = knn_faiss(feats, k, index_fn)
            else:
                raise KeyError('Unsupported method ({}). Only hnsw and '
                               'faiss are currently supported.'.format(
                                   knn_method))
            knns = index.get_knns()
        with Timer('dump knns to {}'.format(knn_fn)):
            dump_data(knn_fn, knns, force=True)
    else:
        print('read knn from {}'.format(knn_fn))
        knns = load_data(knn_fn)

    # obtain cluster proposals
    ofolder = oprefix + '_th_{}_step_{}_minsz_{}_maxsz_{}_iter0'.format(
        th_knn, th_step, min_size, max_size)
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # output cluster proposals
    if is_save_proposals:
        ofolder = os.path.join(ofolder, 'proposals')
        print('saving cluster proposals to {}'.format(ofolder))
        if not os.path.exists(ofolder):
            os.makedirs(ofolder)
        save_proposals(clusters, knns, ofolder=ofolder, force=True)
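
# knn_hnsw/knn_faiss wrap third-party ANN indices. A bare-bones faiss sketch
# of what such a wrapper does (plain faiss API; the wrapper's exact fields
# and file format are assumptions):
import faiss
import numpy as np
feats = np.random.rand(100, 256).astype(np.float32)
index = faiss.IndexFlatIP(feats.shape[1])  # inner-product similarity;
index.add(feats)                           # L2-normalize feats for cosine
sims, nbrs = index.search(feats, 80)       # top-80 neighbors per node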