def test_lgcn(model, cfg, logger):
    """Predict edges with the model, cluster them and evaluate the result.

    Edges and scores are read from ``<cfg.work_dir>/pred_edges_scores.npz``
    when that file exists and ``cfg.force`` is falsy; otherwise they are
    produced by calling ``test``. The edges are then clustered with
    ``graph_clustering_dynamic_th`` and, unless the dataset ignores labels,
    every metric in ``cfg.metrics`` is evaluated twice: on all instances and
    on the instances that remain after removing size-1 clusters.

    Args:
        model: model forwarded to ``test``.
        cfg: config object. This function reads ``model['kwargs']``,
            ``test_data``, ``work_dir``, ``force``, ``max_sz``, ``step``,
            ``pool``, ``save_output`` and ``metrics``. ``cfg.test_data`` is
            mutated: every model kwarg is copied onto it as an attribute.
        logger: logger used for the instance-number warning and forwarded
            to ``test``.
    """
    # copy the model kwargs onto the test-data config before building it
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.test_data)

    ofn_pred = os.path.join(cfg.work_dir, 'pred_edges_scores.npz')
    if os.path.isfile(ofn_pred) and not cfg.force:
        # np.load keeps the .npz archive open; the context manager closes it
        # once the arrays have been read into memory.
        with np.load(ofn_pred) as data:
            edges = data['edges']
            scores = data['scores']
            inst_num = data['inst_num']
        if inst_num != len(dataset):
            # the mismatch is only reported; the cached prediction is
            # still used
            logger.warning(
                'instance number in {} is different from dataset: {} vs {}'.
                format(ofn_pred, inst_num, len(dataset)))
    else:
        edges, scores, inst_num = test(model, dataset, cfg, logger)

    # produce predicted labels
    clusters = graph_clustering_dynamic_th(edges,
                                           scores,
                                           max_sz=cfg.max_sz,
                                           step=cfg.step,
                                           pool=cfg.pool)
    pred_idx2lb = clusters2labels(clusters)
    pred_labels = intdict2ndarray(pred_idx2lb)

    if cfg.save_output:
        print('save predicted edges and scores to {}'.format(ofn_pred))
        np.savez_compressed(ofn_pred,
                            edges=edges,
                            scores=scores,
                            inst_num=inst_num)
        ofn_meta = os.path.join(cfg.work_dir, 'pred_labels.txt')
        write_meta(ofn_meta, pred_idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        gt_labels = dataset.labels
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels, metric)

        single_cluster_idxs = get_cluster_idxs(clusters, size=1)
        print('==> evaluation (removing {} single clusters)'.format(
            len(single_cluster_idxs)))
        # np.setdiff1d already returns an ndarray, no extra copy is needed
        remain_idxs = np.setdiff1d(np.arange(len(dataset)),
                                   np.array(single_cluster_idxs))
        for metric in cfg.metrics:
            evaluate(gt_labels[remain_idxs], pred_labels[remain_idxs], metric)
def generate_basic_proposals(oprefix,
                             knn_prefix,
                             feats,
                             feat_dim=256,
                             knn_method='faiss',
                             k=80,
                             th_knn=0.6,
                             th_step=0.05,
                             minsz=3,
                             maxsz=300,
                             is_rebuild=False,
                             is_save_proposals=True,
                             force=False,
                             **kwargs):
    """Generate iteration-0 cluster proposals from a kNN graph.

    Builds kNNs with ``build_knns``, groups them with ``super_vertex`` (or
    re-reads a previously written ``pred_labels.txt`` unless ``is_rebuild``
    is set), drops clusters via ``filter_clusters(clusters, minsz)`` and
    optionally saves the proposals.

    Args:
        oprefix: root directory under which the output folder is created.
        knn_prefix: prefix forwarded to ``build_knns``.
        feats: features forwarded to ``build_knns``.
        feat_dim: not used by this function.
        knn_method: kNN backend name; also part of the output folder name.
        k: forwarded to ``build_knns`` and ``super_vertex``.
        th_knn: forwarded to ``super_vertex``.
        th_step: forwarded to ``super_vertex``.
        minsz: forwarded to ``filter_clusters``.
        maxsz: forwarded to ``super_vertex``.
        is_rebuild: recompute kNNs and clusters even if cached.
        is_save_proposals: whether to call ``save_proposals``.
        force: forwarded to ``save_proposals``.
        **kwargs: accepted and ignored.

    Returns:
        Tuple ``(ofolder_proposals, ofn_pred_labels)``. The proposals path
        is returned even when ``is_save_proposals`` is False, in which case
        this function does not create it.
    """
    print('k={}, th_knn={}, th_step={}, maxsz={}, is_rebuild={}'.format(
        k, th_knn, th_step, maxsz, is_rebuild))

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = osp.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, minsz, maxsz))
    ofn_pred_labels = osp.join(ofolder, 'pred_labels.txt')
    # exist_ok avoids the check-then-create race when several jobs share
    # the same output prefix
    os.makedirs(ofolder, exist_ok=True)
    if not osp.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, maxsz)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz)

    # output cluster proposals
    ofolder_proposals = osp.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        os.makedirs(ofolder_proposals, exist_ok=True)
        save_proposals(clusters, knns, ofolder=ofolder_proposals, force=force)

    return ofolder_proposals, ofn_pred_labels
Exemple #3
0
def chinese_whispers_fast(feats, prefix, name, knn_method, knn, th_sim, iters,
                          **kwargs):
    """ Chinese Whispers Clustering Algorithm (matrix formulation)

    Paper: Chinese whispers: an efficient graph clustering algorithm
            and its application to natural language processing problems.
    This implementation follows the matrix operations described in Figure 4
    of the paper. `maxrow` and `D^{t-1} * A_G` are switched to make
    post-processing easier.
    The current result is inferior to `chinese_whispers` because it lacks the
    random mechanism of the iterative algorithm. The paper introduces two
    operations to tackle this issue, namely `random mutation` and
    `keep class`. However, it is not very clear how to set these two
    hyper-parameters.

    Returns the values of the mapping produced by `clusters2labels`, as a
    list.
    """
    assert len(feats) > 1

    with Timer('create graph'):
        knns = build_knns(os.path.join(prefix, 'knns', name), feats,
                          knn_method, knn)
        # the ordered neighbours are not used below; the call is retained
        # so this function behaves exactly as before
        _dists, _nbrs = knns2ordered_nbrs(knns, sort=True)
        A = build_symmetric_adj(fast_knns2spmat(knns,
                                                knn,
                                                th_sim,
                                                use_sim=True),
                                self_loop=False)

        node_num = len(feats)
        print('#nodes: {}, #edges: {}'.format(node_num, A.nnz))

    with Timer('whisper iteratively (iters={})'.format(iters)):
        D = identity(node_num)
        for _ in range(iters):
            # D * A is equal to D.dot(A)
            D = _maxrow(D * A, node_num)

        assert D.nnz == node_num

    # group node indices by the column that holds each row's entry
    members = {}
    for node, cluster_id in enumerate(D.tocoo().col):
        members.setdefault(cluster_id, []).append(node)

    print('#cluster: {}'.format(len(members)))
    return list(clusters2labels(members.values()).values())
def generate_proposals(oprefix,
                       knn_prefix,
                       feats,
                       feat_dim=256,
                       knn_method='faiss',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    """Generate iteration-0 cluster proposals from a kNN graph.

    Builds kNNs with ``build_knns``, groups them with ``super_vertex`` (or
    re-reads a previously written ``pred_labels.txt`` unless ``is_rebuild``
    is set), filters with ``filter_clusters(clusters, min_size)`` and, when
    ``is_save_proposals`` is set, writes the proposals to
    ``<output folder>/proposals`` with ``force=True``.

    Args:
        oprefix: root directory under which the output folder is created.
        knn_prefix: prefix forwarded to ``build_knns``.
        feats: features forwarded to ``build_knns``.
        feat_dim: not used by this function.
        knn_method: kNN backend name; also part of the output folder name.
        k: forwarded to ``build_knns`` and ``super_vertex``.
        th_knn: forwarded to ``super_vertex``.
        th_step: forwarded to ``super_vertex``.
        min_size: forwarded to ``filter_clusters``.
        max_size: forwarded to ``super_vertex``.
        is_rebuild: recompute kNNs and clusters even if cached.
        is_save_proposals: whether to call ``save_proposals``.

    Returns:
        None.
    """
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.format(
        k, th_knn, th_step, max_size, is_rebuild))

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, min_size, max_size))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    # exist_ok avoids the check-then-create race when several jobs share
    # the same output prefix
    os.makedirs(ofolder, exist_ok=True)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # output cluster proposals
    if is_save_proposals:
        # separate name so `ofolder` keeps meaning the clustering folder
        ofolder_proposals = os.path.join(ofolder, 'proposals')
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        os.makedirs(ofolder_proposals, exist_ok=True)
        save_proposals(clusters, knns, ofolder=ofolder_proposals, force=True)
Exemple #5
0
def generate_iter_proposals(oprefix,
                            knn_prefix,
                            feats,
                            feat_dim=256,
                            knn_method='faiss',
                            k=80,
                            th_knn=0.6,
                            th_step=0.05,
                            minsz=3,
                            maxsz=300,
                            sv_minsz=2,
                            sv_maxsz=5,
                            sv_labels=None,
                            sv_knn_prefix=None,
                            is_rebuild=False,
                            is_save_proposals=True,
                            force=False,
                            **kwargs):
    """Generate the next iteration of proposals from existing super vertices.

    Reads the super-vertex clustering stored in ``sv_labels``, replaces each
    super vertex by the mean feature of its members, builds kNNs over those
    means and groups them again with ``super_vertex``. The resulting groups
    are expanded back to instance indices, written to ``pred_labels.txt``
    and optionally saved as proposals.

    Args:
        oprefix: root directory under which the output folder is created.
        knn_prefix: prefix forwarded to ``build_knns`` for the
            super-vertex kNNs.
        feats: instance features; indexed as ``feats[c, :]`` per cluster.
        feat_dim: not used by this function.
        knn_method: kNN backend name; also part of the output folder name.
        k: forwarded to ``build_knns`` and ``super_vertex``.
        th_knn: forwarded to ``super_vertex``.
        th_step: forwarded to ``super_vertex``.
        minsz: lower bound passed to the final ``filter_clusters`` call.
        maxsz: upper bound passed to the final ``filter_clusters`` call.
        sv_minsz: passed to ``filter_clusters`` on the freshly built
            groups of super vertices; must be >= 2.
        sv_maxsz: forwarded to ``super_vertex``.
        sv_labels: path of the label file from the previous iteration.
            Required despite the ``None`` default.
        sv_knn_prefix: prefix forwarded to ``get_knns_from_path``;
            defaults to ``knn_prefix``.
        is_rebuild: recompute kNNs and clusters even if cached.
        is_save_proposals: whether to call ``save_proposals``.
        force: forwarded to ``save_proposals``.
        **kwargs: accepted and ignored.

    Returns:
        Tuple ``(ofolder_proposals, ofn_pred_labels)``.

    Raises:
        TypeError: if ``sv_labels`` is None.
        FileNotFoundError: if ``sv_labels`` does not exist.
    """
    assert sv_minsz >= 2, "sv_minsz >= 2 to avoid duplicated proposals"
    print('k={}, th_knn={}, th_step={}, minsz={}, maxsz={}, '
          'sv_minsz={}, sv_maxsz={}, is_rebuild={}'.format(
              k, th_knn, th_step, minsz, maxsz, sv_minsz, sv_maxsz,
              is_rebuild))

    # os.path.exists(None) raises an opaque TypeError; keep the exception
    # type but say what is actually wrong
    if sv_labels is None:
        raise TypeError('sv_labels must be the path of a label file, '
                        'got None.')
    if not os.path.exists(sv_labels):
        raise FileNotFoundError('{} not found.'.format(sv_labels))

    if sv_knn_prefix is None:
        sv_knn_prefix = knn_prefix

    # get iter and knns from super vertex path; this must run before
    # `feats` is rebound to the super-vertex means below
    _iter = get_iter_from_path(sv_labels) + 1
    knns_inst = get_knns_from_path(sv_labels, sv_knn_prefix, feats)
    print('read sv_clusters from {}'.format(sv_labels))
    sv_lb2idxs, sv_idx2lb = read_meta(sv_labels)
    inst_num = len(sv_idx2lb)
    sv_clusters = labels2clusters(sv_lb2idxs)
    # one feature per super vertex: the mean over its member rows
    feats = np.array([feats[c, :].mean(axis=0) for c in sv_clusters])
    print('average feature of super vertices:', feats.shape)

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix,
        '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_sv_minsz_{}_maxsz_{}_iter_{}'.
        format(knn_method, k, th_knn, th_step, minsz, maxsz, sv_minsz,
               sv_maxsz, _iter))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    # exist_ok avoids the check-then-create race when several jobs share
    # the same output prefix
    os.makedirs(ofolder, exist_ok=True)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices (iter={})'.format(_iter)):
            clusters = super_vertex(knns, k, th_knn, th_step, sv_maxsz)
            clusters = filter_clusters(clusters, sv_minsz)
            # expand every group of super vertices to instance indices
            clusters = [[x for c in cluster for x in sv_clusters[c]]
                        for cluster in clusters]
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels, inst_num=inst_num)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz, maxsz)

    # output cluster proposals; they are saved with the instance-level
    # knns, not the super-vertex knns built above
    ofolder_proposals = os.path.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        os.makedirs(ofolder_proposals, exist_ok=True)
        save_proposals(clusters,
                       knns_inst,
                       ofolder=ofolder_proposals,
                       force=force)

    return ofolder_proposals, ofn_pred_labels
def chinese_whispers(feats, prefix, name, knn_method, knn, th_sim, iters,
                     **kwargs):
    """ Chinese Whispers Clustering Algorithm (iterative, randomized)

    Paper: Chinese whispers: an efficient graph clustering algorithm
            and its application to natural language processing problems.
    Reference code:
        - http://alexloveless.co.uk/data/chinese-whispers-graph-clustering-in-python/
        - https://github.com/zhly0/facenet-face-cluster-chinese-whispers-

    Every node starts in its own cluster. In each of the `iters` rounds the
    nodes are visited in shuffled order and each adopts the cluster with the
    largest summed edge weight among its neighbours.

    Returns the values of the mapping produced by `clusters2labels`, as a
    list.
    """
    import networkx as nx

    assert len(feats) > 1

    with Timer('create graph'):
        knns = build_knns(os.path.join(prefix, 'knns', name), feats,
                          knn_method, knn)
        spmat = fast_knns2spmat(knns, knn, th_sim, use_sim=True)

        size = len(feats)
        # every node is initially its own cluster
        init_nodes = [(i, {'cluster': i}) for i in range(size)]
        coo = spmat.tocoo()
        weighted_edges = [(src, dst, {'weight': w})
                          for src, dst, w in zip(coo.row, coo.col, coo.data)]

        G = nx.Graph()
        G.add_nodes_from(init_nodes)
        G.add_edges_from(weighted_edges)
        node_num = G.number_of_nodes()
        edge_num = G.number_of_edges()
        assert size == node_num
        print('#nodes: {}, #edges: {}'.format(node_num, edge_num))

    with Timer('whisper iteratively (iters={})'.format(iters)):
        cluster_nodes = list(G.nodes())
        for _ in range(iters):
            visit_order = list(range(node_num))
            random.shuffle(visit_order)
            for pos in visit_order:
                node = cluster_nodes[pos]
                nbrs = G[node]
                if not nbrs:
                    # isolated node keeps its current cluster
                    continue
                # accumulate the edge weight towards each neighbouring cluster
                votes = {}
                for nbr, edge_attrs in nbrs.items():
                    nbr_cluster = G.nodes[nbr]['cluster']
                    votes[nbr_cluster] = (votes.get(nbr_cluster, 0) +
                                          edge_attrs['weight'])

                # set the class of node to its neighbor with largest weight;
                # on ties the first cluster encountered wins
                best_cluster, _ = max(votes.items(), key=lambda kv: kv[1])
                G.nodes[node]['cluster'] = best_cluster

    members = {}
    for node, attrs in G.nodes.items():
        members.setdefault(attrs['cluster'], []).append(node)

    print('#cluster: {}'.format(len(members)))
    return list(clusters2labels(members.values()).values())
Exemple #7
0
def generate_proposals(oprefix,
                       feats,
                       feat_dim=256,
                       knn_method='hnsw',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    """Generate iteration-0 cluster proposals, building the kNN index inline.

    kNNs are read from ``<oprefix>/<knn_method>_k_<k>.npz`` when that file
    exists and ``is_rebuild`` is falsy; otherwise an index is built with
    ``knn_hnsw`` or ``knn_faiss`` and the kNNs are dumped to that file.
    The kNNs are then grouped with ``super_vertex`` (or re-read from a
    previously written ``pred_labels.txt``), filtered with
    ``filter_clusters(clusters, min_size)`` and, when ``is_save_proposals``
    is set, written to ``<output folder>/proposals`` with ``force=True``.

    Args:
        oprefix: root directory for the kNN files and the output folder.
        feats: features passed to the kNN index builder.
        feat_dim: not used by this function.
        knn_method: ``'hnsw'`` or ``'faiss'``.
        k: forwarded to the index builder and ``super_vertex``.
        th_knn: forwarded to ``super_vertex``.
        th_step: forwarded to ``super_vertex``.
        min_size: forwarded to ``filter_clusters``.
        max_size: forwarded to ``super_vertex``.
        is_rebuild: recompute kNNs and clusters even if cached.
        is_save_proposals: whether to call ``save_proposals``.

    Returns:
        None.

    Raises:
        KeyError: if the kNNs must be built and ``knn_method`` is neither
            ``'hnsw'`` nor ``'faiss'``.
    """
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.format(
        k, th_knn, th_step, max_size, is_rebuild))

    # knn retrieval
    oprefix = os.path.join(oprefix, '{}_k_{}'.format(knn_method, k))
    knn_fn = oprefix + '.npz'
    if not os.path.isfile(knn_fn) or is_rebuild:
        index_fn = oprefix + '.index'
        with Timer('build index'):
            if knn_method == 'hnsw':
                from proposals import knn_hnsw
                index = knn_hnsw(feats, k, index_fn)
            elif knn_method == 'faiss':
                from proposals import knn_faiss
                index = knn_faiss(feats, k, index_fn)
            else:
                # adjacent literals instead of a backslash continuation,
                # which embedded the source indentation in the message
                raise KeyError('Unsupported method({}). '
                               'Only support hnsw and faiss currently'.format(
                                   knn_method))
            knns = index.get_knns()
        with Timer('dump knns to {}'.format(knn_fn)):
            dump_data(knn_fn, knns, force=True)
    else:
        print('read knn from {}'.format(knn_fn))
        knns = load_data(knn_fn)

    # obtain cluster proposals
    ofolder = oprefix + '_th_{}_step_{}_minsz_{}_maxsz_{}_iter0'.format(
        th_knn, th_step, min_size, max_size)
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    # exist_ok avoids the check-then-create race when several jobs share
    # the same output prefix
    os.makedirs(ofolder, exist_ok=True)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # output cluster proposals
    if is_save_proposals:
        # separate name so `ofolder` keeps meaning the clustering folder
        ofolder_proposals = os.path.join(ofolder, 'proposals')
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        os.makedirs(ofolder_proposals, exist_ok=True)
        save_proposals(clusters, knns, ofolder=ofolder_proposals, force=True)