def test_lgcn(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.test_data)

    ofn_pred = os.path.join(cfg.work_dir, 'pred_edges_scores.npz')
    if os.path.isfile(ofn_pred) and not cfg.force:
        data = np.load(ofn_pred)
        edges = data['edges']
        scores = data['scores']
        inst_num = data['inst_num']
        if inst_num != len(dataset):
            logger.warning(
                'instance number in {} is different from dataset: {} vs {}'.
                format(ofn_pred, inst_num, len(dataset)))
    else:
        edges, scores, inst_num = test(model, dataset, cfg, logger)

    # produce predicted labels
    clusters = graph_clustering_dynamic_th(edges,
                                           scores,
                                           max_sz=cfg.max_sz,
                                           step=cfg.step,
                                           pool=cfg.pool)
    pred_idx2lb = clusters2labels(clusters)
    pred_labels = intdict2ndarray(pred_idx2lb)

    if cfg.save_output:
        print('save predicted edges and scores to {}'.format(ofn_pred))
        np.savez_compressed(ofn_pred,
                            edges=edges,
                            scores=scores,
                            inst_num=inst_num)
        ofn_meta = os.path.join(cfg.work_dir, 'pred_labels.txt')
        write_meta(ofn_meta, pred_idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        gt_labels = dataset.labels
        for metric in cfg.metrics:
            evaluate(gt_labels, pred_labels, metric)

        # re-evaluate after removing singleton clusters
        single_cluster_idxs = get_cluster_idxs(clusters, size=1)
        print('==> evaluation (removing {} single clusters)'.format(
            len(single_cluster_idxs)))
        remain_idxs = np.setdiff1d(np.arange(len(dataset)),
                                   np.array(single_cluster_idxs))
        for metric in cfg.metrics:
            evaluate(gt_labels[remain_idxs], pred_labels[remain_idxs], metric)


def generate_basic_proposals(oprefix,
                             knn_prefix,
                             feats,
                             feat_dim=256,
                             knn_method='faiss',
                             k=80,
                             th_knn=0.6,
                             th_step=0.05,
                             minsz=3,
                             maxsz=300,
                             is_rebuild=False,
                             is_save_proposals=True,
                             force=False,
                             **kwargs):
    print('k={}, th_knn={}, th_step={}, maxsz={}, is_rebuild={}'.format(
        k, th_knn, th_step, maxsz, is_rebuild))

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = osp.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, minsz, maxsz))
    ofn_pred_labels = osp.join(ofolder, 'pred_labels.txt')
    if not osp.exists(ofolder):
        os.makedirs(ofolder)
    if not osp.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, maxsz)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz)

    # output cluster proposals
    ofolder_proposals = osp.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        if not osp.exists(ofolder_proposals):
            os.makedirs(ofolder_proposals)
        save_proposals(clusters, knns, ofolder=ofolder_proposals, force=force)

    return ofolder_proposals, ofn_pred_labels


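# Hedged usage sketch for `generate_basic_proposals`. The feature file and
# output paths below are illustrative assumptions, not values shipped with
# this repo; only the keyword names come from the signature above.
def _demo_generate_basic_proposals():
    # load flat float32 features and reshape to (N, feat_dim)
    feats = np.fromfile('features.bin', dtype=np.float32).reshape(-1, 256)
    ofolder_proposals, ofn_pred_labels = generate_basic_proposals(
        oprefix='./data/cluster_proposals',    # hypothetical output root
        knn_prefix='./data/knns/part0_train',  # hypothetical knn cache dir
        feats=feats,
        knn_method='faiss',
        k=80,
        th_knn=0.6,
        th_step=0.05,
        minsz=3,
        maxsz=300)
    print('proposals: {}, labels: {}'.format(ofolder_proposals,
                                             ofn_pred_labels))

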
def chinese_whispers_fast(feats, prefix, name, knn_method, knn, th_sim, iters,
                          **kwargs):
    """ Chinese Whispers Clustering Algorithm

    Paper: Chinese whispers: an efficient graph clustering algorithm
        and its application to natural language processing problems.

    This implementation follows the matrix operations described in
    Figure 4 of the paper. We swap the order of `maxrow` and
    `D^{t-1} * A_G` to ease post-processing.
    The current result is inferior to `chinese_whispers`, as it lacks
    the randomness of the iterative algorithm. The paper introduces two
    operations to tackle this, namely `random mutation` and `keep class`,
    but it is not clear how to set these two hyper-parameters.
    """
    assert len(feats) > 1

    with Timer('create graph'):
        knn_prefix = os.path.join(prefix, 'knns', name)
        knns = build_knns(knn_prefix, feats, knn_method, knn)
        dists, nbrs = knns2ordered_nbrs(knns, sort=True)
        spmat = fast_knns2spmat(knns, knn, th_sim, use_sim=True)
        A = build_symmetric_adj(spmat, self_loop=False)

        node_num = len(feats)
        edge_num = A.nnz
        print('#nodes: {}, #edges: {}'.format(node_num, edge_num))

    with Timer('whisper iteratively (iters={})'.format(iters)):
        D = identity(node_num)
        for _ in range(iters):
            D = D * A  # equivalent to D.dot(A)
            D = _maxrow(D, node_num)

        assert D.nnz == node_num

        clusters = {}
        assigned_clusters = D.tocoo().col
        for (node, assigned_cluster) in enumerate(assigned_clusters):
            if assigned_cluster not in clusters:
                clusters[assigned_cluster] = []
            clusters[assigned_cluster].append(node)

    print('#cluster: {}'.format(len(clusters)))
    labels = clusters2labels(clusters.values())
    labels = list(labels.values())

    return labels


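# A minimal sketch of the `_maxrow` step assumed above, following the paper's
# matrix formulation: keep only the largest entry in each row of D and set it
# to 1, so every node belongs to exactly one cluster after each iteration.
# This is an illustrative implementation, not necessarily the one used here.
import numpy as np
from scipy.sparse import csr_matrix


def _maxrow_sketch(D, node_num):
    D = D.tocsr()
    cols = np.zeros(node_num, dtype=np.int64)
    for i in range(node_num):
        start, end = D.indptr[i], D.indptr[i + 1]
        if start == end:
            # an isolated node keeps itself as its own cluster
            cols[i] = i
        else:
            # column index of the row's maximal entry
            cols[i] = D.indices[start + np.argmax(D.data[start:end])]
    rows = np.arange(node_num)
    vals = np.ones(node_num)
    return csr_matrix((vals, (rows, cols)), shape=(node_num, node_num))

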
def generate_proposals(oprefix,
                       knn_prefix,
                       feats,
                       feat_dim=256,
                       knn_method='faiss',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.format(
        k, th_knn, th_step, max_size, is_rebuild))

    # build knns: for each node, its top-k nearest nodes and their distances
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, min_size, max_size))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # output cluster proposals
    if is_save_proposals:
        ofolder = os.path.join(ofolder, 'proposals')
        print('saving cluster proposals to {}'.format(ofolder))
        if not os.path.exists(ofolder):
            os.makedirs(ofolder)
        save_proposals(clusters, knns, ofolder=ofolder, force=True)


def generate_iter_proposals(oprefix,
                            knn_prefix,
                            feats,
                            feat_dim=256,
                            knn_method='faiss',
                            k=80,
                            th_knn=0.6,
                            th_step=0.05,
                            minsz=3,
                            maxsz=300,
                            sv_minsz=2,
                            sv_maxsz=5,
                            sv_labels=None,
                            sv_knn_prefix=None,
                            is_rebuild=False,
                            is_save_proposals=True,
                            force=False,
                            **kwargs):
    assert sv_minsz >= 2, "sv_minsz >= 2 to avoid duplicated proposals"
    print('k={}, th_knn={}, th_step={}, minsz={}, maxsz={}, '
          'sv_minsz={}, sv_maxsz={}, is_rebuild={}'.format(
              k, th_knn, th_step, minsz, maxsz, sv_minsz, sv_maxsz,
              is_rebuild))

    if not os.path.exists(sv_labels):
        raise FileNotFoundError('{} not found.'.format(sv_labels))

    if sv_knn_prefix is None:
        sv_knn_prefix = knn_prefix

    # get iter and knns from the super-vertex path
    _iter = get_iter_from_path(sv_labels) + 1
    knns_inst = get_knns_from_path(sv_labels, sv_knn_prefix, feats)
    print('read sv_clusters from {}'.format(sv_labels))
    sv_lb2idxs, sv_idx2lb = read_meta(sv_labels)
    inst_num = len(sv_idx2lb)
    sv_clusters = labels2clusters(sv_lb2idxs)
    # sv_clusters = filter_clusters(sv_clusters, minsz)
    feats = np.array([feats[c, :].mean(axis=0) for c in sv_clusters])
    print('average feature of super vertices:', feats.shape)

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix,
        '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_sv_minsz_{}_maxsz_{}_iter_{}'.
        format(knn_method, k, th_knn, th_step, minsz, maxsz, sv_minsz,
               sv_maxsz, _iter))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices (iter={})'.format(_iter)):
            clusters = super_vertex(knns, k, th_knn, th_step, sv_maxsz)
            clusters = filter_clusters(clusters, sv_minsz)
            # map super-vertex ids back to instance ids
            clusters = [[x for c in cluster for x in sv_clusters[c]]
                        for cluster in clusters]
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels, inst_num=inst_num)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz, maxsz)

    # output cluster proposals
    ofolder_proposals = os.path.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        if not os.path.exists(ofolder_proposals):
            os.makedirs(ofolder_proposals)
        save_proposals(clusters,
                       knns_inst,
                       ofolder=ofolder_proposals,
                       force=force)

    return ofolder_proposals, ofn_pred_labels


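# A tiny worked example of the super-vertex de-referencing performed in
# `generate_iter_proposals` above (values are made up for illustration):
def _demo_sv_dereference():
    sv_clusters = [[0, 1], [2], [3, 4]]  # instance ids per super vertex
    clusters = [[0, 2]]                  # super-vertex ids per new cluster
    clusters = [[x for c in cluster for x in sv_clusters[c]]
                for cluster in clusters]
    assert clusters == [[0, 1, 3, 4]]    # instance ids per new cluster

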
def chinese_whispers(feats, prefix, name, knn_method, knn, th_sim, iters,
                     **kwargs):
    """ Chinese Whispers Clustering Algorithm

    Paper: Chinese whispers: an efficient graph clustering algorithm
        and its application to natural language processing problems.
    Reference code:
        - http://alexloveless.co.uk/data/chinese-whispers-graph-clustering-in-python/
        - https://github.com/zhly0/facenet-face-cluster-chinese-whispers-
    """
    import networkx as nx

    assert len(feats) > 1

    with Timer('create graph'):
        knn_prefix = os.path.join(prefix, 'knns', name)
        knns = build_knns(knn_prefix, feats, knn_method, knn)
        spmat = fast_knns2spmat(knns, knn, th_sim, use_sim=True)

        size = len(feats)
        nodes = [(n_i, {'cluster': n_i}) for n_i in range(size)]
        c = spmat.tocoo()
        edges = [(n_i, n_j, {
            'weight': s
        }) for n_i, n_j, s in zip(c.row, c.col, c.data)]

        G = nx.Graph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        node_num = G.number_of_nodes()
        edge_num = G.number_of_edges()
        assert size == node_num
        print('#nodes: {}, #edges: {}'.format(node_num, edge_num))

    with Timer('whisper iteratively (iters={})'.format(iters)):
        cluster_nodes = list(G.nodes())
        for _ in range(iters):
            idxs = [i for i in range(node_num)]
            random.shuffle(idxs)
            for idx in idxs:
                node = cluster_nodes[idx]
                nbrs = G[node]
                if len(nbrs) == 0:
                    continue
                cluster2weight = {}
                for nbr in nbrs:
                    assigned_cluster = G.nodes[nbr]['cluster']
                    edge_weight = G[node][nbr]['weight']
                    if assigned_cluster not in cluster2weight:
                        cluster2weight[assigned_cluster] = 0
                    cluster2weight[assigned_cluster] += edge_weight

                # set the cluster of the node to its neighboring cluster
                # with the largest accumulated weight
                cluster2weight = sorted(cluster2weight.items(),
                                        key=lambda kv: kv[1],
                                        reverse=True)
                G.nodes[node]['cluster'] = cluster2weight[0][0]

    clusters = {}
    for (node, data) in G.nodes.items():
        assigned_cluster = data['cluster']
        if assigned_cluster not in clusters:
            clusters[assigned_cluster] = []
        clusters[assigned_cluster].append(node)

    print('#cluster: {}'.format(len(clusters)))
    labels = clusters2labels(clusters.values())
    labels = list(labels.values())

    return labels


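# Hedged usage sketch: cluster random unit-normalized features with the
# graph-based `chinese_whispers` above. The paths and hyper-parameter values
# here are illustrative assumptions, not defaults from this repo.
def _demo_chinese_whispers():
    rng = np.random.RandomState(0)
    feats = rng.rand(1000, 256).astype(np.float32)
    # l2-normalize so inner products behave as cosine similarities
    feats /= np.linalg.norm(feats, axis=1, keepdims=True)
    labels = chinese_whispers(feats,
                              prefix='./data/work_dir',  # hypothetical path
                              name='demo',
                              knn_method='faiss',
                              knn=80,
                              th_sim=0.6,
                              iters=20)
    print('#pred classes: {}'.format(len(set(labels))))

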
def generate_proposals(oprefix,
                       feats,
                       feat_dim=256,
                       knn_method='hnsw',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.format(
        k, th_knn, th_step, max_size, is_rebuild))

    # knn retrieval
    oprefix = os.path.join(oprefix, '{}_k_{}'.format(knn_method, k))
    knn_fn = oprefix + '.npz'
    if not os.path.isfile(knn_fn) or is_rebuild:
        index_fn = oprefix + '.index'
        with Timer('build index'):
            if knn_method == 'hnsw':
                from proposals import knn_hnsw
                index = knn_hnsw(feats, k, index_fn)
            elif knn_method == 'faiss':
                from proposals import knn_faiss
                index = knn_faiss(feats, k, index_fn)
            else:
                raise KeyError(
                    'Unsupported method({}). '
                    'Only hnsw and faiss are supported currently.'.format(
                        knn_method))
            knns = index.get_knns()
        with Timer('dump knns to {}'.format(knn_fn)):
            dump_data(knn_fn, knns, force=True)
    else:
        print('read knn from {}'.format(knn_fn))
        knns = load_data(knn_fn)

    # obtain cluster proposals
    ofolder = oprefix + '_th_{}_step_{}_minsz_{}_maxsz_{}_iter0'.format(
        th_knn, th_step, min_size, max_size)
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # output cluster proposals
    if is_save_proposals:
        ofolder = os.path.join(ofolder, 'proposals')
        print('saving cluster proposals to {}'.format(ofolder))
        if not os.path.exists(ofolder):
            os.makedirs(ofolder)
        save_proposals(clusters, knns, ofolder=ofolder, force=True)