def __init__(self, features, labels, cluster_features=None, k=10,
             levels=1, faiss_gpu=False):
    self.k = k
    self.gs = []
    self.nbrs = []
    self.dists = []
    self.levels = levels

    # Initialize features and labels
    features = l2norm(features.astype('float32'))
    global_features = features.copy()
    if cluster_features is None:
        cluster_features = features
    global_num_nodes = features.shape[0]
    global_edges = ([], [])
    # np.long was removed in NumPy >= 1.24; use np.int64 instead
    global_peaks = np.array([], dtype=np.int64)
    ids = np.arange(global_num_nodes)

    # Recursive graph construction
    for lvl in range(self.levels):
        if features.shape[0] <= self.k:
            self.levels = lvl
            break
        if faiss_gpu:
            knns = build_knns(features, self.k, 'faiss_gpu')
        else:
            knns = build_knns(features, self.k, 'faiss')
        dists, nbrs = knns2ordered_nbrs(knns)
        self.nbrs.append(nbrs)
        self.dists.append(dists)
        density = density_estimation(dists, nbrs, labels)

        g = self._build_graph(features, cluster_features, labels, density,
                              knns)
        self.gs.append(g)

        if lvl >= self.levels - 1:
            break

        # Decode peak nodes
        (new_pred_labels, peaks, global_edges, global_pred_labels,
         global_peaks) = decode(g, 0, 'sim', True, ids, global_edges,
                                global_num_nodes, global_peaks)
        ids = ids[peaks]
        features, labels, cluster_features = build_next_level(
            features, labels, peaks, global_features, global_pred_labels,
            global_peaks)
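# A minimal usage sketch for the hierarchical dataset above, assuming this
# __init__ belongs to a LANDER-style dataset class; `LanderDataset` is a
# hypothetical stand-in for its name, and the random features/labels are
# illustrative only.
import numpy as np

features = np.random.rand(1000, 256).astype('float32')
labels = np.random.randint(0, 50, size=1000)
dataset = LanderDataset(features, labels, k=10, levels=2, faiss_gpu=False)
print('#levels actually built:', len(dataset.gs))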
def knn_dbscan(feats, eps, min_samples, prefix, name, knn_method, knn,
               th_sim, **kwargs):
    knn_prefix = os.path.join(prefix, 'knns', name)
    knns = build_knns(knn_prefix, feats, knn_method, knn)
    sparse_affinity = knns2spmat(knns, knn, th_sim)
    db = cluster.DBSCAN(eps=eps,
                        min_samples=min_samples,
                        n_jobs=mp.cpu_count(),
                        metric='precomputed').fit(sparse_affinity)
    return db.labels_
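# A minimal usage sketch for `knn_dbscan`, assuming 256-d features stored in
# the repo's .bin format; the path, `eps`, and `min_samples` below are
# illustrative, not tuned values.
feats = l2norm(read_probs('data/features/part1_test.bin', -1, 256))
labels = knn_dbscan(feats,
                    eps=0.3,
                    min_samples=5,
                    prefix='data',
                    name='part1_test',
                    knn_method='faiss',
                    knn=80,
                    th_sim=0.)
# DBSCAN marks noise points as -1
print('#clusters:', len(set(labels)) - (1 if -1 in labels else 0))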
def generate_basic_proposals(oprefix,
                             knn_prefix,
                             feats,
                             feat_dim=256,
                             knn_method='faiss',
                             k=80,
                             th_knn=0.6,
                             th_step=0.05,
                             minsz=3,
                             maxsz=300,
                             is_rebuild=False,
                             is_save_proposals=True,
                             force=False,
                             **kwargs):
    print('k={}, th_knn={}, th_step={}, maxsz={}, is_rebuild={}'.format(
        k, th_knn, th_step, maxsz, is_rebuild))

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = osp.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, minsz, maxsz))
    ofn_pred_labels = osp.join(ofolder, 'pred_labels.txt')
    if not osp.exists(ofolder):
        os.makedirs(ofolder)
    if not osp.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, maxsz)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz)

    # output cluster proposals
    ofolder_proposals = osp.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        if not osp.exists(ofolder_proposals):
            os.makedirs(ofolder_proposals)
        save_proposals(clusters,
                       knns,
                       ofolder=ofolder_proposals,
                       force=force)
    return ofolder_proposals, ofn_pred_labels
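# A minimal usage sketch for `generate_basic_proposals`; the feature path and
# output prefixes are hypothetical.
feats = l2norm(read_probs('data/features/part1_test.bin', -1, 256))
ofolder_proposals, ofn_pred_labels = generate_basic_proposals(
    'data/proposals', 'data/knns/part1_test', feats, k=80, th_knn=0.6)
print('proposals saved under:', ofolder_proposals)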
def chinese_whispers_fast(feats, prefix, name, knn_method, knn, th_sim,
                          iters, **kwargs):
    """ Chinese Whispers Clustering Algorithm

    Paper: Chinese whispers: an efficient graph clustering algorithm and its
    application to natural language processing problems.

    This implementation follows the matrix operation described in Figure 4
    of the paper. We swap `maxrow` and `D^{t-1} * A_G` to make
    post-processing easier.

    The current result is inferior to `chinese_whispers`, as it lacks the
    random mechanism of the iterative algorithm. The paper introduces two
    operations to tackle this issue, namely `random mutation` and
    `keep class`, but it is not very clear how to set these two
    hyper-parameters.
    """
    assert len(feats) > 1

    with Timer('create graph'):
        knn_prefix = os.path.join(prefix, 'knns', name)
        knns = build_knns(knn_prefix, feats, knn_method, knn)
        dists, nbrs = knns2ordered_nbrs(knns, sort=True)
        spmat = fast_knns2spmat(knns, knn, th_sim, use_sim=True)
        A = build_symmetric_adj(spmat, self_loop=False)

        node_num = len(feats)
        edge_num = A.nnz
        print('#nodes: {}, #edges: {}'.format(node_num, edge_num))

    with Timer('whisper iteratively (iters={})'.format(iters)):
        D = identity(node_num)
        for _ in range(iters):
            D = D * A  # equal to D.dot(A)
            D = _maxrow(D, node_num)

        assert D.nnz == node_num

        clusters = {}
        assigned_clusters = D.tocoo().col
        for (node, assigned_cluster) in enumerate(assigned_clusters):
            if assigned_cluster not in clusters:
                clusters[assigned_cluster] = []
            clusters[assigned_cluster].append(node)

    print('#cluster: {}'.format(len(clusters)))
    labels = clusters2labels(clusters.values())
    labels = list(labels.values())

    return labels
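# A minimal sketch of the matrix-form whisper step on a toy graph, assuming
# the same semantics as `_maxrow` above: keep, per row, only the column with
# the largest accumulated weight, i.e. each node adopts its strongest
# neighboring label. `toy_maxrow` is a hypothetical stand-in, not the repo's
# `_maxrow`.
import numpy as np
from scipy.sparse import csr_matrix, identity

def toy_maxrow(D, node_num):
    D = D.tocsr()
    rows, cols, vals = [], [], []
    for i in range(node_num):
        start, end = D.indptr[i], D.indptr[i + 1]
        if start == end:  # an isolated node keeps its own label
            rows.append(i); cols.append(i); vals.append(1.)
            continue
        # column with the largest accumulated weight in row i
        j = D.indices[start + np.argmax(D.data[start:end])]
        rows.append(i); cols.append(j); vals.append(1.)
    return csr_matrix((vals, (rows, cols)), shape=D.shape)

A = csr_matrix(np.array([[0., .9, 0., 0.],
                         [.9, 0., .1, 0.],
                         [0., .1, 0., .8],
                         [0., 0., .8, 0.]]))
D = identity(4)
for _ in range(3):
    D = toy_maxrow(D * A, 4)
# After an odd number of steps each node carries its strongest neighbor's
# id: prints [1 0 3 2]. Mutually-strongest pairs like (0, 1) and (2, 3)
# oscillate between steps, the kind of issue the paper's `random mutation`
# is meant to address.
print(D.tocoo().col)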
def generate_proposals(oprefix,
                       knn_prefix,
                       feats,
                       feat_dim=256,
                       knn_method='faiss',
                       k=80,
                       th_knn=0.6,
                       th_step=0.05,
                       min_size=3,
                       max_size=300,
                       is_rebuild=False,
                       is_save_proposals=False):
    print('k={}, th_knn={}, th_step={}, max_size={}, is_rebuild={}'.format(
        k, th_knn, th_step, max_size, is_rebuild))

    # build knns
    # each node and its top-k nearest nodes, along with their distances
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix, '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_iter_0'.format(
            knn_method, k, th_knn, th_step, min_size, max_size))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices'):
            clusters = super_vertex(knns, k, th_knn, th_step, max_size)
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, min_size)

    # output cluster proposals
    if is_save_proposals:
        ofolder = os.path.join(ofolder, 'proposals')
        print('saving cluster proposals to {}'.format(ofolder))
        if not os.path.exists(ofolder):
            os.makedirs(ofolder)
        save_proposals(clusters, knns, ofolder=ofolder, force=True)
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.th_sim = cfg.get('th_sim', 0.)
    self.max_conn = cfg.get('max_conn', 1)
    self.ignore_ratio = cfg.get('ignore_ratio', 0.8)
    self.ignore_small_confs = cfg.get('ignore_small_confs', True)
    self.use_candidate_set = cfg.get('use_candidate_set', True)

    self.nproc = cfg.get('nproc', 1)
    self.max_qsize = cfg.get('max_qsize', int(1e5))

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num,
                                   self.feature_dim)
        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = self.inst_num
        assert self.size == self.features.shape[0]

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))

    with Timer('read knn graph'):
        if knn_graph_path is not None:
            knns = np.load(knn_graph_path)['data']
        else:
            prefix = osp.dirname(feat_path)
            name = rm_suffix(osp.basename(feat_path))
            # find root folder of `features`
            prefix = osp.dirname(prefix)
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                              cfg.knn)
        assert self.inst_num == len(knns), "{} vs {}".format(
            self.inst_num, len(knns))

        adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        self.adj = row_normalize(adj)

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(knns, sort=True)

    if cfg.pred_confs != '':
        print('read estimated confidence from {}'.format(cfg.pred_confs))
        self.confs = np.load(cfg.pred_confs)['pred_confs']
    else:
        print('use unsupervised density as confidence')
        assert self.radius
        from vegcn.confidence import density
        self.confs = density(self.dists, radius=self.radius)

    assert 0 <= self.ignore_ratio <= 1
    if self.ignore_ratio == 1:
        self.ignore_set = set(np.arange(len(self.confs)))
    else:
        num = int(len(self.confs) * self.ignore_ratio)
        confs = self.confs
        if not self.ignore_small_confs:
            confs = -confs
        self.ignore_set = set(np.argpartition(confs, num)[:num])

    print('ignore_ratio: {}, ignore_small_confs: {}, use_candidate_set: {}'.
          format(self.ignore_ratio, self.ignore_small_confs,
                 self.use_candidate_set))
    print('#ignore_set: {} / {} = {:.3f}'.format(
        len(self.ignore_set), self.inst_num,
        1. * len(self.ignore_set) / self.inst_num))

    with Timer('Prepare sub-graphs'):
        # construct subgraphs with larger confidence
        self.peaks = {i: [] for i in range(self.inst_num)}
        self.dist2peak = {i: [] for i in range(self.inst_num)}

        if self.nproc > 1:  # multi-process
            import multiprocessing as mp
            pool = mp.Pool(self.nproc)
            results = []
            num = int(self.inst_num / self.max_qsize) + 1
            for i in tqdm(range(num)):
                beg = int(i * self.max_qsize)
                end = min(beg + self.max_qsize, self.inst_num)
                lst = [j for j in range(beg, end)]
                results.extend(
                    list(
                        tqdm(pool.map(self.get_subgraph, lst),
                             total=len(lst))))
            pool.close()
            pool.join()
        else:
            results = [
                self.get_subgraph(i) for i in tqdm(range(self.inst_num))
            ]

        self.adj_lst = []
        self.feat_lst = []
        self.lb_lst = []
        self.subset_gt_labels = []
        self.subset_idxs = []
        self.subset_nbrs = []
        self.subset_dists = []
        for result in results:
            if result is None:
                continue
            elif len(result) == 3:
                i, nbr, dist = result
                self.peaks[i].extend(nbr)
                self.dist2peak[i].extend(dist)
                continue
            i, nbr, dist, feat, adj, lb = result
            self.subset_idxs.append(i)
            self.subset_nbrs.append(nbr)
            self.subset_dists.append(dist)
            self.feat_lst.append(feat)
            self.adj_lst.append(adj)
            if not self.ignore_label:
                self.subset_gt_labels.append(self.idx2lb[i])
                self.lb_lst.append(lb)
        self.subset_gt_labels = np.array(self.subset_gt_labels)

        self.size = len(self.feat_lst)
        assert self.size == len(self.adj_lst)
        if not self.ignore_label:
            assert self.size == len(self.lb_lst)
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

    self.th_sim = cfg.get('th_sim', 0.)
    self.max_conn = cfg.get('max_conn', 1)
    self.conf_metric = cfg.get('conf_metric')

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num,
                                   self.feature_dim)

        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = 1  # take the entire graph as input

    with Timer('read knn graph'):
        # guard against knn_graph_path being None before probing the disk
        if knn_graph_path is not None and os.path.isfile(knn_graph_path):
            knns = np.load(knn_graph_path)['data']
        else:
            if knn_graph_path is not None:
                print('knn_graph_path does not exist: {}'.format(
                    knn_graph_path))
            prefix = osp.dirname(feat_path)
            name = rm_suffix(osp.basename(feat_path))
            # find root folder of `features`
            prefix = osp.dirname(prefix)
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                              cfg.knn)

        adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        adj = row_normalize(adj)
        if self.save_decomposed_adj:
            adj = sparse_mx_to_indices_values(adj)
            self.adj_indices, self.adj_values, self.adj_shape = adj
        else:
            self.adj = adj

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(knns)

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))

    if not self.ignore_label:
        with Timer('Prepare ground-truth label'):
            self.labels = confidence(feats=self.features,
                                     dists=self.dists,
                                     nbrs=self.nbrs,
                                     metric=self.conf_metric,
                                     idx2lb=self.idx2lb,
                                     lb2idxs=self.lb2idxs)
            if cfg.eval_interim:
                _, self.peaks = confidence_to_peaks(
                    self.dists, self.nbrs, self.labels, self.max_conn)
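# A minimal sketch of a config for the dataset above. The mixed cfg['k'] /
# cfg.knn_method access implies an attribute-dict config; EasyDict is used
# here as a stand-in, and the class name `GCNVDataset` as well as all paths
# are hypothetical.
from easydict import EasyDict

cfg = EasyDict({
    'feat_path': 'data/features/part0_train.bin',
    'label_path': 'data/labels/part0_train.meta',
    'knn_graph_path': 'data/knns/part0_train/faiss_k_80.npz',
    'k': 80,
    'knn': 80,
    'knn_method': 'faiss',
    'feature_dim': 256,
    'is_norm_feat': True,
    'th_sim': 0.,
    'conf_metric': 's_nbr',  # hypothetical metric name
    'eval_interim': False,
})
dataset = GCNVDataset(cfg)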
def generate_iter_proposals(oprefix,
                            knn_prefix,
                            feats,
                            feat_dim=256,
                            knn_method='faiss',
                            k=80,
                            th_knn=0.6,
                            th_step=0.05,
                            minsz=3,
                            maxsz=300,
                            sv_minsz=2,
                            sv_maxsz=5,
                            sv_labels=None,
                            sv_knn_prefix=None,
                            is_rebuild=False,
                            is_save_proposals=True,
                            force=False,
                            **kwargs):
    assert sv_minsz >= 2, "sv_minsz >= 2 to avoid duplicated proposals"
    print('k={}, th_knn={}, th_step={}, minsz={}, maxsz={}, '
          'sv_minsz={}, sv_maxsz={}, is_rebuild={}'.format(
              k, th_knn, th_step, minsz, maxsz, sv_minsz, sv_maxsz,
              is_rebuild))

    if not os.path.exists(sv_labels):
        raise FileNotFoundError('{} not found.'.format(sv_labels))

    if sv_knn_prefix is None:
        sv_knn_prefix = knn_prefix

    # get iteration index and knns from the super-vertex path
    _iter = get_iter_from_path(sv_labels) + 1
    knns_inst = get_knns_from_path(sv_labels, sv_knn_prefix, feats)
    print('read sv_clusters from {}'.format(sv_labels))
    sv_lb2idxs, sv_idx2lb = read_meta(sv_labels)
    inst_num = len(sv_idx2lb)
    sv_clusters = labels2clusters(sv_lb2idxs)
    # sv_clusters = filter_clusters(sv_clusters, minsz)
    feats = np.array([feats[c, :].mean(axis=0) for c in sv_clusters])
    print('average feature of super vertices:', feats.shape)

    # build knns
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild)

    # obtain cluster proposals
    ofolder = os.path.join(
        oprefix,
        '{}_k_{}_th_{}_step_{}_minsz_{}_maxsz_{}_sv_minsz_{}_maxsz_{}_iter_{}'
        .format(knn_method, k, th_knn, th_step, minsz, maxsz, sv_minsz,
                sv_maxsz, _iter))
    ofn_pred_labels = os.path.join(ofolder, 'pred_labels.txt')
    if not os.path.exists(ofolder):
        os.makedirs(ofolder)
    if not os.path.isfile(ofn_pred_labels) or is_rebuild:
        with Timer('build super vertices (iter={})'.format(_iter)):
            clusters = super_vertex(knns, k, th_knn, th_step, sv_maxsz)
            clusters = filter_clusters(clusters, sv_minsz)
            # expand each super-vertex cluster back to instance-level ids
            clusters = [[x for c in cluster for x in sv_clusters[c]]
                        for cluster in clusters]
        with Timer('dump clustering to {}'.format(ofn_pred_labels)):
            labels = clusters2labels(clusters)
            write_meta(ofn_pred_labels, labels, inst_num=inst_num)
    else:
        print('read clusters from {}'.format(ofn_pred_labels))
        lb2idxs, _ = read_meta(ofn_pred_labels)
        clusters = labels2clusters(lb2idxs)
    clusters = filter_clusters(clusters, minsz, maxsz)

    # output cluster proposals
    ofolder_proposals = os.path.join(ofolder, 'proposals')
    if is_save_proposals:
        print('saving cluster proposals to {}'.format(ofolder_proposals))
        if not os.path.exists(ofolder_proposals):
            os.makedirs(ofolder_proposals)
        save_proposals(clusters,
                       knns_inst,
                       ofolder=ofolder_proposals,
                       force=force)
    return ofolder_proposals, ofn_pred_labels
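# A minimal sketch chaining `generate_basic_proposals` with
# `generate_iter_proposals`: the pred_labels.txt produced at iteration 0
# becomes `sv_labels` for the next iteration, so super vertices are merged
# level by level. Paths are hypothetical.
feats = l2norm(read_probs('data/features/part1_test.bin', -1, 256))
_, sv_labels = generate_basic_proposals('data/proposals',
                                        'data/knns/part1_test', feats)
ofolder, _ = generate_iter_proposals('data/proposals',
                                     'data/knns/part1_test',
                                     feats,
                                     sv_minsz=2,
                                     sv_maxsz=5,
                                     sv_labels=sv_labels)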
def get_knns_from_path(s, knn_prefix, feats):
    dic = parse_path(s)
    k = int(dic['k'])
    knn_method = dic['knn_method']
    knns = build_knns(knn_prefix, feats, knn_method, k, is_rebuild=False)
    return knns
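# A minimal usage sketch for `get_knns_from_path`, assuming `parse_path`
# recovers hyper-parameters such as `k` and `knn_method` from folder names
# like the one below; the path and feature file are illustrative only.
sv_labels = ('data/proposals/'
             'faiss_k_80_th_0.6_step_0.05_minsz_3_maxsz_300_iter_0/'
             'pred_labels.txt')
feats = l2norm(read_probs('data/features/part1_test.bin', -1, 256))
knns = get_knns_from_path(sv_labels, 'data/knns/part1_test', feats)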
def test_gcn_v(model, cfg, logger):
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.test_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.test_data)

    folder = '{}_gcnv_k_{}_th_{}'.format(cfg.test_name, cfg.knn, cfg.th_sim)
    oprefix = osp.join(cfg.work_dir, folder)
    oname = osp.basename(rm_suffix(cfg.load_from))
    opath_pred_confs = osp.join(oprefix, 'pred_confs', '{}.npz'.format(oname))

    if osp.isfile(opath_pred_confs) and not cfg.force:
        data = np.load(opath_pred_confs)
        pred_confs = data['pred_confs']
        inst_num = data['inst_num']
        if inst_num != dataset.inst_num:
            logger.warn(
                'instance number in {} is different from dataset: {} vs {}'.
                format(opath_pred_confs, inst_num, len(dataset)))
    else:
        pred_confs, gcn_feat = test(model, dataset, cfg, logger)
        inst_num = dataset.inst_num

    logger.info('pred_confs: mean({:.4f}). max({:.4f}), min({:.4f})'.format(
        pred_confs.mean(), pred_confs.max(), pred_confs.min()))

    logger.info('Convert to cluster')
    with Timer('Prediction to peaks'):
        pred_dist2peak, pred_peaks = confidence_to_peaks(
            dataset.dists, dataset.nbrs, pred_confs, cfg.max_conn)

    if not dataset.ignore_label and cfg.eval_interim:
        # evaluate the intermediate results
        for i in range(cfg.max_conn):
            num = len(dataset.peaks)
            pred_peaks_i = np.arange(num)
            peaks_i = np.arange(num)
            for j in range(num):
                if len(pred_peaks[j]) > i:
                    pred_peaks_i[j] = pred_peaks[j][i]
                if len(dataset.peaks[j]) > i:
                    peaks_i[j] = dataset.peaks[j][i]
            acc = accuracy(pred_peaks_i, peaks_i)
            logger.info('[{}-th conn] accuracy of peak match: {:.4f}'.format(
                i + 1, acc))
            acc = 0.
            for idx, peak in enumerate(pred_peaks_i):
                acc += int(dataset.idx2lb[peak] == dataset.idx2lb[idx])
            acc /= len(pred_peaks_i)
            logger.info(
                '[{}-th conn] accuracy of peak label match: {:.4f}'.format(
                    i + 1, acc))

    with Timer('Peaks to clusters (th_cut={})'.format(cfg.tau_0)):
        pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau_0,
                                      inst_num)

    if cfg.save_output:
        logger.info('save predicted confs to {}'.format(opath_pred_confs))
        mkdir_if_no_exists(opath_pred_confs)
        np.savez_compressed(opath_pred_confs,
                            pred_confs=pred_confs,
                            inst_num=inst_num)

        # save clustering results
        idx2lb = list2dict(pred_labels, ignore_value=-1)
        opath_pred_labels = osp.join(
            cfg.work_dir, folder, 'tau_{}_pred_labels.txt'.format(cfg.tau_0))
        logger.info('save predicted labels to {}'.format(opath_pred_labels))
        mkdir_if_no_exists(opath_pred_labels)
        write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

    # evaluation
    if not dataset.ignore_label:
        print('==> evaluation')
        for metric in cfg.metrics:
            evaluate(dataset.gt_labels, pred_labels, metric)

    if cfg.use_gcn_feat:
        # gcn_feat is saved to disk for GCN-E
        opath_feat = osp.join(oprefix, 'features', '{}.bin'.format(oname))
        if not osp.isfile(opath_feat) or cfg.force:
            mkdir_if_no_exists(opath_feat)
            write_feat(opath_feat, gcn_feat)

        name = rm_suffix(osp.basename(opath_feat))
        prefix = oprefix
        ds = BasicDataset(name=name,
                          prefix=prefix,
                          dim=cfg.model['kwargs']['nhid'],
                          normalize=True)
        ds.info()

        # use top embedding of GCN to rebuild the kNN graph
        with Timer('connect to higher confidence with use_gcn_feat'):
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix,
                              ds.features,
                              cfg.knn_method,
                              cfg.knn,
                              is_rebuild=True)
            dists, nbrs = knns2ordered_nbrs(knns)
            pred_dist2peak, pred_peaks = confidence_to_peaks(
                dists, nbrs, pred_confs, cfg.max_conn)
            pred_labels = peaks_to_labels(pred_peaks, pred_dist2peak, cfg.tau,
                                          inst_num)

        # save clustering results
        if cfg.save_output:
            oname_meta = '{}_gcn_feat'.format(name)
            opath_pred_labels = osp.join(
                oprefix, oname_meta, 'tau_{}_pred_labels.txt'.format(cfg.tau))
            mkdir_if_no_exists(opath_pred_labels)
            idx2lb = list2dict(pred_labels, ignore_value=-1)
            write_meta(opath_pred_labels, idx2lb, inst_num=inst_num)

        # evaluation
        if not dataset.ignore_label:
            print('==> evaluation')
            for metric in cfg.metrics:
                evaluate(dataset.gt_labels, pred_labels, metric)

        # dump each predicted cluster into a folder of images
        import json
        import os
        import shutil

        img_labels = json.load(
            open(r'/home/finn/research/data/clustering_data/test_index.json',
                 'r',
                 encoding='utf-8'))
        output = r'/home/finn/research/data/clustering_data/mr_gcn_output'
        for label in set(pred_labels):
            if not os.path.exists(os.path.join(output, f'cluster_{label}')):
                os.mkdir(os.path.join(output, f'cluster_{label}'))
        for image in img_labels:
            shutil.copy2(
                image,
                os.path.join(
                    os.path.join(output,
                                 f'cluster_{pred_labels[img_labels[image]]}'),
                    os.path.split(image)[-1]))
ds = BasicDataset(name=args.name,
                  prefix=args.prefix,
                  dim=args.dim,
                  normalize=not args.no_normalize)
ds.info()

with Timer('[{}] build_knns'.format(args.knn_method)):
    if args.num_process is None:
        import multiprocessing as mp
        args.num_process = mp.cpu_count()
        print('use {} CPU for computation'.format(args.num_process))
    knn_prefix = os.path.join(args.prefix, 'knns', args.name)
    knns = build_knns(knn_prefix,
                      ds.features,
                      args.knn_method,
                      args.knn,
                      num_process=args.num_process)

if args.test_all:
    with Timer('knns2spmat'):
        adj1 = knns2spmat(knns, args.knn, args.th_sim, use_sim=True)
    with Timer('fast_knns2spmat'):
        adj2 = fast_knns2spmat(knns, args.knn, args.th_sim, use_sim=True)
    print('#adj: {}, #adj2: {}, #non-eq: {}'.format(adj1.nnz, adj2.nnz,
                                                    (adj1 != adj2).nnz))
    assert is_spmat_eq(adj1, adj2), "adj1 and adj2 are not equal"
    print('Output of knns2spmat and fast_knns2spmat are equal')
def chinese_whispers(feats, prefix, name, knn_method, knn, th_sim, iters,
                     **kwargs):
    """ Chinese Whispers Clustering Algorithm

    Paper: Chinese whispers: an efficient graph clustering algorithm and its
    application to natural language processing problems.
    Reference code:
        - http://alexloveless.co.uk/data/chinese-whispers-graph-clustering-in-python/
        - https://github.com/zhly0/facenet-face-cluster-chinese-whispers-
    """
    import networkx as nx

    assert len(feats) > 1

    with Timer('create graph'):
        knn_prefix = os.path.join(prefix, 'knns', name)
        knns = build_knns(knn_prefix, feats, knn_method, knn)
        spmat = fast_knns2spmat(knns, knn, th_sim, use_sim=True)

        size = len(feats)
        nodes = [(n_i, {'cluster': n_i}) for n_i in range(size)]
        c = spmat.tocoo()
        edges = [(n_i, n_j, {'weight': s})
                 for n_i, n_j, s in zip(c.row, c.col, c.data)]

        G = nx.Graph()
        G.add_nodes_from(nodes)
        G.add_edges_from(edges)
        node_num = G.number_of_nodes()
        edge_num = G.number_of_edges()
        assert size == node_num
        print('#nodes: {}, #edges: {}'.format(node_num, edge_num))

    with Timer('whisper iteratively (iters={})'.format(iters)):
        cluster_nodes = list(G.nodes())
        for _ in range(iters):
            idxs = [i for i in range(node_num)]
            random.shuffle(idxs)
            for idx in idxs:
                node = cluster_nodes[idx]
                nbrs = G[node]
                if len(nbrs) == 0:
                    continue
                cluster2weight = {}
                for nbr in nbrs:
                    assigned_cluster = G.nodes[nbr]['cluster']
                    edge_weight = G[node][nbr]['weight']
                    if assigned_cluster not in cluster2weight:
                        cluster2weight[assigned_cluster] = 0
                    cluster2weight[assigned_cluster] += edge_weight
                # set the cluster of the node to the neighboring cluster
                # with the largest accumulated weight
                cluster2weight = sorted(cluster2weight.items(),
                                        key=lambda kv: kv[1],
                                        reverse=True)
                G.nodes[node]['cluster'] = cluster2weight[0][0]

    clusters = {}
    for (node, data) in G.nodes.items():
        assigned_cluster = data['cluster']
        if assigned_cluster not in clusters:
            clusters[assigned_cluster] = []
        clusters[assigned_cluster].append(node)

    print('#cluster: {}'.format(len(clusters)))
    labels = clusters2labels(clusters.values())
    labels = list(labels.values())

    return labels
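# A minimal usage sketch comparing `chinese_whispers` with
# `chinese_whispers_fast` on the same features; the path and
# hyper-parameters are illustrative only.
feats = l2norm(read_probs('data/features/part1_test.bin', -1, 256))
labels_iter = chinese_whispers(feats, prefix='data', name='part1_test',
                               knn_method='faiss', knn=80, th_sim=0.,
                               iters=20)
labels_fast = chinese_whispers_fast(feats, prefix='data', name='part1_test',
                                    knn_method='faiss', knn=80, th_sim=0.,
                                    iters=20)
print('#clusters (iterative / fast): {} / {}'.format(
    len(set(labels_iter)), len(set(labels_fast))))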
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

    self.th_sim = cfg.get('th_sim', 0.)
    self.max_conn = cfg.get('max_conn', 1)
    self.conf_metric = cfg.get('conf_metric')
    self.num_process = cfg.get('num_process', 16)

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num,
                                   self.feature_dim)

        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = 1  # take the entire graph as input

    with Timer('read knn graph'):
        # guard against knn_graph_path being None before probing the disk
        if knn_graph_path is not None and os.path.isfile(knn_graph_path):
            knns = np.load(knn_graph_path)['data']  # num_imgs * 2 * k
        else:
            if knn_graph_path is not None:
                print('knn_graph_path does not exist: {}'.format(
                    knn_graph_path))
            knn_prefix = os.path.join(cfg.prefix, 'knns', cfg.name)
            # k-NN search via faiss; the author's faiss_gpu implementation
            # may be problematic here, but large-scale faiss search on CPU
            # is still slow. faiss also offers memory/speed optimizations
            # such as PQ and IVF; see the faiss documentation.
            knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                              cfg.knn, self.num_process)

        # build the adjacency matrix from the k-NN search results
        adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        adj = row_normalize(adj)
        if self.save_decomposed_adj:
            adj = sparse_mx_to_indices_values(adj)
            self.adj_indices, self.adj_values, self.adj_shape = adj
        else:
            self.adj = adj

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(knns)  # num_imgs * k

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))

    if not self.ignore_label:
        with Timer('Prepare ground-truth label'):
            self.labels = confidence(feats=self.features,
                                     dists=self.dists,
                                     nbrs=self.nbrs,
                                     metric=self.conf_metric,
                                     idx2lb=self.idx2lb,
                                     lb2idxs=self.lb2idxs)
            if cfg.eval_interim:
                _, self.peaks = confidence_to_peaks(
                    self.dists, self.nbrs, self.labels, self.max_conn)
def train_gcn(model, cfg, logger):
    # prepare dataset
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.train_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.train_data)
    pre_features = torch.FloatTensor(dataset.features)
    print('Have loaded the training data.')

    inst_num = dataset.inst_num
    feature_dim = dataset.feature_dim
    lb2idxs = dataset.lb2idxs
    center_fea = dataset.center_fea.astype('float32')
    cls_num, dim = center_fea.shape
    labels = torch.LongTensor(dataset.gt_labels)

    HEAD1 = HEAD(nhid=512)
    HEAD_test1 = HEAD_test(nhid=512)

    # optionally load parameters from a pretrained model
    # model.load_state_dict(torch.load('./'))
    # HEAD1.load_state_dict(torch.load('./'), False)

    OPTIMIZER = optim.SGD([{
        'params': model.parameters(),
        'weight_decay': 1e-5
    }, {
        'params': HEAD1.parameters(),
        'weight_decay': 1e-5
    }],
                          lr=0.01,
                          momentum=0.9)
    print('the learning rate is 0.01')

    cfg.cuda = True
    model = model.cuda()
    HEAD1 = HEAD1.cuda()

    MODEL_ROOT = './src/train_model'
    print('the model save path is', MODEL_ROOT)

    # prepare the test data
    target = "part1_test"
    knn_path = "./data/knns/" + target + "/faiss_k_80.npz"
    knns = np.load(knn_path, allow_pickle=True)['data']
    inst_num = knns.shape[0]
    k_num = knns.shape[2]
    nbrs = knns[:, 0, :]
    pair_a = []
    pair_b = []
    for i in range(inst_num):
        pair_a.extend([i] * k_num)
        pair_b.extend(nbrs[i])

    for epoch in range(cfg.total_epochs):
        # adjust LR at each training stage after warm-up; you can also
        # adjust LR manually (with slight modification) once a plateau
        # is observed
        if epoch in (cfg.STAGES[0], cfg.STAGES[1], cfg.STAGES[2]):
            schedule_lr(OPTIMIZER)

        model.train()
        HEAD1.train()

        # search nearest class centers for each center
        index = faiss.IndexFlatIP(dim)
        index.add(center_fea)
        sims, cluster_id = index.search(center_fea, k=(cfg.cluster_num + 200))
        print('Have selected the cluster ids.')

        for batch in range(cls_num):
            # 0. select ids
            sample_cluster_id = random.sample(list(cluster_id[batch]),
                                              cfg.cluster_num)
            sample_id = []  # indices of the samples in this batch
            for i in range(len(sample_cluster_id)):
                sample_id.extend(
                    random.sample(
                        lb2idxs[sample_cluster_id[i]],
                        int(len(lb2idxs[sample_cluster_id[i]]) * 0.9)))
            sample_num = len(sample_id)
            if (sample_num > 100000) | (sample_num < 100):
                print('[too many or too few samples] continue.')
                continue

            # 1. create selected labels and features
            batch_labels = labels[sample_id]
            feature = pre_features[sample_id]
            print(sample_num)

            # 2. create knn for this batch
            with Timer('build knn:'):
                knn_prefix = os.path.join("./data/rebuild_knn")
                if not os.path.exists(knn_prefix):
                    os.makedirs(knn_prefix)
                if os.path.exists(os.path.join(knn_prefix, 'faiss_k_80.npz')):
                    os.remove(os.path.join(knn_prefix, 'faiss_k_80.npz'))
                if os.path.exists(
                        os.path.join(knn_prefix, 'faiss_k_80.index')):
                    os.remove(os.path.join(knn_prefix, 'faiss_k_80.index'))
                knns = build_knns(knn_prefix,
                                  l2norm(feature.numpy()),
                                  "faiss",
                                  80,
                                  is_rebuild=True)
            batch_adj = fast_knns2spmat(knns, 80, 0, use_sim=True)
            batch_adj = build_symmetric_adj(batch_adj, self_loop=True)
            batch_adj = row_normalize(batch_adj)
            batch_adj = sparse_mx_to_torch_sparse_tensor(batch_adj,
                                                         return_idx=False)

            # 3. move selected features and labels to cuda
            batch_labels = batch_labels.cuda()
            feature = feature.cuda()
            batch_adj = batch_adj.cuda()
            train_data = [feature, batch_adj, batch_labels]

            # 4. train the model
            train_id_inst = batch_adj._indices().size()[1]
            rad_id = random.sample(range(0, train_id_inst), train_id_inst) + \
                     random.sample(range(0, train_id_inst), train_id_inst)
            patch_num = 40
            for i in range(patch_num * 2):
                id = rad_id[i * int(train_id_inst / patch_num):(i + 1) *
                            int(train_id_inst / patch_num)]
                x = model(train_data)
                loss = HEAD1(x, train_data, id)
                OPTIMIZER.zero_grad()
                loss.backward()
                OPTIMIZER.step()
                print(datetime.datetime.now())
                print('epoch:{}/{}, batch:{}/{}, batch2:{}/{}, loss:{}'.
                      format(epoch, cfg.total_epochs, batch, cls_num, i,
                             patch_num * 2, loss))

            if (batch + 1) % 100 == 0:
                if not os.path.exists(MODEL_ROOT):
                    os.makedirs(MODEL_ROOT)
                print('save model in epoch:{} batch:{} to {}'.format(
                    epoch, batch, MODEL_ROOT))
                torch.save(
                    model.state_dict(),
                    os.path.join(
                        MODEL_ROOT, "Backbone_Epoch_{}_batch_{}.pth".format(
                            epoch + 1, batch)))
                torch.save(
                    HEAD1.state_dict(),
                    os.path.join(
                        MODEL_ROOT,
                        "Head_Epoch_{}_batch_{}.pth".format(epoch + 1,
                                                            batch)))
            if (batch + 1) % 300 == 0:
                avg_acc = perform_val(model, HEAD1, HEAD_test1, cfg,
                                      feature_dim, pair_a, pair_b)
                print(
                    'the avg testing acc in epoch:{} batch:{} is:'.format(
                        epoch, batch), avg_acc)
                model.train()
                HEAD1.train()

        # 5. test
        avg_acc = perform_val(model, HEAD1, HEAD_test1, cfg, feature_dim,
                              pair_a, pair_b)
        print(
            'the avg testing acc in epoch:{} batch:{} is:'.format(
                epoch, batch), avg_acc)

        # 6. save model
        if not os.path.exists(MODEL_ROOT):
            os.makedirs(MODEL_ROOT)
        print('save model in epoch:{} batch:{} to {}'.format(
            epoch, batch, MODEL_ROOT))
        torch.save(
            model.state_dict(),
            os.path.join(
                MODEL_ROOT,
                "Backbone_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
        torch.save(
            HEAD1.state_dict(),
            os.path.join(
                MODEL_ROOT,
                "Head_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

    self.th_sim = cfg.get('th_sim', 0.)
    self.conf_metric = cfg.get('conf_metric')

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.cls_num = len(self.lb2idxs)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num,
                                   self.feature_dim)

        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = 1  # take the entire graph as input

    with Timer('Compute center feature'):
        self.center_fea = np.zeros((self.cls_num, self.features.shape[1]))
        for i in range(self.cls_num):
            self.center_fea[i] = np.mean(self.features[self.lb2idxs[i]], 0)
        self.center_fea = l2norm(self.center_fea)

    with Timer('read knn graph'):
        # guard against knn_graph_path being None before probing the disk
        if knn_graph_path is not None and os.path.isfile(knn_graph_path):
            print("load knns from the knn_path")
            self.knns = np.load(knn_graph_path)['data']
        else:
            if knn_graph_path is not None:
                print('knn_graph_path does not exist: {}'.format(
                    knn_graph_path))
            knn_prefix = os.path.join(cfg.prefix, 'knns', cfg.name)
            self.knns = build_knns(knn_prefix, self.features,
                                   cfg.knn_method, cfg.knn)

        adj = fast_knns2spmat(self.knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        adj = row_normalize(adj)
        if self.save_decomposed_adj:
            adj = sparse_mx_to_indices_values(adj)
            self.adj_indices, self.adj_values, self.adj_shape = adj
        else:
            self.adj = adj

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(self.knns)

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))