def chinese_whispers_fast(feats, prefix, name, knn_method, knn, th_sim, iters,
                          **kwargs):
    """ Chinese Whispers Clustering Algorithm

    Paper: Chinese whispers: an efficient graph clustering algorithm and its
    application to natural language processing problems.

    This implementation follows the matrix formulation described in Fig. 4 of
    the paper. We swap the order of `maxrow` and `D^{t-1} * A_G` to make
    post-processing easier.
    The current result is inferior to `chinese_whispers` because it lacks the
    random update mechanism of the iterative algorithm. The paper introduces
    two operations to tackle this issue, namely `random mutation` and
    `keep class`, but it is not very clear how to set these two
    hyper-parameters.
    """
    assert len(feats) > 1

    with Timer('create graph'):
        knn_prefix = os.path.join(prefix, 'knns', name)
        knns = build_knns(knn_prefix, feats, knn_method, knn)
        dists, nbrs = knns2ordered_nbrs(knns, sort=True)
        spmat = fast_knns2spmat(knns, knn, th_sim, use_sim=True)
        A = build_symmetric_adj(spmat, self_loop=False)

        node_num = len(feats)
        edge_num = A.nnz
        print('#nodes: {}, #edges: {}'.format(node_num, edge_num))

    with Timer('whisper iteratively (iters={})'.format(iters)):
        D = identity(node_num)
        for _ in range(iters):
            D = D * A  # it is equal to D.dot(A)
            D = _maxrow(D, node_num)
        assert D.nnz == node_num

        clusters = {}
        assigned_clusters = D.tocoo().col
        for (node, assigned_cluster) in enumerate(assigned_clusters):
            if assigned_cluster not in clusters:
                clusters[assigned_cluster] = []
            clusters[assigned_cluster].append(node)

        print('#cluster: {}'.format(len(clusters)))
        labels = clusters2labels(clusters.values())
        labels = list(labels.values())

    return labels
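# `_maxrow` is referenced above but not defined in this snippet. A minimal
# sketch of what the name suggests (an assumption, not the repository's exact
# utility): keep only the largest entry of every row of the sparse matrix D,
# i.e. each node adopts the class with the strongest accumulated weight, which
# also keeps `D.nnz == node_num` as the assertion above expects.
import numpy as np
from scipy.sparse import csr_matrix


def _maxrow_sketch(D, node_num):
    D = D.tocsr()
    rows = np.arange(node_num)
    cols = np.asarray(D.argmax(axis=1)).reshape(-1)            # per-row argmax
    data = np.asarray(D.max(axis=1).todense()).reshape(-1)     # per-row max value
    # one stored entry per row: (node, assigned_cluster) with its weight
    return csr_matrix((data, (rows, cols)), shape=(node_num, node_num))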
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.th_sim = cfg.get('th_sim', 0.)
    self.max_conn = cfg.get('max_conn', 1)
    self.ignore_ratio = cfg.get('ignore_ratio', 0.8)
    self.ignore_small_confs = cfg.get('ignore_small_confs', True)
    self.use_candidate_set = cfg.get('use_candidate_set', True)

    self.nproc = cfg.get('nproc', 1)
    self.max_qsize = cfg.get('max_qsize', int(1e5))

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num, self.feature_dim)
        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = self.inst_num
        assert self.size == self.features.shape[0]

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))

    with Timer('read knn graph'):
        if knn_graph_path is not None:
            knns = np.load(knn_graph_path)['data']
        else:
            prefix = osp.dirname(feat_path)
            name = rm_suffix(osp.basename(feat_path))
            # find root folder of `features`
            prefix = osp.dirname(prefix)
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                              cfg.knn)
        assert self.inst_num == len(knns), "{} vs {}".format(
            self.inst_num, len(knns))

        adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        self.adj = row_normalize(adj)

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(knns, sort=True)

        if cfg.pred_confs != '':
            print('read estimated confidence from {}'.format(cfg.pred_confs))
            self.confs = np.load(cfg.pred_confs)['pred_confs']
        else:
            print('use unsupervised density as confidence')
            # NOTE: `self.radius` is not set in this snippet; it must be
            # provided on the instance (e.g. via cfg) before this branch runs.
            assert self.radius
            from vegcn.confidence import density
            self.confs = density(self.dists, radius=self.radius)

        assert 0 <= self.ignore_ratio <= 1
        if self.ignore_ratio == 1:
            self.ignore_set = set(np.arange(len(self.confs)))
        else:
            num = int(len(self.confs) * self.ignore_ratio)
            confs = self.confs
            if not self.ignore_small_confs:
                confs = -confs
            self.ignore_set = set(np.argpartition(confs, num)[:num])

    print('ignore_ratio: {}, ignore_small_confs: {}, use_candidate_set: {}'.
          format(self.ignore_ratio, self.ignore_small_confs,
                 self.use_candidate_set))
    print('#ignore_set: {} / {} = {:.3f}'.format(
        len(self.ignore_set), self.inst_num,
        1. * len(self.ignore_set) / self.inst_num))

    with Timer('Prepare sub-graphs'):
        # construct subgraphs with larger confidence
        self.peaks = {i: [] for i in range(self.inst_num)}
        self.dist2peak = {i: [] for i in range(self.inst_num)}

        if self.nproc > 1:  # multi-process
            import multiprocessing as mp
            pool = mp.Pool(self.nproc)
            results = []
            num = int(self.inst_num / self.max_qsize) + 1
            for i in tqdm(range(num)):
                beg = int(i * self.max_qsize)
                end = min(beg + self.max_qsize, self.inst_num)
                lst = [j for j in range(beg, end)]
                results.extend(
                    list(
                        tqdm(pool.map(self.get_subgraph, lst),
                             total=len(lst))))
            pool.close()
            pool.join()
        else:
            results = [
                self.get_subgraph(i) for i in tqdm(range(self.inst_num))
            ]

        self.adj_lst = []
        self.feat_lst = []
        self.lb_lst = []
        self.subset_gt_labels = []
        self.subset_idxs = []
        self.subset_nbrs = []
        self.subset_dists = []
        for result in results:
            if result is None:
                continue
            elif len(result) == 3:
                i, nbr, dist = result
                self.peaks[i].extend(nbr)
                self.dist2peak[i].extend(dist)
                continue
            i, nbr, dist, feat, adj, lb = result
            self.subset_idxs.append(i)
            self.subset_nbrs.append(nbr)
            self.subset_dists.append(dist)
            self.feat_lst.append(feat)
            self.adj_lst.append(adj)
            if not self.ignore_label:
                self.subset_gt_labels.append(self.idx2lb[i])
                self.lb_lst.append(lb)
        self.subset_gt_labels = np.array(self.subset_gt_labels)

        self.size = len(self.feat_lst)
        assert self.size == len(self.adj_lst)
        if not self.ignore_label:
            assert self.size == len(self.lb_lst)
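# `get_subgraph` (called above) is not part of this snippet. The consuming loop
# implies a per-node contract: return None to skip the node, a 3-tuple
# (i, nbr, dist) to attach the node directly to its higher-confidence peaks, or
# a 6-tuple (i, nbr, dist, feat, adj, lb) to emit a sub-graph training sample.
# Below is a purely hypothetical skeleton of that contract (not the real
# implementation; the ignore/candidate branching here is an assumption):
def _get_subgraph_skeleton(self, i):
    if i in self.ignore_set:
        return None  # ignored node: produce no sample
    nbr, dist = self.nbrs[i], self.dists[i]
    if not self.use_candidate_set:
        return i, nbr, dist  # 3-tuple: link node straight to its peaks
    feat = self.features[nbr]        # features of the candidate neighbors
    adj = self.adj[nbr, :][:, nbr]   # sub-adjacency restricted to candidates
    lb = None                        # per-sub-graph supervision (omitted here)
    return i, nbr, dist, feat, adj, lb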
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

    self.th_sim = cfg.get('th_sim', 0.)
    self.max_conn = cfg.get('max_conn', 1)
    self.conf_metric = cfg.get('conf_metric')

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num, self.feature_dim)

        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = 1  # take the entire graph as input

    with Timer('read knn graph'):
        # guard against knn_graph_path being None before checking the file
        if knn_graph_path is not None and os.path.isfile(knn_graph_path):
            knns = np.load(knn_graph_path)['data']
        else:
            if knn_graph_path is not None:
                print('knn_graph_path does not exist: {}'.format(
                    knn_graph_path))
            prefix = osp.dirname(feat_path)
            name = rm_suffix(osp.basename(feat_path))
            # find root folder of `features`
            prefix = osp.dirname(prefix)
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                              cfg.knn)

        adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        adj = row_normalize(adj)
        if self.save_decomposed_adj:
            adj = sparse_mx_to_indices_values(adj)
            self.adj_indices, self.adj_values, self.adj_shape = adj
        else:
            self.adj = adj

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(knns)

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))

    if not self.ignore_label:
        with Timer('Prepare ground-truth label'):
            self.labels = confidence(feats=self.features,
                                     dists=self.dists,
                                     nbrs=self.nbrs,
                                     metric=self.conf_metric,
                                     idx2lb=self.idx2lb,
                                     lb2idxs=self.lb2idxs)
            if cfg.eval_interim:
                _, self.peaks = confidence_to_peaks(
                    self.dists, self.nbrs, self.labels, self.max_conn)
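# `sparse_mx_to_indices_values` (used when save_decomposed_adj is set) is
# imported elsewhere. A minimal sketch of what such a decomposition usually
# produces (an assumption, not the exact utility): COO indices, the matching
# values, and the dense shape, which is a convenient form for rebuilding a
# torch sparse tensor later.
import numpy as np


def sparse_mx_to_indices_values_sketch(sparse_mx):
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = np.vstack((sparse_mx.row, sparse_mx.col))  # shape: 2 x nnz
    values = sparse_mx.data                              # shape: nnz
    shape = np.array(sparse_mx.shape)                    # (rows, cols)
    return indices, values, shape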
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

    self.th_sim = cfg.get('th_sim', 0.)
    self.max_conn = cfg.get('max_conn', 1)
    self.conf_metric = cfg.get('conf_metric')
    self.num_process = cfg.get('num_process', 16)

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num, self.feature_dim)

        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = 1  # take the entire graph as input

    with Timer('read knn graph'):
        # guard against knn_graph_path being None before checking the file
        if knn_graph_path is not None and os.path.isfile(knn_graph_path):
            knns = np.load(knn_graph_path)['data']  # num_imgs * 2 * k
        else:
            if knn_graph_path is not None:
                print('knn_graph_path does not exist: {}'.format(
                    knn_graph_path))
            knn_prefix = os.path.join(cfg.prefix, 'knns', cfg.name)
            # k-NN search via faiss; the author's faiss_gpu implementation may
            # have issues here, but running faiss at large scale on CPU is
            # still slow. faiss also offers memory/speed optimizations such as
            # PQ and IVF; see the faiss documentation.
            knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                              cfg.knn, self.num_process)

        # build the adjacency matrix from the k-NN search results
        adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        adj = row_normalize(adj)
        if self.save_decomposed_adj:
            adj = sparse_mx_to_indices_values(adj)
            self.adj_indices, self.adj_values, self.adj_shape = adj
        else:
            self.adj = adj

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(knns)  # num_imgs * k

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))

    if not self.ignore_label:
        with Timer('Prepare ground-truth label'):
            self.labels = confidence(feats=self.features,
                                     dists=self.dists,
                                     nbrs=self.nbrs,
                                     metric=self.conf_metric,
                                     idx2lb=self.idx2lb,
                                     lb2idxs=self.lb2idxs)
            if cfg.eval_interim:
                _, self.peaks = confidence_to_peaks(
                    self.dists, self.nbrs, self.labels, self.max_conn)
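# `row_normalize` is applied to the symmetric adjacency before it is fed to the
# GCN. A minimal sketch under the usual convention (each row scaled to sum to
# 1, i.e. D^{-1} A); the repository's utility may differ in details.
import numpy as np
import scipy.sparse as sp


def row_normalize_sketch(mx):
    rowsum = np.asarray(mx.sum(1)).flatten()
    with np.errstate(divide='ignore'):
        r_inv = 1.0 / rowsum
    r_inv[np.isinf(r_inv)] = 0.0          # leave empty rows as all zeros
    return sp.diags(r_inv).dot(mx)        # scale every row by 1 / rowsum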
def train_gcn(model, cfg, logger):
    # prepare dataset
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.train_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.train_data)

    pre_features = torch.FloatTensor(dataset.features)
    print('Have loaded the training data.')

    inst_num = dataset.inst_num
    feature_dim = dataset.feature_dim
    lb2idxs = dataset.lb2idxs
    center_fea = dataset.center_fea.astype('float32')
    cls_num, dim = center_fea.shape

    labels = torch.LongTensor(dataset.gt_labels)
    HEAD1 = HEAD(nhid=512)
    HEAD_test1 = HEAD_test(nhid=512)

    # load parameters from the pretrained model
    # model.load_state_dict(torch.load('./'))
    # HEAD1.load_state_dict(torch.load('./'), False)

    OPTIMIZER = optim.SGD([{'params': model.parameters(), 'weight_decay': 1e-5},
                           {'params': HEAD1.parameters(), 'weight_decay': 1e-5}],
                          lr=0.01,
                          momentum=0.9)
    print('the learning rate is 0.01')

    # model.load_state_dict(torch.load(''))
    # HEAD1.load_state_dict(torch.load(''))
    print('Have loaded the pretrained model.')

    cfg.cuda = True
    model = model.cuda()
    HEAD1 = HEAD1.cuda()

    MODEL_ROOT = './src/train_model'
    print('the model save path is', MODEL_ROOT)

    # prepare the test data
    target = "part1_test"
    knn_path = "./data/knns/" + target + "/faiss_k_80.npz"
    knns = np.load(knn_path, allow_pickle=True)['data']
    inst_num = knns.shape[0]
    k_num = knns.shape[2]
    nbrs = knns[:, 0, :]
    pair_a = []
    pair_b = []
    for i in range(inst_num):
        pair_a.extend([i] * k_num)
        pair_b.extend(nbrs[i])

    for epoch in range(cfg.total_epochs):
        if epoch == cfg.STAGES[0]:
            # adjust LR for each training stage after warm up; you can also
            # adjust LR manually (with slight modification) once a plateau is
            # observed
            schedule_lr(OPTIMIZER)
        if epoch == cfg.STAGES[1]:
            schedule_lr(OPTIMIZER)
        if epoch == cfg.STAGES[2]:
            schedule_lr(OPTIMIZER)

        model.train()
        HEAD1.train()

        index = faiss.IndexFlatIP(dim)
        index.add(center_fea)
        # search for the cfg.cluster_num + 200 nearest cluster centers
        sims, cluster_id = index.search(center_fea, k=(cfg.cluster_num + 200))
        # sims, cluster_id = index.search(center_fea, k=cfg.cluster_num)
        print('Have selected the cluster ids.')

        for batch in range(cls_num):
            # for batch in range(20):
            # 0. select cluster ids
            sample_cluster_id = random.sample(list(cluster_id[batch]),
                                              cfg.cluster_num)
            # sample_cluster_id = list(cluster_id[batch])
            sample_id = []  # the idx of the samples in this batch
            for i in range(len(sample_cluster_id)):
                sample_id.extend(
                    random.sample(
                        lb2idxs[sample_cluster_id[i]],
                        int(len(lb2idxs[sample_cluster_id[i]]) * 0.9)))
                # sample_id.extend(lb2idxs[sample_cluster_id[i]])
            # sample_id.sort()
            sample_num = len(sample_id)
            # id = list(np.arange(0, sample_num, 1))
            # sample2sort = dict(zip(sample_id, id))
            if (sample_num > 100000) | (sample_num < 100):
                print('[too many or too few samples] continue.')
                continue

            # 1. create selected labels and features
            batch_labels = labels[sample_id]
            feature = pre_features[sample_id]
            print(sample_num)

            # 2. create knn for this batch
            with Timer('build knn:'):
                knn_prefix = os.path.join("./data/rebuild_knn")
                if not os.path.exists(knn_prefix):
                    os.makedirs(knn_prefix)
                if os.path.exists(os.path.join(knn_prefix, 'faiss_k_80.npz')):
                    os.remove(os.path.join(knn_prefix, 'faiss_k_80.npz'))
                if os.path.exists(os.path.join(knn_prefix, 'faiss_k_80.index')):
                    os.remove(os.path.join(knn_prefix, 'faiss_k_80.index'))
                knns = build_knns(knn_prefix,
                                  # l2norm(feature.clone().detach().cpu().numpy()),
                                  l2norm(feature.numpy()),
                                  "faiss",
                                  80,
                                  is_rebuild=True)

            batch_adj = fast_knns2spmat(knns, 80, 0, use_sim=True)
            batch_adj = build_symmetric_adj(batch_adj, self_loop=True)
            batch_adj = row_normalize(batch_adj)
            batch_adj = sparse_mx_to_torch_sparse_tensor(batch_adj,
                                                         return_idx=False)

            # 3. put selected features and labels on cuda
            batch_labels = batch_labels.cuda()
            feature = feature.cuda()
            batch_adj = batch_adj.cuda()
            train_data = [feature, batch_adj, batch_labels]
            # x = model(train_data)

            # 4. train the model
            train_id_inst = batch_adj._indices().size()[1]
            # print('train_id_inst:', train_id_inst)
            # print('sample_num:', sample_num)
            # train_id_inst = sample_num
            rad_id = (random.sample(range(0, train_id_inst), train_id_inst) +
                      random.sample(range(0, train_id_inst), train_id_inst))
            patch_num = 40
            for i in range(patch_num * 2):
                id = rad_id[i * int(train_id_inst / patch_num):
                            (i + 1) * int(train_id_inst / patch_num)]
                x = model(train_data)
                loss = HEAD1(x, train_data, id)
                OPTIMIZER.zero_grad()
                loss.backward()
                OPTIMIZER.step()
                print(datetime.datetime.now())
                print('epoch:{}/{}, batch:{}/{}, batch2:{}/{}, loss:{}'.format(
                    epoch, cfg.total_epochs, batch, cls_num, i, patch_num * 2,
                    loss))

            if (batch + 1) % 100 == 0:
                if not os.path.exists(MODEL_ROOT):
                    os.makedirs(MODEL_ROOT)
                print('save model in epoch:{} batch:{} to {}'.format(
                    epoch, batch, MODEL_ROOT))
                torch.save(
                    model.state_dict(),
                    os.path.join(
                        MODEL_ROOT,
                        "Backbone_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
                torch.save(
                    HEAD1.state_dict(),
                    os.path.join(
                        MODEL_ROOT,
                        "Head_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))

            if (batch + 1) % 300 == 0:
                avg_acc = perform_val(model, HEAD1, HEAD_test1, cfg,
                                      feature_dim, pair_a, pair_b)
                print('the avg testing acc in epoch:{} batch:{} is :'.format(
                    epoch, batch), avg_acc)
                model.train()
                HEAD1.train()

        # 5. test
        avg_acc = perform_val(model, HEAD1, HEAD_test1, cfg, feature_dim,
                              pair_a, pair_b)
        print('the avg testing acc in epoch:{} batch:{} is :'.format(
            epoch, batch), avg_acc)

        # 6. save model
        if not os.path.exists(MODEL_ROOT):
            os.makedirs(MODEL_ROOT)
        print('save model in epoch:{} batch:{} to {}'.format(
            epoch, batch, MODEL_ROOT))
        torch.save(
            model.state_dict(),
            os.path.join(
                MODEL_ROOT,
                "Backbone_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
        torch.save(
            HEAD1.state_dict(),
            os.path.join(
                MODEL_ROOT,
                "Head_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
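# `schedule_lr` is called at each stage boundary above but is not defined in
# this snippet. A common sketch of such a step schedule (an assumption, not the
# repository's exact function): divide every parameter group's learning rate by
# a fixed factor.
def schedule_lr_sketch(optimizer, factor=10.0):
    for group in optimizer.param_groups:
        group['lr'] /= factor            # step down the learning rate
    print('new learning rates:', [group['lr'] for group in optimizer.param_groups])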
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

    self.th_sim = cfg.get('th_sim', 0.)
    self.conf_metric = cfg.get('conf_metric')

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.cls_num = len(self.lb2idxs)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num, self.feature_dim)

        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = 1  # take the entire graph as input

    with Timer('Compute center feature'):
        # class centers require ground-truth labels (label_path must be given)
        self.center_fea = np.zeros((self.cls_num, self.features.shape[1]))
        for i in range(self.cls_num):
            self.center_fea[i] = np.mean(self.features[self.lb2idxs[i]], 0)
        self.center_fea = l2norm(self.center_fea)

    with Timer('read knn graph'):
        # guard against knn_graph_path being None before checking the file
        if knn_graph_path is not None and os.path.isfile(knn_graph_path):
            print("load knns from the knn_path")
            self.knns = np.load(knn_graph_path)['data']
        else:
            if knn_graph_path is not None:
                print('knn_graph_path does not exist: {}'.format(
                    knn_graph_path))
            knn_prefix = os.path.join(cfg.prefix, 'knns', cfg.name)
            self.knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                                   cfg.knn)

        adj = fast_knns2spmat(self.knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        # print('adj before norm')
        # print(adj)
        adj = row_normalize(adj)
        if self.save_decomposed_adj:
            adj = sparse_mx_to_indices_values(adj)
            self.adj_indices, self.adj_values, self.adj_shape = adj
        else:
            self.adj = adj

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(self.knns)

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))
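# `l2norm` is used throughout to normalize features and class centers so that
# inner product equals cosine similarity (which is what the faiss IndexFlatIP
# search over `center_fea` relies on). A minimal sketch of that convention (an
# assumption, not the repository's exact utility):
import numpy as np


def l2norm_sketch(feats, eps=1e-10):
    norm = np.linalg.norm(feats, axis=1, keepdims=True)  # per-row L2 norm
    return feats / np.maximum(norm, eps)                 # scale rows to unit length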