def run_lca_alg2(G, best_clustering, exp_alt_clustering, msg, trace_on=False):
    """Run lca_alg2 on G starting from best_clustering and compare the
    resulting alternative clustering and score against the expected ones,
    logging success or failure (and dumping structures on failure).
    """
    expected_n2c = ct.build_node_to_cluster_mapping(exp_alt_clustering)
    expected_score = ct.clustering_score(G, expected_n2c)
    current_n2c = ct.build_node_to_cluster_mapping(best_clustering)
    result, result_score = lca_alg2(
        G, best_clustering, current_n2c, trace_on=trace_on
    )

    clustering_ok = ct.same_clustering(result, exp_alt_clustering)
    if clustering_ok:
        logger.info('%s success' % (msg, ))
    else:
        logger.info('%s FAILED' % (msg, ))

    score_ok = result_score == expected_score
    if not score_ok:
        logger.info('score %d, expected_score %d. FAILED'
                    % (result_score, expected_score))

    if not (clustering_ok and score_ok):
        # Dump everything to make the failed comparison debuggable.
        logger.info('current structures with failure:')
        result_n2c = ct.build_node_to_cluster_mapping(result)
        ct.print_structures(G, result, result_n2c, result_score)
def test_comparisons():
    """Exercise ct.compare_by_lengths and ct.percent_and_PR on a small
    hand-built pair of ground-truth and estimated clusterings, logging
    both computed and expected values for visual comparison.
    """
    cids = list(ct.cids_from_range(99))
    gt = {
        cids[0]: {'a', 'b'},
        cids[3]: {'c'},
        cids[4]: {'d', 'e'},
        cids[6]: {'f', 'g', 'h'},
        cids[8]: {'i', 'j', 'k', 'l', 'm'},
        cids[10]: {'o'},
        cids[13]: {'p', 'q'},
        cids[15]: {'r', 's', 't'},
        cids[16]: {'u', 'v', 'w'},
        cids[19]: {'y', 'z', 'aa'},
    }
    gt_n2c = ct.build_node_to_cluster_mapping(gt)

    est = {
        cids[25]: {'y', 'z', 'aa'},
        cids[29]: {'u', 'v'},
        cids[31]: {'w', 'r', 's', 't'},
        cids[37]: {'p'},
        cids[41]: {'q', 'o', 'm'},
        cids[43]: {'i', 'j', 'k', 'l'},
        cids[47]: {'a', 'b'},
        cids[53]: {'c'},
        cids[59]: {'d', 'e'},
        cids[61]: {'f', 'g', 'h'},
    }
    est_n2c = ct.build_node_to_cluster_mapping(est)

    logger.info('================')
    logger.info('test_comparisons')
    logger.info('ct.compare_by_lengths')
    ct.compare_by_lengths(est, est_n2c, gt)
    logger.info('Output for this example should be:\n'
                '1, 2, 1, 0.50, 0.667\n'
                '2, 3, 2, 0.67, 0.833\n'
                '3, 4, 2, 0.50, 0.854\n'
                '5, 1, 0, 0.00, 0.800')

    logger.info('------')
    logger.info('ct.pairwise_eval')
    scores = ct.percent_and_PR(est, est_n2c, gt, gt_n2c)
    logger.info('Result is [%1.3f, %1.3f, %1.3f]' % tuple(scores))

    # Hand-computed expectation: 5 of the estimated clusters are exactly
    # right, and the pairwise counts give these precision/recall values.
    num_clusters = len(est)
    num_correct = 5
    true_pos, false_pos, false_neg = 18, 6, 7
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    logger.info('Should be [%1.3f, %1.3f, %1.3f]'
                % (num_correct / num_clusters, precision, recall))
def test_lca_alg1_constrained():
    """Check lca_alg1_constrained on the fig-1 example graph with one
    must-be-same pair and one must-be-different pair, logging results.
    """
    logger.info('\n=========================\n'
                'Test lca_alg1_constrained\n'
                '=========================')
    G = tct.ex_graph_fig1()
    # Nudge this weight slightly so the optimum is unique.
    G['g']['j']['weight'] = -4  # a little larger than original to break a tie

    must_be_same = [('f', 'i')]
    must_differ = [('d', 'e')]
    actual, actual_score = lca_alg1_constrained(G, must_be_same, must_differ)
    actual_n2c = ct.build_node_to_cluster_mapping(actual)
    correct_score = ct.clustering_score(G, actual_n2c)

    expected = {
        0: {'a', 'b', 'd'},
        1: {'f', 'g', 'h', 'i', 'k'},
        2: {'c'},
        3: {'e'},
        4: {'j'},
    }
    if ct.same_clustering(actual, expected, output_differences=True):
        logger.info('constrained (d,e) different and (f,i) same: success')
    else:
        logger.info('constrained (d,e) different and (f,i) same: FAIL')

    if actual_score != correct_score:
        logger.info('scoring error: actual %a, correct %a'
                    % (actual_score, correct_score))
    else:
        logger.info('scoring correct: actual %a, correct %a'
                    % (actual_score, correct_score))
def test_count_equal():
    """Verify ct.count_equal_clustering finds the 5 estimated clusters
    that exactly match a ground-truth cluster."""
    cids = list(ct.cids_from_range(99))
    gt = {
        cids[0]: {'a', 'b'},
        cids[3]: {'c'},
        cids[4]: {'d', 'e'},
        cids[6]: {'f', 'g', 'h'},
        cids[8]: {'i', 'j', 'k', 'l', 'm'},
        cids[10]: {'o'},
        cids[13]: {'p', 'q'},
        cids[15]: {'r', 's', 't'},
        cids[16]: {'u', 'v', 'w'},
        cids[19]: {'y', 'z', 'aa'},
    }
    est = {
        cids[25]: {'y', 'z', 'aa'},
        cids[29]: {'u', 'v'},
        cids[31]: {'w', 'r', 's', 't'},
        cids[37]: {'p'},
        cids[41]: {'q', 'o', 'm'},
        cids[43]: {'i', 'j', 'k', 'l'},
        cids[47]: {'a', 'b'},
        cids[53]: {'c'},
        cids[59]: {'d', 'e'},
        cids[61]: {'f', 'g', 'h'},
    }
    est_n2c = ct.build_node_to_cluster_mapping(est)
    num_equal = ct.count_equal_clustering(gt, est, est_n2c)
    logger.info('test_count_equal: should be 5 and is %s' % (num_equal, ))
def commit_cluster_change(self, cc):
    """Apply a committed cluster change to the stored state.

    Updates both the clustering dictionary and the node-to-cluster-id
    mapping.  The update is generic for every change type; only the
    no-op ('Unchanged') case is handled specially.
    """
    if cc.change_type == 'Unchanged':
        return

    # Install the new clusters.
    self.clustering.update(cc.new_clustering)

    # Drop clusters that existed before the change but no longer do.
    for stale_cid in cc.old_clustering.keys() - cc.new_clustering.keys():
        del self.clustering[stale_cid]

    # Refresh the node-to-cluster-id entries for the new clusters.
    self.node_to_cid.update(
        ct.build_node_to_cluster_mapping(cc.new_clustering))

    # Nodes removed by the change must already be gone from the mapping
    # (handled earlier through the call to self.remove_nodes).
    for n in cc.removed_nodes:
        assert n not in self.node_to_cid
def run_ga_on_ccPIC(self, ccPIC_edges, ccPIC_clustering):
    """Run the graph algorithm on one ccPIC and return the resulting
    list of cluster changes (old clustering vs. converged clustering).
    """
    gai = ga.graph_algorithm(
        ccPIC_edges,
        ccPIC_clustering.values(),
        self.ga_params['aug_names'],
        self.ga_params,
        self.edge_gen.edge_request_cb,
        self.edge_gen.edge_result_cb,
    )

    # Register the node-removal callback.  Other callbacks could be
    # registered here as well, e.g. status checks, intermediate results,
    # or log retrieval (set_status_check_cbs, set_result_cbs,
    # set_log_contents_cbs).
    gai.set_remove_nodes_cb(self.edge_gen.remove_nodes_cb)

    # Drive the main loop synchronously, a handful of iterations at a
    # time, until convergence.  Eventually this will run asynchronously,
    # with the callbacks feeding information and extracting results.
    step = 10
    iter_num = 0
    converged = False
    paused = False
    while not converged:
        paused, iter_num, converged = gai.run_main_loop(
            iter_num, iter_num + step
        )

    # Summarize the outcome as the set of changes between the input
    # clustering and the converged one.
    ccPIC_n2c = ct.build_node_to_cluster_mapping(ccPIC_clustering)
    changes = compare_clusterings.find_changes(
        ccPIC_clustering, ccPIC_n2c, gai.clustering, gai.node2cid,
    )

    logger.info('')
    logger.info('*********************************')
    logger.info('After LCA convergence on ccPIC, here are the cluster changes:')
    for i, cc in enumerate(changes):
        logger.info('Change %d' % i)
        cc.log_change()
        logger.info('')
    return changes
def test_build_clustering_and_mapping():
    """Round-trip tests for ct.build_clustering and
    ct.build_node_to_cluster_mapping, including the empty cases."""
    logger.info('==================')
    logger.info('Testing build_clustering')
    empty_clustering = ct.build_clustering({})
    logger.info(
        'Empty node 2 cluster mapping should produce empty clustering %s'
        % (empty_clustering, ))

    n2c_optimal = {
        'a': '0', 'b': '0', 'd': '0', 'e': '0',
        'c': '1',
        'h': '2', 'i': '2',
        'f': '3', 'g': '3', 'j': '3', 'k': '3',
    }
    clustering = ct.build_clustering(n2c_optimal)
    logger.info("Cluster 0 should be ['a', 'b', 'd', 'e']. It is %s"
                % (sorted(clustering['0']), ))
    logger.info("Cluster 1 should be ['c']. It is %s"
                % (sorted(clustering['1']), ))
    logger.info("Cluster 2 should be ['h', 'i']. It is %s"
                % (sorted(clustering['2']), ))
    logger.info("Cluster 3 should be ['f', 'g', 'j', 'k']. It is %s"
                % (sorted(clustering['3']), ))

    logger.info('==================')
    logger.info('Testing build_node_to_cluster_mapping')
    empty_n2c = ct.build_node_to_cluster_mapping({})
    logger.info(
        'Empty clustering should produce empty node-to-cluster mapping %s'
        % (empty_n2c, ))
    n2c_rebuilt = ct.build_node_to_cluster_mapping(clustering)
    logger.info(
        'After rebuilding the node2cid mapping should be the same. Is it? %s'
        % (n2c_optimal == n2c_rebuilt, ))
def __init__(self, edges, clusters, aug_names, params,
             aug_request_cb, aug_result_cb):
    """Construct the graph algorithm driver.

    Builds the weighted graph from the initial edges (via the weight
    manager), forms the initial clustering and its score, seeds the LCA
    queues, and initializes counters, drawing support, and the callback
    slots.

    Args:
        edges: initial edges handed to the weight manager.
        clusters: initial clusters passed to self.build_clustering.
        aug_names: names of the augmentation methods (e.g. verifier,
            human) used by the weight manager.
        params: configuration dict; keys used here include
            'tries_before_edge_done', 'draw_iterations' and
            'drawing_prefix'.
        aug_request_cb, aug_result_cb: callbacks through which the
            weight manager requests and receives edge augmentations.
    """
    self.params = params
    logger.info('======================================')
    logger.info('Construction of graph_algorithm object')
    logger.info(dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    # The weight manager mediates all edge-weight requests/results
    # from the augmentation methods.
    self.weight_mgr = wm.weight_manager(aug_names,
                                        params['tries_before_edge_done'],
                                        aug_request_cb, aug_result_cb)
    self.G = nx.Graph()
    weighted_edges = self.weight_mgr.get_initial_edges(edges)
    self.G.add_weighted_edges_from(weighted_edges)
    logger.info('Initial graph has %d nodes and %d edges'
                % (len(self.G.nodes), len(self.G.edges)))
    self._next_cid = 0
    self.build_clustering(clusters)
    self.node2cid = ct.build_node_to_cluster_mapping(self.clustering)
    self.score = ct.clustering_score(self.G, self.node2cid)
    self.phase = 'scoring'
    self.cid2lca = cid_to_lca.CID2LCA()
    self.queues = lca_queues.lca_queues()
    # Seed LCAs from pairs of clusters.  If no pair LCAs result, jump
    # straight to the 'splitting' phase and seed from single clusters.
    self.new_lcas(self.clustering.keys(), use_pairs=True,
                  use_singles=False)
    if self.queues.num_lcas() == 0:
        logger.info("Phase shift immediately into 'splitting'")
        self.phase = 'splitting'
        self.new_lcas(self.clustering.keys(), use_pairs=False,
                      use_singles=True)
    self.queues.info_long(max_entries=10)
    self.num_verifier_results = 0
    self.num_human_results = 0
    self.removed_nodes = set()
    # Optional per-iteration drawing of the algorithm state.
    self.draw_obj = None
    if self.params['draw_iterations']:
        self.draw_obj = draw_lca.draw_lca(self.params['drawing_prefix'])
    """
    Need to set these callbacks to request and receive information from
    the verfication algorithm and to do the same from human reviewers.
    """
    self.remove_nodes_cb = None
    self.status_request_cb = None
    self.status_return_cb = None
    self.results_request_cb = None
    self.results_return_cb = None
    self.log_request_cb = None
    self.log_return_cb = None
    self.trace_start_human_gt_cb = None
    self.trace_iter_compare_to_gt_cb = None
    self.should_stop_cb = None
    logger.info('Completed graph algorithm initialization')
def __init__(self, subG, clustering, cids, score):
    """Record the 'from' state of an LCA: the subgraph, the clusters it
    covers, their node-to-cluster mapping and score, plus a hash over
    the sorted cluster ids.  The alternative 'to' state starts empty.
    """
    self.subgraph = subG  # subgraph restricted to these clusters
    self.from_clusters = {cid: clustering[cid] for cid in cids}
    self.from_n2c = ct.build_node_to_cluster_mapping(self.from_clusters)
    self.from_score = score
    # Sorted cid tuple doubles as the identity/hash of this object.
    self.from_cids_sorted = tuple(sorted(cids))
    self.__hash_value = hash(self.from_cids_sorted)
    # Alternative clustering is filled in later.
    self.to_clusters = None
    self.to_n2c = None
    self.to_score = None
    self.inconsistent = []
def best_alternative_len2(G, clustering, node2cid):
    """Return the best alternative clustering (and its score) when G has
    exactly two nodes: merge the two singletons if they are currently
    separate, otherwise split the single cluster into singletons.
    """
    if len(clustering) == 2:
        # Currently two singleton clusters, so the alternative is the merge.
        alternative = {0: set(G.nodes())}
    else:
        # Currently one cluster of two, so the alternative is the split.
        alternative = {c: {n} for c, n in enumerate(G.nodes())}
    alt_n2c = ct.build_node_to_cluster_mapping(alternative)
    return alternative, ct.clustering_score(G, alt_n2c)
def test_find_changes():
    """Drive find_changes on a hand-built pair of clusterings that
    together cover every change type, logging whether each reported
    type matches the expected one."""
    logger.info('\ntest_find_changes:')
    old_clustering = {
        0: {'e'},
        1: {'f', 'g'},
        2: {'h', 'i'},
        3: {'j', 'k'},
        4: {'l'},
        5: {'m', 'n', 'o', 'p'},
        6: {'q'},
        7: {'r', 's'},
        8: {'t', 'u'},
    }
    old_n2c = ct.build_node_to_cluster_mapping(old_clustering)
    new_clustering = {
        100: {'a', 'b'},
        101: {'f', 'g'},
        102: {'h', 'c'},
        103: {'j', 'k', 'l', 'd'},
        104: {'m'},
        105: {'n'},
        106: {'o', 'p'},
        107: {'q', 'r'},
        108: {'s', 't', 'u', 'x', 'y'},
    }
    new_n2c = ct.build_node_to_cluster_mapping(new_clustering)
    expected_types = [
        'Removed',
        'Unchanged',
        'Extension',
        'Merge',
        'Split',
        'Merge/Split',
        'New',
    ]
    changes = find_changes(old_clustering, old_n2c, new_clustering, new_n2c)
    for change, expected in zip(changes, expected_types):
        logger.info('..........')
        change.print_it()
        logger.info('Correct change type? %s'
                    % (expected == change.change_type, ))
def run_lca_alg1(G, expected_clustering, msg, stop_at_two=False, trace_on=False):
    """Run lca_alg1 on G and compare the result and score against the
    expected clustering, logging success or failure (and dumping the
    structures on failure).
    """
    expected_n2c = ct.build_node_to_cluster_mapping(expected_clustering)
    expected_score = ct.clustering_score(G, expected_n2c)
    result, result_score = lca_alg1(G, stop_at_two=stop_at_two,
                                    trace_on=trace_on)

    clustering_ok = ct.same_clustering(result, expected_clustering)
    if clustering_ok:
        logger.info('%s success' % (msg, ))
    else:
        logger.info('%s FAILED' % (msg, ))

    score_ok = result_score == expected_score
    if not score_ok:
        logger.info('score %d, expected_score %d. FAILED'
                    % (result_score, expected_score))

    if not (clustering_ok and score_ok):
        # Dump everything to make the failed comparison debuggable.
        logger.info('current structures with failure:')
        result_n2c = ct.build_node_to_cluster_mapping(result)
        ct.print_structures(G, result, result_n2c, result_score)
def test_best_shift(trace_on=False):
    """Exercise best_shift in both directions on a small weighted graph,
    logging success or failure for each direction."""
    logger.info('==================')
    logger.info('Testing best_shift')
    logger.info('==================')
    # The edge ('c', 'e', 4) -- the edge to be added -- and
    # ('d', 'e', 3) -- to be added later -- are deliberately left out.
    G = nx.Graph()
    G.add_weighted_edges_from([
        ('a', 'b', 9),
        ('a', 'e', -2),
        ('b', 'c', -6),
        ('b', 'e', 5),
        ('b', 'f', -2),
        ('c', 'd', 7),
        ('d', 'f', -2),
        ('e', 'f', 6),
        ('d', 'g', -3),
        ('f', 'g', 4),
    ])
    clustering = {0: {'a', 'b', 'e', 'f', 'g'}, 1: {'c', 'd'}}
    node2cid = ct.build_node_to_cluster_mapping(clustering)

    # Direction 1: from the bigger cluster toward the smaller one.
    delta, to_move = best_shift('e', 'c', G, clustering, node2cid)
    if delta != -12 or set(to_move) != {'e', 'f', 'g'}:
        logger.info('Test 1 (larger to smaller): FAIL')
        logger.info(' delta %a, to_move %a' % (delta, sorted(to_move)))
        logger.info(" should be -12 and ['e', 'f', 'g']")
    else:
        logger.info('Test 1 (larger to smaller): success')

    # Direction 2: from the smaller cluster toward the bigger one.
    delta, to_move = best_shift('c', 'e', G, clustering, node2cid)
    if delta != -26 or set(to_move) != {'c'}:
        logger.info('Test 2 (smaller to larger): FAIL')
        logger.info('delta %a, to_move %a' % (delta, sorted(to_move)))
        logger.info("should be -26 and ['c']")
    else:
        logger.info('Test 2 (smaller to larger): success')
def one_iteration(self, num_human):
    """Simulate one round with `num_human` human reviews.

    The first `num_human` edges (by absolute weight) are replaced by
    their human-reviewed weights; only positively-weighted edges are
    kept.  Clusters are the connected components of the resulting
    graph.  Returns (clustering, node2cid).
    """
    kept = [e for e in self.edges_by_abs_wgt[num_human:] if e[2] > 0]
    reviewed = [
        (e[0], e[1], self.dict_human[(e[0], e[1])])
        for e in self.edges_by_abs_wgt[:num_human]
    ]
    reviewed = [e for e in reviewed if e[2] > 0]

    graph = nx.Graph()
    graph.add_nodes_from(self.nodes)
    graph.add_weighted_edges_from(kept + reviewed)

    clustering = {
        idx: set(cc)
        for idx, cc in enumerate(nx.connected_components(graph))
    }
    return clustering, ct.build_node_to_cluster_mapping(clustering)
def lca_alg1_constrained(curr_G, in_same=[], in_different=[], trace_on=False):
    """
    Use algorithm 1 to find the best clustering of the current subgraph
    subject to the constraints that all pairs of nodes from in_same must
    be in the same cluster and all pairs of nodes from in_different must
    be in different clusters.

    This does not check that the constraints from in_same and
    in_different can all be satisfied.  In implementation the in_same
    constraints take precedence, but in use, one of the two in_same and
    in_different lists will be empty.

    Returns (clustering, score).
    """
    # Start from the clustering forced by the in_same constraints.
    clustering = build_initial_from_constraints(curr_G, in_same)
    node2cid = ct.build_node_to_cluster_mapping(clustering)
    neg_edges, pos_edges = ct.get_weight_lists(curr_G, sort_positive=True)
    # G_prime accumulates edges as they are considered: all negative
    # edges plus the in_same edges up front; positive edges are added
    # one at a time at the bottom of the loop.
    G_prime = nx.Graph()
    G_prime.add_nodes_from(curr_G)
    G_prime.add_weighted_edges_from(neg_edges)
    edges = [(p[0], p[1], curr_G[p[0]][p[1]]['weight']) for p in in_same]
    G_prime.add_weighted_edges_from(edges)
    score = ct.clustering_score(G_prime, node2cid)
    if trace_on:
        logger.info('=================================')
        logger.info('===== lca_alg1_constrained ====')
        logger.info('=================================')
        ct.print_structures(G_prime, clustering, node2cid, score)
    # Greedily consider positive edges from largest weight down,
    # deciding for each whether to merge the two end clusters.
    for e in pos_edges:
        if trace_on:
            logger.info('=======================')
            logger.info('Start of next iteration')
            logger.info('=======================')
        # Canonical order: n0 < n1.
        if e[0] < e[1]:
            n0, n1 = e[0], e[1]
        else:
            n1, n0 = e[0], e[1]
        # in_same edges are already in G_prime and in the clustering.
        if (n0, n1) in in_same:
            if trace_on:
                logger.info('Skipping (%a, %a) because already in graph'
                            % (n0, n1))
            continue
        wgt = e[2]
        n0_cid, n1_cid = node2cid[n0], node2cid[n1]
        if trace_on:
            logger.info('n0=%s, n1=%s, wgt=%a, n0_cid=%a, n1_cid=%a'
                        % (n0, n1, wgt, n0_cid, n1_cid))
        if n0_cid == n1_cid:
            # Positive edge inside a cluster raises the score.
            if trace_on:
                logger.info('Already in the same cluster')
            score += wgt
        elif keep_separate(clustering[n0_cid], clustering[n1_cid],
                           in_different):
            # Constraint forbids the merge; the positive edge counts
            # against the score.
            if trace_on:
                logger.info('Must be kept separate')
            score -= wgt
        elif not ct.has_edges_between_them(G_prime, clustering[n0_cid],
                                           clustering[n1_cid]):
            # No prior edges between the clusters: merging is free and
            # gains the new edge's weight.
            if trace_on:
                logger.info('Merging disjoint clusters')
            sc_delta = ct.merge_clusters(n0_cid, n1_cid, G_prime,
                                         clustering, node2cid)
            assert sc_delta == 0
            score += sc_delta + wgt
        else:
            # Existing edges between the clusters: compare the merged
            # vs. unmerged score deltas and take the better.
            sc_merged = (ct.score_delta_after_merge(n0_cid, n1_cid,
                                                    G_prime, clustering)
                         + wgt)
            if trace_on:
                logger.info('sc_merged=%a' % sc_merged)
            sc_unmerged = -wgt
            if trace_on:
                logger.info('sc_unmerged=%a' % sc_unmerged)
            if sc_merged > sc_unmerged:
                ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering,
                                  node2cid)
                score += sc_merged
                if trace_on:
                    logger.info('Merging clusters with edges between')
            else:
                score += sc_unmerged
                if trace_on:
                    logger.info('No merge of clusters with edges between ')
        # The considered edge now becomes part of the working graph.
        G_prime.add_weighted_edges_from([e])
        if trace_on:
            ct.print_structures(G_prime, clustering, node2cid, score)
    return clustering, score
def __init__(self, edges, clustering):
    """Initialize the database with an edge graph built from `edges`
    and the given clustering plus its node-to-cluster-id mapping."""
    super().__init__()
    self.edge_graph = nx.Graph()
    self.add_edges(edges)  # fills self.edge_graph
    self.clustering = clustering
    self.node_to_cid = ct.build_node_to_cluster_mapping(clustering)
def lca_alg2(G, clustering, node2cid, trace_on=False):
    """
    Find the best *alternative* to the given (locally best) clustering
    of G, returning (best_clustering, best_score).

    Cases:
    - len(G) must be at least 2.  With exactly two nodes the only
      alternative is to merge or split them (best_alternative_len2).
    - If the current clustering is a single cluster, the first candidate
      alternative is Alg1 rerun constrained to stop at at most two
      clusters.
    - Otherwise the first candidate is the merge of all nodes into a
      single cluster.
    Further candidates are generated from each "inconsistent" edge (one
    whose weight sign disagrees with the clustering), forcing that edge
    into the opposite relationship via lca_alg1_constrained and keeping
    the best-scoring result.
    """
    assert len(G) >= 2
    if len(G) == 2:
        return best_alternative_len2(G, clustering, node2cid)

    # Form the first estimate of the best alternative.
    if len(clustering) == 1:
        best_clustering, best_score = a1.lca_alg1(G, stop_at_two=True)
        best_node2cid = ct.build_node_to_cluster_mapping(best_clustering)
    else:
        best_clustering = {0: set(G.nodes())}
        best_node2cid = {n: 0 for n in G.nodes()}
        best_score = ct.clustering_score(G, best_node2cid)
    if trace_on:
        logger.info('In lca_alg2, before checking inconsistent\n'
                    'best_clustering %a, best_score %d, checking %d'
                    % (best_clustering, best_score,
                       ct.clustering_score(G, best_node2cid)))

    # Inconsistent edges, strongest first, drive candidate generation.
    inconsistent = inconsistent_edges(G, clustering, node2cid)
    inconsistent.sort(key=lambda e: abs(e[2]), reverse=True)
    if trace_on:
        logger.info('In lca_alg2: clustering %s' % (clustering, ))
        logger.info('In lca_alg2: inconsistent edges %s' % (inconsistent, ))
        logger.info('Starting inconsistent edge loop')
    for e in inconsistent:
        if trace_on:
            logger.info('e = %s' % (e, ))
        if e[2] < 0:
            # Negative edge inside a cluster: force its ends apart.
            if trace_on:
                logger.info('Forcing edge into different clusters')
            new_clustering, new_score = lca_alg1_constrained(
                G, in_same=[], in_different=[(e[0], e[1])])
        else:
            # Positive edge between clusters: force its ends together.
            if trace_on:
                logger.info('Forcing edge into same cluster')
            new_clustering, new_score = lca_alg1_constrained(
                G, in_same=[(e[0], e[1])], in_different=[])
        if trace_on:
            logger.info('Best score returned by lca_alg1_constrained is %s'
                        % (new_score, ))
            # BUG FIX: the second value was previously passed as a stray
            # positional argument to logger.info ('Checking' has no
            # placeholder), which triggers a logging formatting error.
            logger.info('Checking %s' % (ct.clustering_score(
                G, ct.build_node_to_cluster_mapping(new_clustering)), ))
        if new_score > best_score:
            if trace_on:
                logger.info('New best')
            best_score = new_score
            best_clustering = new_clustering
    return best_clustering, best_score
def lca_alg1(curr_G, stop_at_two=False, trace_on=False):
    """Greedy clustering (Algorithm 1) of the weighted graph curr_G.

    Starts with every node as a singleton and considers positive edges
    from largest weight down, each time choosing among: leaving the two
    end clusters as-is, merging them, or shifting a connected group of
    nodes from one to the other -- whichever yields the best score.

    Args:
        curr_G: weighted graph to cluster.
        stop_at_two: if True, never merge below two clusters.
        trace_on: if True, log each decision in detail.

    Returns:
        (clustering, score) -- dict of cid -> node set, and its score.
    """
    # Trivial graphs: no edges to consider.
    if len(curr_G) == 0:
        return {}, 0
    elif len(curr_G) == 1:
        clustering = {0: set(curr_G.nodes())}
        return clustering, 0
    neg_edges, pos_edges = ct.get_weight_lists(curr_G, sort_positive=True)
    # All singletons to start.
    clustering = {c: {n} for c, n in enumerate(sorted(curr_G.nodes()))}
    node2cid = ct.build_node_to_cluster_mapping(clustering)
    # G_prime accumulates edges as they are considered: all negative
    # edges up front, positive ones added at the bottom of the loop.
    G_prime = nx.Graph()
    G_prime.add_nodes_from(curr_G)
    G_prime.add_weighted_edges_from(neg_edges)
    score = ct.clustering_score(G_prime, node2cid)
    if trace_on:
        logger.info('====================')
        logger.info('==== lca_alg1 ====')
        logger.info('====================')
        ct.print_structures(G_prime, clustering, node2cid, score)
    for e in pos_edges:
        if trace_on:
            logger.info('=======================')
            logger.info('Start of next iteration')
            logger.info('=======================')
        # Canonical order: n0 < n1.
        if e[0] < e[1]:
            n0, n1 = e[0], e[1]
        else:
            n1, n0 = e[0], e[1]
        wgt = e[2]
        n0_cid, n1_cid = node2cid[n0], node2cid[n1]
        if trace_on:
            logger.info('n0=%s, n1=%s, wgt=%a, n0_cid=%a, n1_cid=%a'
                        % (n0, n1, wgt, n0_cid, n1_cid))
        # With stop_at_two, merging is only allowed while more than two
        # clusters remain.
        is_merge_allowed = not stop_at_two or len(clustering) > 2
        if trace_on:
            logger.info('is_merge_allowed %s' % (is_merge_allowed, ))
        if n0_cid == n1_cid:
            # Positive edge inside a cluster raises the score.
            if trace_on:
                logger.info('In the same cluster')
            score += wgt
        elif is_merge_allowed and not ct.has_edges_between_them(
                G_prime, clustering[n0_cid], clustering[n1_cid]):
            # No prior edges between the clusters: merging is free and
            # gains the new edge's weight.
            if trace_on:
                logger.info('Merging disjoint clusters')
            sc_delta = ct.merge_clusters(n0_cid, n1_cid, G_prime,
                                         clustering, node2cid)
            assert sc_delta == 0
            score += sc_delta + wgt  # why might sc_delta be non-zero here???
        else:
            # Compare: merging, leaving unchanged, or shifting nodes
            # between the two clusters.
            sc_merged = (ct.score_delta_after_merge(n0_cid, n1_cid,
                                                    G_prime, clustering)
                         + wgt)
            if trace_on:
                logger.info('sc_merged=%a' % sc_merged)
            sc_unmerged = -wgt
            if trace_on:
                logger.info('sc_unmerged=%a' % sc_unmerged)
            if len(clustering[n0_cid]) == 1 or len(clustering[n1_cid]) == 1:
                # Shifting out of a singleton is pointless; make the
                # shift options score so low they are never chosen.
                sc_n0_to_n1 = sc_n1_to_n0 = min(sc_merged, sc_unmerged) - 9999
                n0_to_move = n1_to_move = []
                if trace_on:
                    logger.info('not checking moving nodes because '
                                'at least one cluster is length 1')
            else:
                sc_n0_to_n1, n0_to_move = best_shift(n0, n1, G_prime,
                                                     clustering, node2cid,
                                                     trace_on=trace_on)
                sc_n0_to_n1 += wgt
                if trace_on:
                    logger.info('sc_n0_to_n1=%a, n0_to_move=%a'
                                % (sc_n0_to_n1, n0_to_move))
                sc_n1_to_n0, n1_to_move = best_shift(n1, n0, G_prime,
                                                     clustering, node2cid,
                                                     trace_on=trace_on)
                sc_n1_to_n0 += wgt
                if trace_on:
                    logger.info('sc_n1_to_n0=%a, n1_to_move=%a'
                                % (sc_n1_to_n0, n1_to_move))
            # Pick the best-scoring option; ties resolve in the order
            # merge, unchanged, shift n0->n1, shift n1->n0.
            if is_merge_allowed and sc_merged >= max(sc_unmerged,
                                                     sc_n0_to_n1,
                                                     sc_n1_to_n0):
                ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering,
                                  node2cid)
                score += sc_merged
                if trace_on:
                    logger.info('Choose merge')
            elif sc_unmerged >= max(sc_n0_to_n1, sc_n1_to_n0):
                score += sc_unmerged
                if trace_on:
                    logger.info('Choose unmerged - unchanged')
            elif sc_n0_to_n1 >= sc_n1_to_n0:
                ct.shift_between_clusters(n0_cid, n0_to_move, n1_cid,
                                          clustering, node2cid)
                score += sc_n0_to_n1
                if trace_on:
                    logger.info('Choose to shift from cluster %a to %a'
                                % (n0_cid, n1_cid))
            else:
                ct.shift_between_clusters(n1_cid, n1_to_move, n0_cid,
                                          clustering, node2cid)
                score += sc_n1_to_n0
                if trace_on:
                    logger.info('Choose to shift from cluster %a to %a'
                                % (n1_cid, n0_cid))
        # The considered edge now becomes part of the working graph.
        G_prime.add_weighted_edges_from([e])
        if trace_on:
            ct.print_structures(G_prime, clustering, node2cid, score)
    return clustering, score
def generate(self):
    """Generate the simulated graph: clusters, nodes, and ranker edges.

    Steps:
      0. Sample cluster sizes from a gamma distribution.
      1. Create clusters/nodes and "correct" intra-cluster edges
         (imperfect ranker, so clusters may end up disconnected).
      2. Add "incorrect" inter-cluster edges until every node has
         num_from_ranker matches.
      3. Build the "reachable" ground truth from the connected
         components of each ground-truth cluster.
      4. Attach the verifier augmentation name to each edge.

    Returns:
        (edges, aug_names) where edges are 4-tuples
        (n0, n1, wgt, 'verifier') and aug_names is
        ['verifier', 'human'].
    """
    expected_nodes = 1 + self.params['gamma_shape'] * self.params['gamma_scale']
    digits_per_node = 2 + int(m.log10(expected_nodes))
    next_index = 0
    nodes = []  # list of node ids
    edges = []  # list of edge 3-tuples
    num_correct_positive = 0
    num_correct_negative = 0
    num_correct_zero = 0
    num_incorrect_positive = 0
    num_incorrect_negative = 0
    num_incorrect_zero = 0

    # Step 0: sample the cluster sizes (at least 1 node each).
    samples = np.random.gamma(
        self.params['gamma_shape'],
        self.params['gamma_scale'],
        self.params['num_clusters'],
    )
    samples = 1 + np.round(samples)
    samples = samples.astype(int)

    # Step 1: generate the clusters, the nodes within each cluster, and
    # the "correct" intra-cluster edges.  Since the ranking algorithm is
    # imperfect this does not ensure each cluster is connected.
    num_from_ranker = self.params['num_from_ranker']
    cids = ct.cids_from_range(len(samples), prefix='ct')
    for i, cid in enumerate(cids):
        self.gt_clustering[cid] = list()
        n = samples[i]
        # Create the nodes in the cluster.  (Loop variable renamed from
        # 'i' to avoid shadowing the enumerate index above.)
        for _ in range(n):
            node_id = 'n' + str(next_index).zfill(digits_per_node)
            next_index += 1
            nodes.append(node_id)
            self.gt_clustering[cid].append(node_id)
            self.gt_node2cid[node_id] = cid
            self.ranker_matches[node_id] = set()
        # Create the positive edges between nodes in a cluster.  These
        # are symmetric.  Don't allow more than num_from_ranker
        # matches / edges for any node.
        for i, ith_node in enumerate(self.gt_clustering[cid]):
            for j in range(i + 1, len(self.gt_clustering[cid])):
                prob = random.uniform(0, 1)
                jth_node = self.gt_clustering[cid][j]
                if prob < self.params['p_ranker_correct'] and \
                   len(self.ranker_matches[ith_node]) < num_from_ranker and \
                   len(self.ranker_matches[jth_node]) < num_from_ranker:
                    self.ranker_matches[ith_node].add(jth_node)
                    self.ranker_matches[jth_node].add(ith_node)
                    is_match_correct = True
                    wgt = self.wgtr.random_wgt(is_match_correct)
                    if wgt > 0:
                        num_correct_positive += 1
                    elif wgt == 0:
                        num_correct_zero += 1
                    else:
                        num_correct_negative += 1
                    e = (ith_node, jth_node, wgt)
                    edges.append(e)
    assert num_from_ranker > 0
    num_nodes = len(nodes)

    # Change the cluster lists to sets.
    self.gt_clustering = {
        cid: set(cluster) for cid, cluster in self.gt_clustering.items()
    }

    # Step 2: generate "incorrect" match edges, sufficient to have the
    # required number of edges generated by the ranking algorithm.
    for i, ith_node in enumerate(nodes):
        matches = self.ranker_matches[ith_node]
        cid = self.gt_node2cid[ith_node]
        cluster = set(self.gt_clustering[cid])
        # Generate (incorrect) edges between clusters.
        is_match_correct = False
        while len(matches) < num_from_ranker:
            j = random.randint(0, num_nodes - 1)
            jth_node = nodes[j]
            if jth_node not in matches and jth_node not in cluster:
                matches.add(jth_node)
                wgt = self.wgtr.random_wgt(is_match_correct)
                if wgt > 0:
                    num_incorrect_positive += 1
                elif wgt == 0:
                    num_incorrect_zero += 1
                else:
                    num_incorrect_negative += 1
                if ith_node < jth_node:
                    e = (ith_node, jth_node, wgt)
                else:
                    e = (jth_node, ith_node, wgt)
                edges.append(e)

    self.G.add_weighted_edges_from(edges)
    # BUG FIX: these summary messages previously went through the root
    # logger via logging.info(...) while the rest of this method (and
    # module) uses the named logger -- making them inconsistent and
    # potentially dropped.  Unified on `logger`.
    logger.info('simulator::generate: adding %d edges' % len(edges))
    logger.info('%d correct match edges have positive weight'
                % num_correct_positive)
    logger.info('%d correct match edges have zero weight'
                % num_correct_zero)
    logger.info('%d correct match edges have negative weight'
                % num_correct_negative)
    logger.info('%d incorrect match edges have positive weight'
                % num_incorrect_positive)
    logger.info('%d incorrect match edges have zero weight'
                % num_incorrect_zero)
    logger.info('%d incorrect match edges have negative weight'
                % num_incorrect_negative)
    self.G_orig.add_nodes_from(self.G)
    self.G_orig.add_weighted_edges_from(edges)

    # Step 3: generate the "reachable" ground truth -- the obtainable
    # result given simulated failures to match that could disconnect a
    # correct match.
    self.r_clustering = dict()
    k = 0
    for cc in self.gt_clustering.values():
        H = self.G.subgraph(cc)
        prev_k = k
        for new_cc in nx.connected_components(H):
            self.r_clustering[k] = new_cc
            k += 1
        if k - prev_k > 1:
            logger.info('GT cluster %a split into %a ...' % (cc, k - prev_k))
            for i in range(prev_k, k):
                logger.info('    %a' % self.r_clustering[i])
        else:
            logger.info('GT cluster %a is intact' % cc)
    self.r_node2cid = ct.build_node_to_cluster_mapping(self.r_clustering)

    # Step 4: reconfigure edges to make the expected input to the graph
    # algorithm weight manager.
    aug_names = ['verifier', 'human']
    edges = [(n0, n1, w, aug_names[0]) for n0, n1, w in edges]
    return edges, aug_names
def set_to_clusters(self, to_clusters, to_score):
    """Record the alternative ('to') clustering, its node-to-cluster
    mapping and score, and reset the inconsistent-edge list."""
    self.to_clusters = to_clusters
    self.to_n2c = ct.build_node_to_cluster_mapping(to_clusters)
    self.to_score = to_score
    self.inconsistent = []