Example #1
def run_lca_alg2(G, best_clustering, exp_alt_clustering, msg, trace_on=False):
    exp_alt_node2cid = ct.build_node_to_cluster_mapping(exp_alt_clustering)
    exp_alt_score = ct.clustering_score(G, exp_alt_node2cid)

    best_node2cid = ct.build_node_to_cluster_mapping(best_clustering)
    alt_clustering, alt_score = lca_alg2(G,
                                         best_clustering,
                                         best_node2cid,
                                         trace_on=trace_on)

    failed = False
    if not ct.same_clustering(alt_clustering, exp_alt_clustering):
        failed = True
        logger.info('%s FAILED' % (msg, ))
    else:
        logger.info('%s success' % (msg, ))

    if alt_score != exp_alt_score:
        failed = True
        logger.info('score %d, expected_score %d. FAILED' %
                    (alt_score, exp_alt_score))

    if failed:
        logger.info('current structures with failure:')
        alt_node2cid = ct.build_node_to_cluster_mapping(alt_clustering)
        ct.print_structures(G, alt_clustering, alt_node2cid, alt_score)
def test_comparisons():
    """"""
    cids = list(ct.cids_from_range(99))
    gt = {
        cids[0]: {'a', 'b'},
        cids[3]: {'c'},
        cids[4]: {'d', 'e'},
        cids[6]: {'f', 'g', 'h'},
        cids[8]: {'i', 'j', 'k', 'l', 'm'},
        cids[10]: {'o'},
        cids[13]: {'p', 'q'},
        cids[15]: {'r', 's', 't'},
        cids[16]: {'u', 'v', 'w'},
        cids[19]: {'y', 'z', 'aa'},
    }
    gt_n2c = ct.build_node_to_cluster_mapping(gt)

    est = {
        cids[25]: {'y', 'z', 'aa'},
        cids[29]: {'u', 'v'},
        cids[31]: {'w', 'r', 's', 't'},
        cids[37]: {'p'},
        cids[41]: {'q', 'o', 'm'},
        cids[43]: {'i', 'j', 'k', 'l'},
        cids[47]: {'a', 'b'},
        cids[53]: {'c'},
        cids[59]: {'d', 'e'},
        cids[61]: {'f', 'g', 'h'},
    }
    est_n2c = ct.build_node_to_cluster_mapping(est)

    logger.info('================')
    logger.info('test_comparisons')
    logger.info('ct.compare_by_lengths')

    ct.compare_by_lengths(est, est_n2c, gt)

    logger.info('Output for this example should be:\n'
                '1, 2, 1, 0.50, 0.667\n'
                '2, 3, 2, 0.67, 0.833\n'
                '3, 4, 2, 0.50, 0.854\n'
                '5, 1, 0, 0.00, 0.800')

    logger.info('------')
    logger.info('ct.pairwise_eval')
    # result = ct.compare_to_ground_truth(est, est_n2c, gt, gt_n2c)
    result = ct.percent_and_PR(est, est_n2c, gt, gt_n2c)
    logger.info('Result is [%1.3f, %1.3f, %1.3f]' % tuple(result))
    num_clusters = len(est)
    num_correct = 5
    tp, fp, fn = 18, 6, 7
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    logger.info('Should be [%1.3f, %1.3f, %1.3f]' %
                (num_correct / num_clusters, precision, recall))
Example #3
def test_lca_alg1_constrained():
    logger.info('\n=========================\n'
                'Test lca_alg1_constrained\n'
                '=========================')
    G = tct.ex_graph_fig1()
    G['g']['j']['weight'] = -4  # a little larger than original to break a tie
    in_same = [('f', 'i')]
    in_different = [('d', 'e')]
    clustering, score = lca_alg1_constrained(G, in_same, in_different)
    node2cid = ct.build_node_to_cluster_mapping(clustering)
    correct_score = ct.clustering_score(G, node2cid)

    exp_clustering = {
        0: {'a', 'b', 'd'},
        1: {'f', 'g', 'h', 'i', 'k'},
        2: {'c'},
        3: {'e'},
        4: {'j'},
    }
    is_same = ct.same_clustering(clustering,
                                 exp_clustering,
                                 output_differences=True)
    if is_same:
        logger.info('constrained (d,e) different and (f,i) same: success')
    else:
        logger.info('constrained (d,e) different and (f,i) same: FAIL')

    if score != correct_score:
        logger.info('scoring error:  actual %a, correct %a' %
                    (score, correct_score))
    else:
        logger.info('scoring correct:  actual %a, correct %a' %
                    (score, correct_score))
def test_count_equal():
    """"""
    cids = list(ct.cids_from_range(99))
    gt = {
        cids[0]: {'a', 'b'},
        cids[3]: {'c'},
        cids[4]: {'d', 'e'},
        cids[6]: {'f', 'g', 'h'},
        cids[8]: {'i', 'j', 'k', 'l', 'm'},
        cids[10]: {'o'},
        cids[13]: {'p', 'q'},
        cids[15]: {'r', 's', 't'},
        cids[16]: {'u', 'v', 'w'},
        cids[19]: {'y', 'z', 'aa'},
    }

    est = {
        cids[25]: {'y', 'z', 'aa'},
        cids[29]: {'u', 'v'},
        cids[31]: {'w', 'r', 's', 't'},
        cids[37]: {'p'},
        cids[41]: {'q', 'o', 'm'},
        cids[43]: {'i', 'j', 'k', 'l'},
        cids[47]: {'a', 'b'},
        cids[53]: {'c'},
        cids[59]: {'d', 'e'},
        cids[61]: {'f', 'g', 'h'},
    }

    est_n2c = ct.build_node_to_cluster_mapping(est)
    n = ct.count_equal_clustering(gt, est, est_n2c)
    logger.info('test_count_equal: should be 5 and is %s' % (n, ))
    def commit_cluster_change(self, cc):
        """
        Commit the cluster changes to the database. This involves
        updating the node to cluster id dictionary and the clustering
        dictionary. One way to do this would be to have special
        operations for each type of change. Instead, this function
        works generically except for the single case of no changes at
        all.
        """
        if cc.change_type == 'Unchanged':
            return

        # 1. Add new clusters
        self.clustering.update(cc.new_clustering)

        # 2. Remove old clusters
        removed_cids = set(cc.old_clustering.keys()) - set(
            cc.new_clustering.keys())
        for old_c in removed_cids:
            del self.clustering[old_c]

        # 3. Update the node to clusterid mapping
        new_node_to_cid = ct.build_node_to_cluster_mapping(cc.new_clustering)
        self.node_to_cid.update(new_node_to_cid)

        # 4. Removed nodes should have already been removed from the
        #    db through the call to self.remove_nodes.
        for n in cc.removed_nodes:
            assert n not in self.node_to_cid
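
The commit step only needs the change type, the old and new clusterings, and any removed nodes. A minimal sketch of the kind of change object it consumes (the class name here is a hypothetical stand-in, not the project's actual change class):

# Hypothetical stand-in for the change object consumed by commit_cluster_change;
# the attribute names mirror those accessed above (change_type, old_clustering,
# new_clustering, removed_nodes).
class cluster_change_sketch:
    def __init__(self, change_type, old_clustering, new_clustering, removed_nodes=()):
        self.change_type = change_type            # e.g. 'Merge', 'Split', 'Unchanged'
        self.old_clustering = old_clustering      # {cid: set(nodes)} before the change
        self.new_clustering = new_clustering      # {cid: set(nodes)} after the change
        self.removed_nodes = set(removed_nodes)   # nodes dropped from the graph entirely

# Example: clusters 0 and 1 merge into cluster 2.  Committing this change adds
# cluster 2, deletes clusters 0 and 1, and remaps 'a', 'b', 'c' to cluster 2.
cc = cluster_change_sketch('Merge',
                           old_clustering={0: {'a', 'b'}, 1: {'c'}},
                           new_clustering={2: {'a', 'b', 'c'}})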
Example #6
    def run_ga_on_ccPIC(self, ccPIC_edges, ccPIC_clustering):
        gai = ga.graph_algorithm(
            ccPIC_edges,
            ccPIC_clustering.values(),
            self.ga_params['aug_names'],
            self.ga_params,
            self.edge_gen.edge_request_cb,
            self.edge_gen.edge_result_cb,
        )

        """
        Add callbacks for removing nodes, pausing, getting intermediate
        results, and getting the status.
        """
        gai.set_remove_nodes_cb(self.edge_gen.remove_nodes_cb)

        """
        Could add other callbacks, such as
        gai.set_status_check_cbs(...)  # Get GA status. Details TBD
        gai.set_result_cbs(...)  # Get current clustering
        gai.set_log_contents_cbs(...)  #
        """

        """
        This runs the main loop 10 iterations at a time in a while
        loop. Currently, it is written to run synchronously, but it will
        eventually run asynchronously, and the callbacks will then be used
        to feed it information and get intermediate results.
        """
        iter_num = 0
        converged = False
        paused = False
        while not converged:
            num_iter_to_run = 10
            paused, iter_num, converged = gai.run_main_loop(
                iter_num, iter_num + num_iter_to_run
            )

        """
        Compute and then return the final information - the changes to
        the clusters.
        """
        ccPIC_n2c = ct.build_node_to_cluster_mapping(ccPIC_clustering)
        changes = compare_clusterings.find_changes(
            ccPIC_clustering,
            ccPIC_n2c,
            gai.clustering,
            gai.node2cid,
        )

        logger.info('')
        logger.info('*********************************')
        logger.info('After LCA convergence on ccPIC, here are the cluster changes:')
        for i, cc in enumerate(changes):
            logger.info('Change %d' % i)
            cc.log_change()

        logger.info('')
        return changes
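
The while loop above drives the algorithm in blocks of ten iterations until run_main_loop reports convergence. A stripped-down sketch of that pattern with a fake stand-in object (fake_ga is an illustration only; the real object is ga.graph_algorithm):

# Sketch of the incremental main-loop pattern: keep calling run_main_loop for a
# fixed block of iterations until it reports convergence.
class fake_ga:
    def run_main_loop(self, start_iter, stop_iter):
        converged = stop_iter >= 30           # pretend convergence by iteration 30
        return False, stop_iter, converged    # (paused, next_iteration, converged)

gai = fake_ga()
iter_num, converged = 0, False
while not converged:
    num_iter_to_run = 10
    paused, iter_num, converged = gai.run_main_loop(iter_num, iter_num + num_iter_to_run)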
def test_build_clustering_and_mapping():
    logger.info('==================')
    logger.info('Testing build_clustering')
    empty_n2c = {}
    empty_clustering = ct.build_clustering(empty_n2c)
    logger.info(
        'Empty node 2 cluster mapping should produce empty clustering %s' %
        (empty_clustering, ))

    # G = ex_graph_fig1()
    n2c_optimal = {
        'a': '0',
        'b': '0',
        'd': '0',
        'e': '0',
        'c': '1',
        'h': '2',
        'i': '2',
        'f': '3',
        'g': '3',
        'j': '3',
        'k': '3',
    }

    clustering = ct.build_clustering(n2c_optimal)
    logger.info("Cluster 0 should be ['a', 'b', 'd', 'e']. It is %s" %
                (sorted(clustering['0']), ))
    logger.info("Cluster 1 should be ['c']. It is %s" %
                (sorted(clustering['1']), ))
    logger.info("Cluster 2 should be ['h', 'i']. It is %s" %
                (sorted(clustering['2']), ))
    logger.info("Cluster 3 should be ['f', 'g', 'j', 'k']. It is %s" %
                (sorted(clustering['3']), ))

    logger.info('==================')
    logger.info('Testing build_node_to_cluster_mapping')
    empty_clustering = {}
    empty_n2c = ct.build_node_to_cluster_mapping(empty_clustering)
    logger.info(
        'Empty clustering should produce empty node-to-cluster mapping %s' %
        (empty_n2c, ))

    n2c_rebuilt = ct.build_node_to_cluster_mapping(clustering)
    logger.info(
        'After rebuilding the node2cid mapping should be the same.  Is it? %s'
        % (n2c_optimal == n2c_rebuilt, ))
    def __init__(self, edges, clusters, aug_names, params, aug_request_cb,
                 aug_result_cb):
        self.params = params
        logger.info('======================================')
        logger.info('Construction of graph_algorithm object')
        logger.info(dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        self.weight_mgr = wm.weight_manager(aug_names,
                                            params['tries_before_edge_done'],
                                            aug_request_cb, aug_result_cb)
        self.G = nx.Graph()
        weighted_edges = self.weight_mgr.get_initial_edges(edges)
        self.G.add_weighted_edges_from(weighted_edges)
        logger.info('Initial graph has %d nodes and %d edges' %
                    (len(self.G.nodes), len(self.G.edges)))
        self._next_cid = 0
        self.build_clustering(clusters)
        self.node2cid = ct.build_node_to_cluster_mapping(self.clustering)
        self.score = ct.clustering_score(self.G, self.node2cid)

        self.phase = 'scoring'
        self.cid2lca = cid_to_lca.CID2LCA()
        self.queues = lca_queues.lca_queues()
        self.new_lcas(self.clustering.keys(),
                      use_pairs=True,
                      use_singles=False)
        if self.queues.num_lcas() == 0:
            logger.info("Phase shift immediately into 'splitting'")
            self.phase = 'splitting'
            self.new_lcas(self.clustering.keys(),
                          use_pairs=False,
                          use_singles=True)
        self.queues.info_long(max_entries=10)

        self.num_verifier_results = 0
        self.num_human_results = 0
        self.removed_nodes = set()

        self.draw_obj = None
        if self.params['draw_iterations']:
            self.draw_obj = draw_lca.draw_lca(self.params['drawing_prefix'])
        """  Need to set these callbacks to request and receive
        information from the verification algorithm and to do the same
        from human reviewers. """
        self.remove_nodes_cb = None
        self.status_request_cb = None
        self.status_return_cb = None
        self.results_request_cb = None
        self.results_return_cb = None
        self.log_request_cb = None
        self.log_return_cb = None
        self.trace_start_human_gt_cb = None
        self.trace_iter_compare_to_gt_cb = None
        self.should_stop_cb = None
        logger.info('Completed graph algorithm initialization')
Example #9
    def __init__(self, subG, clustering, cids, score):
        self.subgraph = subG  # Restricted to the clustering
        self.from_clusters = {c: clustering[c] for c in cids}
        self.from_cids_sorted = tuple(sorted(cids))
        self.__hash_value = hash(self.from_cids_sorted)
        self.from_n2c = ct.build_node_to_cluster_mapping(self.from_clusters)
        self.from_score = score
        self.to_clusters = None
        self.to_score = None
        self.to_n2c = None
        self.inconsistent = []
Example #10
def best_alternative_len2(G, clustering, node2cid):
    """Return the best alternative to the current clustering when G has
    exactly two nodes.
    """
    if len(clustering) == 2:
        alt_clustering = {0: set(G.nodes())}
    else:
        alt_clustering = {c: {n} for c, n in enumerate(G.nodes())}
    alt_node2cid = ct.build_node_to_cluster_mapping(alt_clustering)
    alt_score = ct.clustering_score(G, alt_node2cid)
    return alt_clustering, alt_score
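
With exactly two nodes, the only clusterings are "one merged cluster" and "two singletons", so the best alternative is simply whichever form the current clustering is not. A minimal usage sketch, assuming the surrounding cluster_tools module is importable as ct:

# Minimal sketch; assumes ct (cluster_tools) and best_alternative_len2 are in scope.
import networkx as nx

G2 = nx.Graph()
G2.add_weighted_edges_from([('a', 'b', 5)])

clustering = {0: {'a'}, 1: {'b'}}            # current clustering: two singletons
node2cid = ct.build_node_to_cluster_mapping(clustering)
alt_clustering, alt_score = best_alternative_len2(G2, clustering, node2cid)
# alt_clustering should be {0: {'a', 'b'}}; its score is the within-cluster
# edge weight (+5 here).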
def test_find_changes():
    logger.info('\ntest_find_changes:')
    old_clustering = {
        0: set(['e']),
        1: set(['f', 'g']),
        2: set(['h', 'i']),
        3: set(['j', 'k']),
        4: set(['l']),
        5: set(['m', 'n', 'o', 'p']),
        6: set(['q']),
        7: set(['r', 's']),
        8: set(['t', 'u']),
    }
    old_n2c = ct.build_node_to_cluster_mapping(old_clustering)
    new_clustering = {
        100: set(['a', 'b']),
        101: set(['f', 'g']),
        102: set(['h', 'c']),
        103: set(['j', 'k', 'l', 'd']),
        104: set(['m']),
        105: set(['n']),
        106: set(['o', 'p']),
        107: set(['q', 'r']),
        108: set(['s', 't', 'u', 'x', 'y']),
    }
    new_n2c = ct.build_node_to_cluster_mapping(new_clustering)

    correct_types = [
        'Removed',
        'Unchanged',
        'Extension',
        'Merge',
        'Split',
        'Merge/Split',
        'New',
    ]
    changes = find_changes(old_clustering, old_n2c, new_clustering, new_n2c)
    for c, t in zip(changes, correct_types):
        logger.info('..........')
        c.print_it()
        logger.info('Correct change type? %s' % (t == c.change_type, ))
Example #12
def run_lca_alg1(G,
                 expected_clustering,
                 msg,
                 stop_at_two=False,
                 trace_on=False):
    node2cid = ct.build_node_to_cluster_mapping(expected_clustering)
    expected_score = ct.clustering_score(G, node2cid)
    clustering, score = lca_alg1(G, stop_at_two=stop_at_two, trace_on=trace_on)
    failed = False
    if not ct.same_clustering(clustering, expected_clustering):
        failed = True
        logger.info('%s FAILED' % (msg, ))
    else:
        logger.info('%s success' % (msg, ))

    if score != expected_score:
        failed = True
        logger.info('score %d, expected_score %d. FAILED' %
                    (score, expected_score))

    if failed:
        logger.info('current structures with failure:')
        node2cid = ct.build_node_to_cluster_mapping(clustering)
        ct.print_structures(G, clustering, node2cid, score)
Example #13
def test_best_shift(trace_on=False):
    G = nx.Graph()

    logger.info('==================')
    logger.info('Testing best_shift')
    logger.info('==================')
    """
    For this test, leave out ('c', 'e', 4), the edge to be added,
    and ('d', 'e', 3), which should be added later.
    """
    G.add_weighted_edges_from([
        ('a', 'b', 9),
        ('a', 'e', -2),
        ('b', 'c', -6),
        ('b', 'e', 5),
        ('b', 'f', -2),
        ('c', 'd', 7),
        ('d', 'f', -2),
        ('e', 'f', 6),
        ('d', 'g', -3),
        ('f', 'g', 4),
    ])

    clustering = {0: {'a', 'b', 'e', 'f', 'g'}, 1: {'c', 'd'}}
    node2cid = ct.build_node_to_cluster_mapping(clustering)

    n0, n1 = 'e', 'c'  # from biggest set to smaller
    delta, to_move = best_shift(n0, n1, G, clustering, node2cid)
    exp_delta = -12
    exp_move = ['e', 'f', 'g']
    if exp_delta != delta or set(exp_move) != set(to_move):
        logger.info('Test 1 (larger to smaller): FAIL')
        logger.info('    delta %a, to_move %a' % (delta, sorted(to_move)))
        logger.info("    should be -12 and ['e', 'f', 'g']")
    else:
        logger.info('Test 1 (larger to smaller): success')

    n0, n1 = 'c', 'e'  # from smaller set to larger
    delta, to_move = best_shift(n0, n1, G, clustering, node2cid)
    exp_delta = -26
    exp_move = ['c']
    if exp_delta != delta or set(exp_move) != set(to_move):
        logger.info('Test 2 (smaller to larger): FAIL')
        logger.info('delta %a, to_move %a' % (delta, sorted(to_move)))
        logger.info("should be -26 and ['c']")
    else:
        logger.info('Test 2 (smaller to larger): success')
Example #14
    def one_iteration(self, num_human):
        orig_edges = [e for e in self.edges_by_abs_wgt[num_human:] if e[2] > 0]
        human_prs = [(e[0], e[1]) for e in self.edges_by_abs_wgt[:num_human]]
        human_edges = [(pr[0], pr[1], self.dict_human[pr]) for pr in human_prs]
        human_edges = [e for e in human_edges if e[2] > 0]
        # logger.info("\n--------")
        # logger.info("orig_edges:", orig_edges)
        # logger.info("human_edges:", human_edges)
        edges = orig_edges + human_edges
        new_G = nx.Graph()
        new_G.add_nodes_from(self.nodes)
        new_G.add_weighted_edges_from(edges)

        idx = 0
        clustering = dict()
        for cc in nx.connected_components(new_G):
            # logger.info("idx =", idx, "cc =", list(cc))
            clustering[idx] = set(cc)
            idx += 1

        node2cid = ct.build_node_to_cluster_mapping(clustering)
        return clustering, node2cid
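
The clustering built here is just the set of connected components of the rebuilt graph, keyed by an integer cluster id. That step in isolation, as a minimal networkx sketch:

# Minimal sketch of the connected-components-to-clustering step used above.
import networkx as nx

g = nx.Graph()
g.add_nodes_from(['a', 'b', 'c', 'd'])
g.add_weighted_edges_from([('a', 'b', 3), ('c', 'd', 2)])

clustering = {idx: set(cc) for idx, cc in enumerate(nx.connected_components(g))}
# e.g. {0: {'a', 'b'}, 1: {'c', 'd'}} (component order is not guaranteed)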
Example #15
def lca_alg1_constrained(curr_G, in_same=[], in_different=[], trace_on=False):
    """
    Use algorithm 1 to find the best clustering of the current
    subgraph subject to the constraints that all pairs of nodes from
    in_same must be in the same cluster and all pairs of nodes from
    in_different must be in different clusters.

    This does not check that the constraints from in_same and
    in_different can all be satisfied. In implementation the in_same
    constraints take precedence, but in use, one of the two in_same
    and in_different lists will be empty.
    """
    clustering = build_initial_from_constraints(curr_G, in_same)
    node2cid = ct.build_node_to_cluster_mapping(clustering)

    neg_edges, pos_edges = ct.get_weight_lists(curr_G, sort_positive=True)
    G_prime = nx.Graph()
    G_prime.add_nodes_from(curr_G)
    G_prime.add_weighted_edges_from(neg_edges)

    edges = [(p[0], p[1], curr_G[p[0]][p[1]]['weight']) for p in in_same]
    G_prime.add_weighted_edges_from(edges)
    score = ct.clustering_score(G_prime, node2cid)

    if trace_on:
        logger.info('=================================')
        logger.info('=====  lca_alg1_constrained  ====')
        logger.info('=================================')
        ct.print_structures(G_prime, clustering, node2cid, score)

    for e in pos_edges:
        if trace_on:
            logger.info('=======================')
            logger.info('Start of next iteration')
            logger.info('=======================')

        if e[0] < e[1]:
            n0, n1 = e[0], e[1]
        else:
            n1, n0 = e[0], e[1]

        if (n0, n1) in in_same:
            if trace_on:
                logger.info('Skipping (%a, %a) because already in graph' %
                            (n0, n1))
            continue

        wgt = e[2]
        n0_cid, n1_cid = node2cid[n0], node2cid[n1]
        if trace_on:
            logger.info('n0=%s, n1=%s, wgt=%a, n0_cid=%a, n1_cid=%a' %
                        (n0, n1, wgt, n0_cid, n1_cid))

        if n0_cid == n1_cid:
            if trace_on:
                logger.info('Already in the same cluster')
            score += wgt

        elif keep_separate(clustering[n0_cid], clustering[n1_cid],
                           in_different):
            if trace_on:
                logger.info('Must be kept separate')
            score -= wgt

        elif not ct.has_edges_between_them(G_prime, clustering[n0_cid],
                                           clustering[n1_cid]):
            if trace_on:
                logger.info('Merging disjoint clusters')
            sc_delta = ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering,
                                         node2cid)
            assert sc_delta == 0
            score += sc_delta + wgt

        else:
            sc_merged = (ct.score_delta_after_merge(n0_cid, n1_cid, G_prime,
                                                    clustering) + wgt)
            if trace_on:
                logger.info('sc_merged=%a' % sc_merged)
            sc_unmerged = -wgt
            if trace_on:
                logger.info('sc_unmerged=%a' % sc_unmerged)

            if sc_merged > sc_unmerged:
                ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering,
                                  node2cid)
                score += sc_merged
                if trace_on:
                    logger.info('Merging clusters with edges between')
            else:
                score += sc_unmerged
                if trace_on:
                    logger.info('No merge of clusters with edges between ')

        G_prime.add_weighted_edges_from([e])
        if trace_on:
            ct.print_structures(G_prime, clustering, node2cid, score)

    return clustering, score
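
A small usage sketch of the constrained search, assuming cluster_tools (ct) and the helpers above are in scope: forcing two positively connected nodes into different clusters keeps them apart even though merging them would score higher.

# Minimal sketch; node names and weights are illustrative only.
import networkx as nx

g = nx.Graph()
g.add_weighted_edges_from([('a', 'b', 4), ('b', 'c', 3), ('a', 'c', -1)])

clustering, score = lca_alg1_constrained(g, in_same=[], in_different=[('a', 'b')])
# Per the docstring, no cluster may contain both 'a' and 'b'.
assert all(not {'a', 'b'} <= cluster for cluster in clustering.values())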
    def __init__(self, edges, clustering):
        super().__init__()
        self.edge_graph = nx.Graph()
        self.add_edges(edges)
        self.clustering = clustering
        self.node_to_cid = ct.build_node_to_cluster_mapping(self.clustering)
Example #17
def lca_alg2(G, clustering, node2cid, trace_on=False):
    """
    If the current clustering is a single cluster, then stop the original
    algorithm when there are two clusters.  Perhaps the alternative can be
    run multiple times.

    If there are multiple clusters, then one option is a merge, but
    others are added based on inconsistent edges.

    Don't allow len(G) <= 1.  If it is two and the nodes are
    disconnected, there is also no alternative.  If it is two and they
    are connected, then splitting vs. merging is the alternative.
    """
    assert len(G) >= 2

    if len(G) == 2:
        return best_alternative_len2(G, clustering, node2cid)
    """ Form the first estimate of the best alternative.  If there is just
        one cluster in the current (local) best clustering then rerun
        Alg1 constrained to stop at at most two.  Otherwise, just form
        a single clustering.
    """
    if len(clustering) == 1:
        best_clustering, best_score = a1.lca_alg1(G, stop_at_two=True)
        best_node2cid = ct.build_node_to_cluster_mapping(best_clustering)
    else:
        best_clustering = {0: set(G.nodes())}
        best_node2cid = {n: 0 for n in G.nodes()}
        best_score = ct.clustering_score(G, best_node2cid)

    if trace_on:
        logger.info('In lca_alg2, before checking inconsistent\n'
                    'best_clustering %a, best_score %d, checking %d' %
                    (best_clustering, best_score,
                     ct.clustering_score(G, best_node2cid)))

    inconsistent = inconsistent_edges(G, clustering, node2cid)
    inconsistent.sort(key=lambda e: abs(e[2]), reverse=True)
    if trace_on:
        logger.info('In lca_alg2: clustering %s' % (clustering, ))
        logger.info('In lca_alg2: inconsistent edges %s' % (inconsistent, ))
        logger.info('Starting inconsistent edge loop')

    for e in inconsistent:
        if trace_on:
            logger.info('e = %s' % (e, ))
        if e[2] < 0:
            if trace_on:
                logger.info('Forcing edge into different clusters')
            new_clustering, new_score = lca_alg1_constrained(G,
                                                             in_same=[],
                                                             in_different=[
                                                                 (e[0], e[1])
                                                             ])
        else:
            if trace_on:
                logger.info('Forcing edge into same cluster')
            new_clustering, new_score = lca_alg1_constrained(G,
                                                             in_same=[(e[0],
                                                                       e[1])],
                                                             in_different=[])

        if trace_on:
            logger.info('Best score returned by lca_alg1_constrained is %s' %
                        (new_score, ))
            logger.info(
                'Checking %a' %
                (ct.clustering_score(
                    G, ct.build_node_to_cluster_mapping(new_clustering)), ))
        if new_score > best_score:
            if trace_on:
                logger.info('New best')
            best_score = new_score
            best_clustering = new_clustering

    return best_clustering, best_score
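
A minimal usage sketch of lca_alg2, assuming cluster_tools (ct) is in scope: given a single-cluster local clustering, it returns the best alternative clustering and its score, seeded by a stop-at-two run of lca_alg1 and refined by flipping inconsistent edges through lca_alg1_constrained.

# Minimal sketch; the graph and weights are illustrative only.
import networkx as nx

g = nx.Graph()
g.add_weighted_edges_from([('a', 'b', 6), ('b', 'c', -2), ('a', 'c', 5)])

clustering = {0: {'a', 'b', 'c'}}            # current local best: one cluster
node2cid = ct.build_node_to_cluster_mapping(clustering)
alt_clustering, alt_score = lca_alg2(g, clustering, node2cid)
# alt_clustering is the best clustering that differs from the current one,
# e.g. a two-cluster split suggested by the negative ('b', 'c') edge.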
Example #18
def lca_alg1(curr_G, stop_at_two=False, trace_on=False):
    if len(curr_G) == 0:
        return {}, 0
    elif len(curr_G) == 1:
        clustering = {0: set(curr_G.nodes())}
        return clustering, 0

    neg_edges, pos_edges = ct.get_weight_lists(curr_G, sort_positive=True)
    clustering = {c: {n} for c, n in enumerate(sorted(curr_G.nodes()))}
    node2cid = ct.build_node_to_cluster_mapping(clustering)

    G_prime = nx.Graph()
    G_prime.add_nodes_from(curr_G)
    G_prime.add_weighted_edges_from(neg_edges)
    score = ct.clustering_score(G_prime, node2cid)

    if trace_on:
        logger.info('====================')
        logger.info('====  lca_alg1  ====')
        logger.info('====================')
        ct.print_structures(G_prime, clustering, node2cid, score)

    for e in pos_edges:
        if trace_on:
            logger.info('=======================')
            logger.info('Start of next iteration')
            logger.info('=======================')
        if e[0] < e[1]:
            n0, n1 = e[0], e[1]
        else:
            n1, n0 = e[0], e[1]
        wgt = e[2]
        n0_cid, n1_cid = node2cid[n0], node2cid[n1]
        if trace_on:
            logger.info('n0=%s, n1=%s, wgt=%a, n0_cid=%a, n1_cid=%a' %
                        (n0, n1, wgt, n0_cid, n1_cid))

        is_merge_allowed = not stop_at_two or len(clustering) > 2
        if trace_on:
            logger.info('is_merge_allowed %s' % (is_merge_allowed, ))

        if n0_cid == n1_cid:
            if trace_on:
                logger.info('In the same cluster')
            score += wgt
        elif is_merge_allowed and not ct.has_edges_between_them(
                G_prime, clustering[n0_cid], clustering[n1_cid]):
            if trace_on:
                logger.info('Merging disjoint clusters')
            sc_delta = ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering,
                                         node2cid)
            assert sc_delta == 0
            score += sc_delta + wgt  # why might sc_delta be non-zero here???
        else:
            sc_merged = (ct.score_delta_after_merge(n0_cid, n1_cid, G_prime,
                                                    clustering) + wgt)
            if trace_on:
                logger.info('sc_merged=%a' % sc_merged)
            sc_unmerged = -wgt
            if trace_on:
                logger.info('sc_unmerged=%a' % sc_unmerged)
            if len(clustering[n0_cid]) == 1 or len(clustering[n1_cid]) == 1:
                sc_n0_to_n1 = sc_n1_to_n0 = min(sc_merged, sc_unmerged) - 9999
                n0_to_move = n1_to_move = []
                if trace_on:
                    logger.info('not checking moving nodes because '
                                'at least one cluster is length 1')
            else:
                sc_n0_to_n1, n0_to_move = best_shift(n0,
                                                     n1,
                                                     G_prime,
                                                     clustering,
                                                     node2cid,
                                                     trace_on=trace_on)
                sc_n0_to_n1 += wgt
                if trace_on:
                    logger.info('sc_n0_to_n1=%a, n0_to_move=%a' %
                                (sc_n0_to_n1, n0_to_move))
                sc_n1_to_n0, n1_to_move = best_shift(n1,
                                                     n0,
                                                     G_prime,
                                                     clustering,
                                                     node2cid,
                                                     trace_on=trace_on)
                sc_n1_to_n0 += wgt
                if trace_on:
                    logger.info('sc_n1_to_n0=%a, n1_to_move=%a' %
                                (sc_n1_to_n0, n1_to_move))

            if is_merge_allowed and sc_merged >= max(sc_unmerged, sc_n0_to_n1,
                                                     sc_n1_to_n0):
                ct.merge_clusters(n0_cid, n1_cid, G_prime, clustering,
                                  node2cid)
                score += sc_merged
                if trace_on:
                    logger.info('Choose merge')
            elif sc_unmerged >= max(sc_n0_to_n1, sc_n1_to_n0):
                score += sc_unmerged
                if trace_on:
                    logger.info('Choose unmerged - unchanged')
            elif sc_n0_to_n1 >= sc_n1_to_n0:
                ct.shift_between_clusters(n0_cid, n0_to_move, n1_cid,
                                          clustering, node2cid)
                score += sc_n0_to_n1
                if trace_on:
                    logger.info('Choose to shift from cluster %a to %a' %
                                (n0_cid, n1_cid))
            else:
                ct.shift_between_clusters(n1_cid, n1_to_move, n0_cid,
                                          clustering, node2cid)
                score += sc_n1_to_n0
                if trace_on:
                    logger.info('Choose to shift from cluster %a to %a' %
                                (n1_cid, n0_cid))
        G_prime.add_weighted_edges_from([e])
        if trace_on:
            ct.print_structures(G_prime, clustering, node2cid, score)

    return clustering, score
Example #19
    def generate(self):
        expected_nodes = 1 + self.params['gamma_shape'] * self.params['gamma_scale']

        digits_per_node = 2 + int(m.log10(expected_nodes))
        next_index = 0
        nodes = []  # list of node ids
        edges = []  # list of edge 3-tuples

        num_correct_positive = 0
        num_correct_negative = 0
        num_correct_zero = 0
        num_incorrect_positive = 0
        num_incorrect_negative = 0
        num_incorrect_zero = 0

        """
        Step 0:
        """
        samples = np.random.gamma(
            self.params['gamma_shape'],
            self.params['gamma_scale'],
            self.params['num_clusters'],
        )
        samples = 1 + np.round(samples)
        samples = samples.astype(int)

        """
        Step 1:
        Generate the clusters, the nodes within the cluster, and the
        "correct" inter-cluster edges.  Note that since we are
        assuming an imperfect ranking algorithm, this does not ensure
        that each cluster is connected.
        """
        num_from_ranker = self.params['num_from_ranker']
        cids = ct.cids_from_range(len(samples), prefix='ct')
        for i, cid in enumerate(cids):
            self.gt_clustering[cid] = list()

            n = samples[i]

            # Create the nodes in the cluster
            for i in range(n):
                node_id = 'n' + str(next_index).zfill(digits_per_node)
                next_index += 1
                nodes.append(node_id)
                self.gt_clustering[cid].append(node_id)
                self.gt_node2cid[node_id] = cid
                self.ranker_matches[node_id] = set()

            #  Create the positive edges between nodes in a cluster.
            #  These are symmetric. Don't allow more than num_from_ranker
            #  matches / edges for any node.
            for i, ith_node in enumerate(self.gt_clustering[cid]):
                for j in range(i + 1, len(self.gt_clustering[cid])):
                    prob = random.uniform(0, 1)
                    jth_node = self.gt_clustering[cid][j]
                    if prob < self.params['p_ranker_correct'] and \
                       len(self.ranker_matches[ith_node]) < num_from_ranker and \
                       len(self.ranker_matches[jth_node]) < num_from_ranker:
                        self.ranker_matches[ith_node].add(jth_node)
                        self.ranker_matches[jth_node].add(ith_node)
                        is_match_correct = True
                        wgt = self.wgtr.random_wgt(is_match_correct)
                        if wgt > 0:
                            num_correct_positive += 1
                        elif wgt == 0:
                            num_correct_zero += 1
                        else:
                            num_correct_negative += 1

                        e = (ith_node, jth_node, wgt)
                        edges.append(e)

        assert num_from_ranker > 0
        num_nodes = len(nodes)

        # Change each cluster's node list to a set
        self.gt_clustering = {
            cid: set(cluster) for cid, cluster in self.gt_clustering.items()
        }

        """
        Step 2:
        Generate "incorrect" match edges, sufficient to have the required
        number of edges generated by the ranking algorithm.
        """
        for i, ith_node in enumerate(nodes):
            matches = self.ranker_matches[ith_node]
            cid = self.gt_node2cid[ith_node]
            cluster = set(self.gt_clustering[cid])

            """
            Generate (incorrect) edges between clusters
            """
            is_match_correct = False
            while len(matches) < num_from_ranker:
                j = random.randint(0, num_nodes - 1)
                jth_node = nodes[j]
                if jth_node not in matches and jth_node not in cluster:
                    matches.add(jth_node)
                    wgt = self.wgtr.random_wgt(is_match_correct)
                    if wgt > 0:
                        num_incorrect_positive += 1
                    elif wgt == 0:
                        num_incorrect_zero += 1
                    else:
                        num_incorrect_negative += 1

                    if ith_node < jth_node:
                        e = (ith_node, jth_node, wgt)
                    else:
                        e = (jth_node, ith_node, wgt)
                    edges.append(e)

        self.G.add_weighted_edges_from(edges)
        logging.info('simulator::generate: adding %d edges' % len(edges))
        logging.info('%d correct match edges have positive weight' % num_correct_positive)
        logging.info('%d correct match edges have zero weight' % num_correct_zero)
        logging.info('%d correct match edges have negative weight' % num_correct_negative)
        logging.info(
            '%d incorrect match edges have positive weight' % num_incorrect_positive
        )
        logging.info('%d incorrect match edges have zero weight' % num_incorrect_zero)
        logging.info(
            '%d incorrect match edges have negative weight' % num_incorrect_negative
        )

        self.G_orig.add_nodes_from(self.G)
        self.G_orig.add_weighted_edges_from(edges)

        """
        Step 3: Generate the "reachable" ground truth, the obtainable
        result given simulated failures to match that could disconnect
        a correct match.
        """
        self.r_clustering = dict()
        k = 0
        for cc in self.gt_clustering.values():
            H = self.G.subgraph(cc)
            prev_k = k
            for new_cc in nx.connected_components(H):
                self.r_clustering[k] = new_cc
                k += 1
            if k - prev_k > 1:
                logger.info('GT cluster %a split into %a ...' % (cc, k - prev_k))
                for i in range(prev_k, k):
                    logger.info('   %a' % self.r_clustering[i])
            else:
                logger.info('GT cluster %a is intact' % cc)
        self.r_node2cid = ct.build_node_to_cluster_mapping(self.r_clustering)

        """
        Step 4: Reconfigure the edges to match the expected input to the
        graph algorithm's weight manager.
        """
        aug_names = ['verifier', 'human']
        edges = [(n0, n1, w, aug_names[0]) for n0, n1, w in edges]

        return edges, aug_names
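
Step 0 above draws one size per cluster from a gamma distribution and shifts it so every cluster has at least one node. That sampling step in isolation, as a small numpy sketch (the shape, scale, and count values below are placeholders, not the simulator's defaults):

# Minimal sketch of the Step 0 cluster-size sampling.
import numpy as np

gamma_shape, gamma_scale, num_clusters = 2.0, 1.5, 5
sizes = np.random.gamma(gamma_shape, gamma_scale, num_clusters)
sizes = (1 + np.round(sizes)).astype(int)    # at least one node per cluster
print(sizes)                                 # e.g. [4 2 3 1 6]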
Example #20
    def set_to_clusters(self, to_clusters, to_score):
        self.to_clusters = to_clusters
        self.to_score = to_score
        self.to_n2c = ct.build_node_to_cluster_mapping(self.to_clusters)
        self.inconsistent = []