Beispiel #1
0
def idrnnclus(
    ref,
    modelin: IO,
    assign_out: TextIO,
    thresh: float,
):
    """
    Force a one-to-one match between reference groups and fitted clusters.

    A pickled estimator is read from ``modelin``. The reference embeddings
    are queried against the index of its ``pynndescenttransformer`` step, and
    the query result is combined with the labels of its ``rnndbscan`` step
    (via ``build_dists`` and ``min_pool_sparse``, both defined elsewhere)
    into the input of a minimum-weight full bipartite matching.

    The matched pairs are written to ``assign_out`` as CSV: a ``label,clus``
    header followed by one ``<reference label>,c<cluster index>`` row per pair.
    """
    # NOTE(review): unpickling can execute arbitrary code, so ``modelin`` must
    # only ever come from a trusted source.
    estimator = pickle.load(modelin)
    steps = estimator.named_steps

    # Query the 32 nearest neighbours of every reference embedding.
    neighbour_ids, neighbour_dists = steps["pynndescenttransformer"].index_.query(
        ref.ref_embeddings, k=32
    )

    labels = steps["rnndbscan"].labels_
    distinct = np.unique(labels)
    # np.unique returns sorted values, so a -1 label can only sit first;
    # it is not counted as a cluster (presumably the noise label -- confirm).
    num_clusts = len(distinct)
    if distinct[0] == -1:
        num_clusts -= 1

    costs = build_dists(neighbour_ids, neighbour_dists, labels, num_clusts, thresh)
    costs = min_pool_sparse(costs, ref.ref_group_sizes)
    matched_refs, matched_clusters = min_weight_full_bipartite_matching(costs)

    assign_out.write("label,clus\n")
    names = list(ref.labels())
    for ref_pos, cluster_pos in zip(matched_refs, matched_clusters):
        assign_out.write(f"{names[ref_pos]},c{cluster_pos}\n")
def test_two_methods_give_same_result_on_many_sparse_inputs():
    """Cross-check the sparse matcher against the dense assignment solver.

    No expected output is spelled out. For 100 seeded random 100x100 inputs,
    ``linear_sum_assignment`` and ``min_weight_full_bipartite_matching`` must
    either both raise ``ValueError`` or both return assignments with the same
    total cost.
    """
    np.random.seed(1234)

    def draw_weights(size):
        # Integer weights drawn from [1, 100): a stored entry is never zero.
        return np.random.randint(1, 100, size)

    for _ in range(100):
        graph = random(100, 100, density=0.06, data_rvs=draw_weights)

        # In csgraph a zero means "no edge"; the dense solver needs that
        # spelled out, so every position without a stored entry costs inf.
        costs = np.full(graph.shape, np.inf)
        costs[graph.row, graph.col] = graph.data
        graph = graph.tocsr()

        # A total of None records that the solver raised ValueError.
        try:
            rows, cols = linear_sum_assignment(costs)
            dense_total = costs[rows, cols].sum()
        except ValueError:
            dense_total = None

        try:
            rows, cols = min_weight_full_bipartite_matching(graph)
            sparse_total = graph[rows, cols].sum()
        except ValueError:
            sparse_total = None

        # Ensure that if one method raises, so does the other one.
        assert (dense_total is None) == (sparse_total is None)
        if dense_total is not None:
            assert dense_total == sparse_total
Beispiel #3
0
def test_explicit_zero_causes_warning():
    """Matching a matrix that stores an explicit zero emits a UserWarning."""
    # CSR triple for a 2x2 matrix: row 0 stores 2 at column 0 and an explicit
    # 0 at column 1; row 1 stores 3 at column 1.
    data = (2, 0, 3)
    indices = (0, 1, 1)
    indptr = (0, 2, 3)
    with pytest.warns(UserWarning):
        graph = csr_matrix((data, indices, indptr))
        min_weight_full_bipartite_matching(graph)
Beispiel #4
0
def test_min_weight_full_matching_infeasible_problems(biadjacency_matrix):
    """The given biadjacency matrix must be rejected with ValueError."""
    with pytest.raises(ValueError):
        # The conversion stays inside the context, so a ValueError raised
        # while building the CSR matrix is accepted as well.
        graph = csr_matrix(biadjacency_matrix)
        min_weight_full_bipartite_matching(graph)
Beispiel #5
0
def test_min_weight_full_matching_trivial_graph(num_rows, num_cols):
    """An edgeless biadjacency matrix of the given shape matches nothing.

    The matrix is built from a shape alone, so it stores no entries; the
    matching is required to come back with empty row and column indices.
    """
    # csr_matrix takes its shape as (rows, columns), in that order.
    biadjacency_matrix = csr_matrix((num_rows, num_cols))
    row_ind, col_ind = min_weight_full_bipartite_matching(biadjacency_matrix)
    assert len(row_ind) == 0
    assert len(col_ind) == 0
 def time_evaluation(self, *args):
     """Run one matching on the stored biadjacency matrix.

     Any extra positional arguments are accepted and ignored, and the
     matching result is discarded.
     """
     graph = self.biadjacency_matrix
     min_weight_full_bipartite_matching(graph)