예제 #1
0
def test_shc_custom_affinity():
    """Check that SHC accepts a callable as its affinity.

    Data is generated with ``affinity=False`` and ``euclidean_distances``
    is passed as the affinity function; four clusters of 25 samples each
    are expected.
    """
    features, _ = generate_data(supervised=False, affinity=False)
    model = ScipyHierarchicalClustering(n_clusters=4,
                                        affinity=euclidean_distances)
    cluster_sizes = np.bincount(model.fit_predict(features))
    assert_array_equal([25, 25, 25, 25], cluster_sizes)
예제 #2
0
def hcluster(X, attrs):
    """
    Run hierarchical clustering on ``X`` and return its scatter plot data.

    ``attrs`` supplies the settings: ``'kNumber'`` (converted to ``int`` and
    used as the number of clusters), ``'distance'`` (forwarded as the
    ``method`` argument) and ``'affinity'``.

    The fitted clusterer is handed to ``save_clusterer`` and the result of
    ``scatterplot(X, labels, n_clusters)`` is returned.

    Exporting the linkage matrix as a nested
    ``{'children': [...], 'name': ..., 'value': ...}`` tree through
    ``HClusterTree`` is currently disabled.
    """
    k = int(attrs['kNumber'])
    linkage_method = attrs['distance']
    affinity = attrs['affinity']

    model = ScipyHierarchicalClustering(n_clusters=k,
                                        method=linkage_method,
                                        affinity=affinity)
    model.fit(X)
    assignments = model.labels_

    save_clusterer(model)
    return scatterplot(X, assignments, k)
예제 #3
0
def hcluster(X, attrs):
    """
    Run hierarchical clustering on ``X`` and return its scatter plot data.

    ``attrs`` supplies the settings: ``'kNumber'`` (converted to ``int`` and
    used as the number of clusters), ``'distance'`` (forwarded as the
    ``method`` argument) and ``'affinity'``.

    The result of ``scatterplot(X, labels, n_clusters)`` is returned.

    Two optional steps are currently disabled: persisting the fitted
    clusterer with ``save_clusterer`` and exporting the linkage matrix as a
    nested ``{'children': [...], 'name': ..., 'value': ...}`` tree through
    ``HClusterTree``.
    """
    k = int(attrs['kNumber'])
    linkage_method = attrs['distance']
    affinity = attrs['affinity']

    model = ScipyHierarchicalClustering(n_clusters=k,
                                        method=linkage_method,
                                        affinity=affinity)
    model.fit(X)

    return scatterplot(X, model.labels_, k)
예제 #4
0
def test_shc_precomputed_distance():
    """Check SHC with ``affinity="precomputed"``.

    Data is generated with ``affinity=True`` and passed straight to the
    clusterer; four clusters of 25 samples each are expected.
    """
    distances, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(n_clusters=4,
                                        affinity="precomputed")
    cluster_sizes = np.bincount(model.fit_predict(distances))
    assert_array_equal([25, 25, 25, 25], cluster_sizes)
예제 #5
0
def test_shc_precomputed_distance():
    """Check SHC with ``affinity="precomputed"``.

    Data is generated with ``affinity=True`` and passed straight to the
    clusterer; four clusters of 25 samples each are expected.
    """
    distances, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(n_clusters=4,
                                        affinity="precomputed")
    cluster_sizes = np.bincount(model.fit_predict(distances))
    assert_array_equal([25, 25, 25, 25], cluster_sizes)
예제 #6
0
def test_shc_custom_affinity():
    """Check that SHC accepts a callable as its affinity.

    Data is generated with ``affinity=False`` and ``euclidean_distances``
    is passed as the affinity function; four clusters of 25 samples each
    are expected.
    """
    features, _ = generate_data(supervised=False, affinity=False)
    model = ScipyHierarchicalClustering(n_clusters=4,
                                        affinity=euclidean_distances)
    cluster_sizes = np.bincount(model.fit_predict(features))
    assert_array_equal([25, 25, 25, 25], cluster_sizes)
예제 #7
0
def test_shc_unsupervised_scoring_data_raw():
    """Check unsupervised SHC when ``scoring_data="raw"``.

    No ``n_clusters`` is given; a silhouette score using the euclidean
    metric is supplied as ``scoring``.  Four clusters of 25 samples each
    are expected.
    """
    features, _ = generate_data(supervised=False, affinity=False)
    silhouette = partial(silhouette_score, metric="euclidean")
    model = ScipyHierarchicalClustering(scoring_data="raw",
                                        scoring=silhouette,
                                        affinity=euclidean_distances)
    cluster_sizes = np.bincount(model.fit_predict(features))
    assert_array_equal([25, 25, 25, 25], cluster_sizes)
예제 #8
0
def test_shc_unsupervised_scoring_data_None():
    """Check unsupervised SHC when ``scoring_data`` is left unset.

    The scoring callable takes only the predicted labels and always
    returns ``-inf``; a single cluster holding all 100 samples is expected.
    """
    features, _ = generate_data(supervised=False, affinity=False)

    def _scoring(labels_pred):
        # Constant score, independent of the labelling it is given.
        return -np.inf

    model = ScipyHierarchicalClustering(scoring=_scoring,
                                        affinity=euclidean_distances)
    cluster_sizes = np.bincount(model.fit_predict(features))
    assert_array_equal([100], cluster_sizes)
예제 #9
0
def test_shc_threshold():
    """Check that SHC labels stay valid after setting ``threshold``.

    The clusterer is fitted with ``n_clusters=4``; afterwards the threshold
    is set from the fitted ``linkage_`` and ``labels_`` is read again
    without refitting.  Four clusters of 25 samples each are expected.
    """
    distances, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(n_clusters=4,
                                        scoring_data="affinity",
                                        affinity="precomputed")
    model.fit_predict(distances)

    # Entry [-4, 2] of the linkage matrix; presumably the merge distance of
    # the fourth-from-last merge (SciPy linkage layout) -- TODO confirm.
    cut = model.linkage_[-4, 2]
    model.set_params(threshold=cut)

    assert_array_equal([25, 25, 25, 25], np.bincount(model.labels_))
예제 #10
0
def test_shc_n_clusters():
    """Check that ``n_clusters`` can be changed on a fitted SHC.

    After fitting with 4 clusters, ``set_params(n_clusters=10)`` is applied
    and ``labels_`` is expected to hold 10 distinct labels without an
    explicit refit.
    """
    distances, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(n_clusters=4,
                                        affinity="precomputed")

    initial = model.fit_predict(distances)
    assert_equal(len(np.unique(initial)), 4)

    model.set_params(n_clusters=10)
    updated = model.labels_
    assert_equal(len(np.unique(updated)), 10)
예제 #11
0
def test_shc_n_clusters():
    """Check that ``n_clusters`` can be changed on a fitted SHC.

    After fitting with 4 clusters, ``set_params(n_clusters=10)`` is applied
    and ``labels_`` is expected to hold 10 distinct labels without an
    explicit refit.
    """
    distances, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(n_clusters=4,
                                        affinity="precomputed")

    initial = model.fit_predict(distances)
    assert_equal(len(np.unique(initial)), 4)

    model.set_params(n_clusters=10)
    updated = model.labels_
    assert_equal(len(np.unique(updated)), 10)
예제 #12
0
def test_shc_semi_supervised_scoring_data_none():
    """Check semi-supervised SHC when ``scoring_data`` is left unset.

    The scoring callable takes the true and predicted labels and returns
    their B3 F-score.  All four clusters of 25 samples are expected.
    """
    features, targets = generate_data(supervised=True, affinity=False)

    def _scoring(labels_true, labels_pred):
        return b3_f_score(labels_true, labels_pred)

    # All 4 clusters should be recovered.
    model = ScipyHierarchicalClustering(scoring=_scoring)
    model.fit(features, targets)
    assert_array_equal([25, 25, 25, 25], np.bincount(model.labels_))
예제 #13
0
def test_shc_semi_supervised_scoring_data_raw():
    """Check semi-supervised SHC when ``scoring_data="raw"``.

    The scoring callable asserts that its first argument has the same
    shape as the data passed to ``fit`` and returns the B3 F-score of the
    true and predicted labels.  Four clusters of 25 samples are expected.
    """
    features, targets = generate_data(supervised=True, affinity=False)

    def _scoring(X_raw, labels_true, labels_pred):
        assert X_raw.shape == features.shape
        return b3_f_score(labels_true, labels_pred)

    model = ScipyHierarchicalClustering(scoring_data="raw",
                                        scoring=_scoring)
    model.fit(features, targets)
    assert_array_equal([25, 25, 25, 25], np.bincount(model.labels_))
예제 #14
0
def test_shc_semi_supervised_scoring_data_affinity():
    """Check semi-supervised SHC when ``scoring_data="affinity"``.

    Two runs are made, one on a feature matrix with a callable affinity
    and one on a precomputed affinity matrix.  In both, the scoring
    callable asserts that its first argument is square and returns the B3
    F-score; four clusters of 25 samples are expected each time.
    """
    expected_sizes = [25, 25, 25, 25]

    # Run 1: feature matrix in, affinity computed by euclidean_distances.
    features, feature_targets = generate_data(supervised=True,
                                              affinity=False)

    def _scoring1(X_affinity, labels_true, labels_pred):
        shape = X_affinity.shape
        assert shape[0] == shape[1]
        # The scorer must not receive the input matrix itself.
        assert shape != features.shape
        return b3_f_score(labels_true, labels_pred)

    model = ScipyHierarchicalClustering(affinity=euclidean_distances,
                                        scoring_data="affinity",
                                        scoring=_scoring1)
    model.fit(features, feature_targets)
    assert_array_equal(expected_sizes, np.bincount(model.labels_))

    # Run 2: affinity matrix in, used as-is.
    distances, distance_targets = generate_data(supervised=True,
                                                affinity=True)

    def _scoring2(X_affinity, labels_true, labels_pred):
        shape = X_affinity.shape
        assert shape[0] == shape[1]
        # Here the scorer sees a matrix shaped like the input.
        assert shape == distances.shape
        return b3_f_score(labels_true, labels_pred)

    model = ScipyHierarchicalClustering(affinity="precomputed",
                                        scoring_data="affinity",
                                        scoring=_scoring2)
    model.fit(distances, distance_targets)
    assert_array_equal(expected_sizes, np.bincount(model.labels_))
예제 #15
0
def test_onthefly_labels():
    """Check that ``labels_`` follows parameter changes made after fitting.

    A block clusterer is fitted with a one-cluster base estimator, so all
    100 samples share a label.  After ``n_clusters`` of the first fitted
    sub-clusterer is set to 4, ``labels_`` is expected to show four
    clusters of 25 samples without refitting.
    """
    base = ScipyHierarchicalClustering(method="complete", n_clusters=1)
    blocks = BlockClustering(base_estimator=base)
    blocks.fit(X)
    assert_array_equal([100], np.bincount(blocks.labels_))

    blocks.clusterers_[0].set_params(n_clusters=4)
    assert_array_equal([25, 25, 25, 25], np.bincount(blocks.labels_))
예제 #16
0
def test_shc_validation():
    """Check that invalid SHC settings are rejected with ``ValueError``.

    Each parameter set below is used to build a clusterer that is then
    fitted; building plus fitting must raise ``ValueError``.
    """
    X, _ = generate_data(supervised=False, affinity=False)

    invalid_settings = (
        {"n_clusters": len(X) + 1},      # more clusters than samples
        {"n_clusters": -1},              # negative cluster count
        {"scoring_data": "affinity"},    # scoring_data without other options
    )

    for params in invalid_settings:
        with pytest.raises(ValueError):
            ScipyHierarchicalClustering(**params).fit_predict(X)
예제 #17
0
 def fit(self, n_jobs=8):
     """Build the block clusterer and fit it on ``self.X`` / ``self.y``.

     The fitted ``BlockClustering`` instance is stored on
     ``self.clusterer``; nothing is returned.

     :param n_jobs: int
         Forwarded to ``BlockClustering`` as ``n_jobs``.
     """
     blocking = self.block_function
     base_estimator = ScipyHierarchicalClustering(
         affinity=_affinity,
         threshold=self.clustering_threshold,
         method=self.clustering_method,
         supervised_scoring=b3_f_score)

     self.clusterer = BlockClustering(blocking=blocking,
                                      base_estimator=base_estimator,
                                      n_jobs=n_jobs,
                                      verbose=True)
     self.clusterer.fit(self.X, self.y)
예제 #18
0
def test_shc_threshold():
    """Check how ``threshold`` and ``n_clusters`` interact on a fitted SHC.

    First part: with ``n_clusters=2`` set, changing the threshold must not
    change the labels.  Second part: with ``best_threshold_precedence``
    disabled and ``n_clusters`` cleared, thresholds taken from the fitted
    ``linkage_`` are expected to give 5 and then 4 distinct labels.
    """
    distances, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(n_clusters=2,
                                        affinity="precomputed")

    # While n_clusters is set it wins over any threshold.
    before = model.fit_predict(distances)
    model.set_params(threshold=model.linkage_[-4, 2])
    after = model.labels_
    assert_array_equal(before, after)
    assert_equal(len(np.unique(before)), 2)

    # With n_clusters cleared, the threshold decides the labelling.
    model.set_params(best_threshold_precedence=False)
    model.set_params(n_clusters=None,
                     threshold=model.linkage_[-5, 2])
    assert_equal(len(np.unique(model.labels_)), 5)

    model.set_params(threshold=model.linkage_[-4, 2])
    assert_equal(len(np.unique(model.labels_)), 4)
예제 #19
0
def test_fit(n_jobs):
    """Exercise ``BlockClustering.fit`` under several blocking setups.

    Covers a single block, precomputed blocks, precomputed blocks combined
    with a precomputed affinity matrix, and a custom blocking callable.
    Every setup is expected to end with four clusters of 25 samples.

    ``n_jobs`` is forwarded to ``BlockClustering`` in the first three
    setups; the custom-blocking setup does not pass it.
    """
    expected_sizes = [25, 25, 25, 25]

    # --- Single block -----------------------------------------------------
    base = AgglomerativeClustering(n_clusters=4, linkage="complete")
    blocks = BlockClustering(blocking="single",
                             base_estimator=base,
                             n_jobs=n_jobs)
    blocks.fit(X)

    assert_equal(len(blocks.clusterers_), 1)
    assert_array_equal(expected_sizes, np.bincount(blocks.labels_))

    # --- Precomputed blocks -----------------------------------------------
    base = AgglomerativeClustering(n_clusters=2, linkage="complete")
    blocks = BlockClustering(blocking="precomputed",
                             base_estimator=base,
                             n_jobs=n_jobs)
    blocks.fit(X, blocks=(y <= 1))

    assert_equal(len(blocks.clusterers_), 2)
    assert_array_equal(expected_sizes, np.bincount(blocks.labels_))

    # --- Precomputed affinity ---------------------------------------------
    base = ScipyHierarchicalClustering(affinity="precomputed",
                                       n_clusters=2,
                                       method="complete")
    blocks = BlockClustering(affinity="precomputed",
                             blocking="precomputed",
                             base_estimator=base,
                             n_jobs=n_jobs)
    pairwise = euclidean_distances(X)
    blocks.fit(pairwise, blocks=(y <= 1))

    assert_equal(len(blocks.clusterers_), 2)
    assert_array_equal(expected_sizes, np.bincount(blocks.labels_))

    # --- Custom blocking function -----------------------------------------
    # Samples are represented by a column of their row indices.
    sample_ids = np.arange(len(X)).reshape((-1, 1))

    def _blocking(X_ids):
        # Split labels into the blocks {0, 1} and {2, 3}.
        return y[X_ids.ravel()] <= 1

    base = AgglomerativeClustering(n_clusters=2,
                                   linkage="complete",
                                   affinity=_distance)
    blocks = BlockClustering(blocking=_blocking, base_estimator=base)
    blocks.fit(sample_ids)

    assert_equal(len(blocks.clusterers_), 2)
    assert_array_equal(expected_sizes, np.bincount(blocks.labels_))
예제 #20
0
def test_shc_threshold():
    """Check how ``threshold`` and ``n_clusters`` interact on a fitted SHC.

    First part: with ``n_clusters=2`` set, changing the threshold must not
    change the labels.  Second part: with ``n_clusters`` cleared,
    thresholds taken from the fitted ``linkage_`` are expected to give 5
    and then 4 distinct labels.
    """
    distances, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(n_clusters=2,
                                        affinity="precomputed")

    # While n_clusters is set it wins over any threshold.
    before = model.fit_predict(distances)
    model.set_params(threshold=model.linkage_[-4, 2])
    after = model.labels_
    assert_array_equal(before, after)
    assert_equal(len(np.unique(before)), 2)

    # With n_clusters cleared, the threshold decides the labelling.
    model.set_params(n_clusters=None, threshold=model.linkage_[-5, 2])
    assert_equal(len(np.unique(model.labels_)), 5)

    model.set_params(threshold=model.linkage_[-4, 2])
    assert_equal(len(np.unique(model.labels_)), 4)
예제 #21
0
def test_shc_unsupervised_scoring_data_affinity():
    """Check unsupervised SHC when ``scoring_data="affinity"``.

    Runs once on a feature matrix with ``euclidean_distances`` as the
    affinity and once on a precomputed affinity matrix.  Both runs score
    with a silhouette score using ``metric="precomputed"`` and are
    expected to give four clusters of 25 samples.
    """
    # (generate_data affinity flag, affinity argument for the clusterer)
    setups = (
        (False, euclidean_distances),   # passing feature matrix
        (True, "precomputed"),          # passing affinity matrix
    )

    for as_affinity, affinity in setups:
        data, _ = generate_data(supervised=False, affinity=as_affinity)
        silhouette = partial(silhouette_score, metric="precomputed")
        model = ScipyHierarchicalClustering(affinity=affinity,
                                            scoring=silhouette,
                                            scoring_data="affinity")
        cluster_sizes = np.bincount(model.fit_predict(data))
        assert_array_equal([25, 25, 25, 25], cluster_sizes)
예제 #22
0
                    test_size=args.clustering_test_size,
                    random_state=args.clustering_random_state)

                y = -np.ones(len(X), dtype=np.int)
                y[train] = y_true[train]

            else:
                y = y_true

        else:
            y = None

        clusterer = BlockClustering(blocking=blocking,
                                    base_estimator=ScipyHierarchicalClustering(
                                        affinity=affinity,
                                        threshold=args.clustering_threshold,
                                        method=args.clustering_method,
                                        scoring=b3_f_score),
                                    verbose=args.verbose,
                                    n_jobs=args.n_jobs).fit(X, y)

        labels = clusterer.labels_

        # Save predicted clusters
        if args.output_clusters:
            clusters = {}

            for label in np.unique(labels):
                mask = (labels == label)
                clusters[label] = [r[0]["signature_id"] for r in X[mask]]
예제 #23
0
def test_shc_default_euclidean():
    """Check SHC with only ``n_clusters`` set, all else left at defaults.

    Data is generated with ``affinity=False``; four clusters of 25
    samples each are expected.
    """
    features, _ = generate_data(supervised=False, affinity=False)
    model = ScipyHierarchicalClustering(n_clusters=4)
    cluster_sizes = np.bincount(model.fit_predict(features))
    assert_array_equal([25, 25, 25, 25], cluster_sizes)
예제 #24
0
    return distances


if __name__ == "__main__":
    # Read the samples and their reference labels from the bundled archive.
    data = np.load("data/author-disambiguation.npz")
    X, truth = data["X"], data["y"]

    print("hello")

    # Cluster each block separately, cutting the hierarchy at a fixed
    # threshold of 0.5.
    base_estimator = ScipyHierarchicalClustering(threshold=0.5,
                                                 affinity=affinity,
                                                 method="complete")
    block_clusterer = BlockClustering(blocking=block_last_name_first_initial,
                                      base_estimator=base_estimator,
                                      verbose=3,
                                      n_jobs=-1)
    block_clusterer.fit(X)
    labels = block_clusterer.labels_

    # Show the distinct (name, affiliation) pairs found in every cluster.
    for cluster in np.unique(labels):
        members = {(name, affiliation)
                   for name, affiliation in X[labels == cluster]}
        print("Cluster #%d = %s" % (cluster, members))
    print()
예제 #25
0
def clustering(input_signatures,
               input_records,
               distance_model,
               input_clusters=None,
               output_clusters=None,
               verbose=1,
               n_jobs=-1,
               clustering_method="average",
               train_signatures_file=None,
               clustering_threshold=None,
               results_file=None,
               blocking_function="block_phonetic",
               blocking_threshold=1,
               blocking_phonetic_alg="nysiis"):
    """Cluster signatures using a pretrained distance model.

    The distance model is unpickled into the module-level global
    ``distance_estimator``, signatures are grouped into blocks and every
    block is clustered with ``ScipyHierarchicalClustering``.  Nothing is
    returned; results are written to ``output_clusters`` /
    ``results_file`` and, if ``verbose``, printed on stdout.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script. It is loaded with
        ``pickle``, so it must come from a trusted source.

    :param input_clusters: string
        Path to the file with known clusters. The file should be a dictionary,
        where keys are cluster labels and values are the `signature_id` of the
        signatures grouped in the clusters. Signatures assigned to the cluster
        with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the file with output cluster. The file will be filled with
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        Passed to ``BlockClustering``. If not zero and ``input_clusters`` is
        given, the function also outputs scores on stdout.

    :param n_jobs: int
        Parameter passed to ``BlockClustering``. Number of jobs to be used.

    :param clustering_method: string
        Passed to ``ScipyHierarchicalClustering`` as ``method``.

    :param train_signatures_file: str
        Path to the file with train set signatures. Format the same as in
        ``input_signatures``. Only used when ``input_clusters`` is given:
        known labels are then kept for these signatures only, and all other
        signatures form the test set.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    :param results_file: str
        Path to the file where the results will be output. It will give
        additional information about pairwise variant of scores. Only
        written when ``verbose``, ``input_clusters`` and
        ``train_signatures_file`` are all set.

    :param blocking_function: string
        must be a defined blocking function. Defined functions are:
        - "block_last_name_first_initial"
        - "block_phonetic"
        Any value other than "block_last_name_first_initial" selects
        phonetic blocking.

    :param blocking_threshold: int or None
        It determines the maximum allowed size of blocking on the last name
        It can only be:
        -   None; if the blocking function is block_last_name_first_initial
        -   int; if the blocking function is block_phonetic
            please check the documentation of phonetic blocking in
            beard.clustering.blocking_funcs.py

    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)
    """
    # Assumes that 'distance_estimator' lives in global, making things fast
    global distance_estimator
    # NOTE(security): unpickling can execute arbitrary code; only load
    # distance models that come from a trusted source.
    with open(distance_model, "rb") as model_file:
        distance_estimator = pickle.load(model_file)

    # Best effort: force n_jobs=1 on the last pipeline step.  Estimators
    # that have no ``steps`` or do not accept ``n_jobs`` are left untouched.
    try:
        distance_estimator.steps[-1][1].set_params(n_jobs=1)
    except Exception:
        pass

    signatures, records = load_signatures(input_signatures, input_records)

    # One row per signature, ordered by signature_id; ``indices`` maps a
    # signature_id to its row position in X (and thus in y / labels).
    indices = {}
    X = np.empty((len(signatures), 1), dtype=object)
    for i, signature in enumerate(
            sorted(signatures.values(), key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    if blocking_function == "block_last_name_first_initial":
        block_function = block_last_name_first_initial
    else:
        block_function = partial(block_phonetic,
                                 threshold=blocking_threshold,
                                 phonetic_algorithm=blocking_phonetic_alg)

    # Semi-supervised block clustering
    if input_clusters:
        with open(input_clusters, "r") as clusters_file:
            true_clusters = json.load(clusters_file)

        # -1 marks signatures without a known cluster.
        y_true = -np.ones(len(X), dtype=int)
        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        if train_signatures_file:
            with open(train_signatures_file, "r") as train_file:
                train_signatures = json.load(train_file)

            # Translate signature ids into row positions: y, y_true and
            # labels are indexed by position, not by signature id.
            train_ids = [indices[x['signature_id']]
                         for x in train_signatures]
            del train_signatures

            in_train = set(train_ids)
            test_ids = [i for i in range(len(X)) if i not in in_train]

            # Expose known labels for the train set only.
            y = -np.ones(len(X), dtype=int)
            y[train_ids] = y_true[train_ids]
        else:
            y = y_true

    else:
        y = None

    clusterer = BlockClustering(blocking=block_function,
                                base_estimator=ScipyHierarchicalClustering(
                                    affinity=_affinity,
                                    threshold=clustering_threshold,
                                    method=clustering_method,
                                    supervised_scoring=b3_f_score),
                                verbose=verbose,
                                n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        with open(output_clusters, "w") as output_file:
            json.dump(clusters, output_file)

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))

        b3_overall = b3_precision_recall_fscore(y_true, labels)
        print("B^3 F-score (overall) =", b3_overall[2])

        if train_signatures_file:
            b3_train = b3_precision_recall_fscore(y_true[train_ids],
                                                  labels[train_ids])
            b3_test = b3_precision_recall_fscore(y_true[test_ids],
                                                 labels[test_ids])
            print("B^3 F-score (train) =", b3_train[2])
            print("B^3 F-score (test) =", b3_test[2])
            if results_file:
                paired_overall = paired_precision_recall_fscore(y_true, labels)
                paired_train = paired_precision_recall_fscore(
                    y_true[train_ids], labels[train_ids])
                paired_test = paired_precision_recall_fscore(
                    y_true[test_ids], labels[test_ids])

                results = {
                    "description": ["precision", "recall", "f_score"],
                    "b3": {
                        "overall": list(b3_overall),
                        "train": list(b3_train),
                        "test": list(b3_test)
                    },
                    "paired": {
                        "overall": list(paired_overall),
                        "train": list(paired_train),
                        "test": list(paired_test)
                    }
                }
                with open(results_file, 'w') as results_out:
                    json.dump(results, results_out)