Example #1
def test_onthefly_labels():
    clusterer = BlockClustering(base_estimator=ScipyHierarchicalClustering(
        n_clusters=1, method="complete"))
    clusterer.fit(X)
    assert_array_equal([100], np.bincount(clusterer.labels_))
    clusterer.clusterers_[0].set_params(n_clusters=4)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
Example #2
def test_onthefly_labels():
    clusterer = BlockClustering(
        base_estimator=ScipyHierarchicalClustering(n_clusters=1,
                                                   method="complete"))
    clusterer.fit(X)
    assert_array_equal([100], np.bincount(clusterer.labels_))
    clusterer.clusterers_[0].set_params(n_clusters=4)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
Example #3
class Clusterer(object):
    def __init__(self, estimator):
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            pass

        # threshold determines when to split blocks
        # into smaller ones by adding the first initial
        self.block_function = partial(block_phonetic,
                                      threshold=0,
                                      phonetic_algorithm='nysiis')

        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = 'average'

    def load_data(self, signatures_path, publications_path,
                  input_clusters_path):
        signatures_by_uuid = load_signatures(signatures_path,
                                             publications_path)

        self.X = np.empty((len(signatures_by_uuid), 1), dtype=object)
        self.y = -np.ones(len(self.X), dtype=int)

        i = 0
        with open(input_clusters_path, 'r') as fd:
            for line in fd:
                cluster = json.loads(line)
                for signature_uuid in cluster['signature_uuids']:
                    if signature_uuid not in signatures_by_uuid:
                        continue  # TODO figure out how this can happen
                    self.X[i, 0] = signatures_by_uuid[signature_uuid]
                    self.y[i] = cluster['cluster_id']
                    i += 1

    def load_model(self, input_filename):
        with open(input_filename, 'rb') as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        with open_file_in_folder(output_filename, 'wb') as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=ScipyHierarchicalClustering(
                affinity=_affinity,
                threshold=self.clustering_threshold,
                method=self.clustering_method,
                supervised_scoring=b3_f_score),
            n_jobs=n_jobs,
            verbose=True)
        self.clusterer.fit(self.X, self.y)
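
A minimal usage sketch for the Clusterer class above; `estimator` (an object exposing a fitted distance_estimator pipeline) and all file paths are hypothetical placeholders:

# Hypothetical usage; `estimator` and every path below are placeholders.
clusterer = Clusterer(estimator)
clusterer.load_data("signatures.json", "publications.json",
                    "input_clusters.jsonl")
clusterer.fit(n_jobs=4)
clusterer.save_model("clusterer.pkl")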
Example #4
def test_predict():
    """Test predict."""
    clusterer = BlockClustering(blocking="precomputed",
                                base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer.fit(X, blocks=(y <= 1))
    pred = clusterer.predict(X, blocks=(y <= 1))
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    pred = clusterer.predict(X, blocks=10 * np.ones(len(X)))
    assert_array_equal(-np.ones(len(X)), pred)
Example #5
    def fit(self, n_jobs=8):
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=ScipyHierarchicalClustering(
                affinity=_affinity,
                threshold=self.clustering_threshold,
                method=self.clustering_method,
                supervised_scoring=b3_f_score),
            n_jobs=n_jobs,
            verbose=True)
        self.clusterer.fit(self.X, self.y)
Example #6
def test_partial_fit():
    """Test partial_fit."""
    blocks = (y <= 1)

    clusterer1 = BlockClustering(blocking="precomputed",
                                 base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer1.partial_fit(X[y <= 1], blocks=blocks[y <= 1])
    assert_equal(len(clusterer1.clusterers_), 1)
    clusterer1.partial_fit(X[y > 1], blocks=blocks[y > 1])
    assert_equal(len(clusterer1.clusterers_), 2)

    clusterer2 = BlockClustering(blocking="precomputed",
                                 base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer2.fit(X, blocks=blocks)

    c1 = clusterer1.predict(X, blocks=blocks)
    c2 = clusterer2.labels_

    assert_equal(paired_f_score(c1, c2), 1.0)
Example #7
def test_predict():
    """Test predict."""
    clusterer = BlockClustering(blocking="precomputed",
                                base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer.fit(X, blocks=(y <= 1))
    pred = clusterer.predict(X, blocks=(y <= 1))
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    pred = clusterer.predict(X, blocks=10 * np.ones(len(X)))
    assert_array_equal(-np.ones(len(X)), pred)
Example #8
class Clusterer(object):
    def __init__(self, estimator):
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            pass

        # threshold determines when to split blocks
        # into smaller ones by adding the first initial
        self.block_function = partial(block_phonetic,
                                      threshold=0,
                                      phonetic_algorithm="nysiis")

        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = "average"

    def load_data(self, signatures, input_clusters):
        """Loads data to the estimator vectors

        Args:
            signatures (iterable): Signatures which should be processed
            input_clusters (iterable): Input clusters built for provided signatures
                see: `inspire_disambiguation.core.es.readers.get_input_clusters`

        """
        signatures_by_uuid = load_signatures(signatures)

        self.X = np.empty((len(signatures_by_uuid), 1), dtype=object)
        self.y = -np.ones(len(self.X), dtype=int)

        i = 0
        for cluster in input_clusters:
            for signature_uuid in cluster["signature_uuids"]:
                if signature_uuid not in signatures_by_uuid:
                    continue  # TODO figure out how this can happen
                self.X[i, 0] = signatures_by_uuid[signature_uuid]
                self.y[i] = cluster["cluster_id"]
                i += 1

    def load_model(self, input_filename):
        """Loads model dumped by pickle

        Args:
            input_filename (str): path to file with the dumped clusterer model.
        """
        with open(input_filename, "rb") as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        """Dump object to a file

        Args:
            output_filename (str): Path where model will be dumped
        """
        with open_file_in_folder(output_filename, "wb") as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        """Fit data using the estimator"""
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=ScipyHierarchicalClustering(
                affinity=_affinity,
                threshold=self.clustering_threshold,
                method=self.clustering_method,
                supervised_scoring=b3_f_score,
            ),
            n_jobs=n_jobs,
            verbose=True,
        )
        self.clusterer.fit(self.X, self.y)
Example #9
def test_partial_fit():
    """Test partial_fit."""
    blocks = (y <= 1)

    clusterer1 = BlockClustering(blocking="precomputed",
                                 base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer1.partial_fit(X[y <= 1], blocks=blocks[y <= 1])
    assert_equal(len(clusterer1.clusterers_), 1)
    clusterer1.partial_fit(X[y > 1], blocks=blocks[y > 1])
    assert_equal(len(clusterer1.clusterers_), 2)

    clusterer2 = BlockClustering(blocking="precomputed",
                                 base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer2.fit(X, blocks=blocks)

    c1 = clusterer1.predict(X, blocks=blocks)
    c2 = clusterer2.labels_

    assert_equal(paired_f_score(c1, c2), 1.0)
Example #10
def test_fit(n_jobs):
    """Test fit."""
    # Single block
    clusterer = BlockClustering(blocking="single",
                                base_estimator=AgglomerativeClustering(
                                    n_clusters=4, linkage="complete"),
                                n_jobs=n_jobs)
    clusterer.fit(X)

    assert_equal(len(clusterer.clusterers_), 1)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    # Precomputed blocks
    clusterer = BlockClustering(blocking="precomputed",
                                base_estimator=AgglomerativeClustering(
                                    n_clusters=2, linkage="complete"),
                                n_jobs=n_jobs)
    clusterer.fit(X, blocks=(y <= 1))

    assert_equal(len(clusterer.clusterers_), 2)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    # Precomputed affinity
    clusterer = BlockClustering(affinity="precomputed",
                                blocking="precomputed",
                                base_estimator=ScipyHierarchicalClustering(
                                    affinity="precomputed",
                                    n_clusters=2,
                                    method="complete"),
                                n_jobs=n_jobs)
    X_affinity = euclidean_distances(X)
    clusterer.fit(X_affinity, blocks=(y <= 1))

    assert_equal(len(clusterer.clusterers_), 2)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    # Custom blocking function
    X_ids = np.arange(len(X)).reshape((-1, 1))

    def _blocking(X_ids):
        return y[X_ids.ravel()] <= 1  # block labels into {0,1} and {2,3}

    clusterer = BlockClustering(blocking=_blocking,
                                base_estimator=AgglomerativeClustering(
                                    n_clusters=2,
                                    linkage="complete",
                                    affinity=_distance))
    clusterer.fit(X_ids)

    assert_equal(len(clusterer.clusterers_), 2)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
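
Note: the custom-blocking case above passes affinity=_distance, a helper these snippets do not define. A hedged sketch consistent with how scikit-learn uses a callable metric (it is called on pairs of rows, here one-element id rows indexing the module-level X) might be:

import numpy as np

def _distance(x, y):
    # Hypothetical sketch: x and y are one-element id rows; look up the
    # underlying points in the module-level X and compare them.
    return np.linalg.norm(X[int(x[0])] - X[int(y[0])])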
Example #11
def clustering(input_signatures,
               input_records,
               distance_model,
               input_clusters=None,
               output_clusters=None,
               verbose=1,
               n_jobs=-1,
               clustering_method="average",
               train_signatures_file=None,
               clustering_threshold=None,
               results_file=None,
               blocking_function="block_phonetic",
               blocking_threshold=1,
               blocking_phonetic_alg="nysiis"):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script.

    :param input_clusters: string
        Path to the file with known clusters. The file should be a dictionary,
        where keys are cluster labels and values are the `signature_id` of the
        signatures grouped in the clusters. Signatures assigned to the cluster
        with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the file with output clusters. The file will be filled with
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, the function will output scores on stdout.

    :param n_jobs: int
        Parameter passed to joblib. Number of threads to be used.

    :param clustering_method: string
        Linkage method passed to ``ScipyHierarchicalClustering``.

    :param train_signatures_file: str
        Path to the file with train set signatures. Format the same as in
        ``input_signatures``.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    :param results_file: str
        Path to the file where the results will be output. It will give
        additional information about pairwise variant of scores.

    :param blocking_function: string
        Must be the name of a defined blocking function. Available functions are:
        - "block_last_name_first_initial"
        - "block_phonetic"

    :param blocking_threshold: int or None
        Determines the maximum allowed size of a block on the last name.
        It can only be:
        -   None, if the blocking function is block_last_name_first_initial
        -   an int, if the blocking function is block_phonetic;
            please check the documentation of phonetic blocking in
            beard.clustering.blocking_funcs.py

    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)
    """
    # Assumes that 'distance_estimator' lives in the global namespace,
    # which makes things fast
    global distance_estimator
    with open(distance_model, "rb") as fd:
        distance_estimator = pickle.load(fd)

    try:
        distance_estimator.steps[-1][1].set_params(n_jobs=1)
    except Exception:
        pass

    signatures, records = load_signatures(input_signatures, input_records)

    indices = {}
    X = np.empty((len(signatures), 1), dtype=object)
    for i, signature in enumerate(
            sorted(signatures.values(), key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    if blocking_function == "block_last_name_first_initial":
        block_function = block_last_name_first_initial
    else:
        block_function = partial(block_phonetic,
                                 threshold=blocking_threshold,
                                 phonetic_algorithm=blocking_phonetic_alg)

    # Semi-supervised block clustering
    if input_clusters:
        with open(input_clusters, "r") as fd:
            true_clusters = json.load(fd)
        y_true = -np.ones(len(X), dtype=int)

        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        y = -np.ones(len(X), dtype=int)

        if train_signatures_file:
            with open(train_signatures_file, "r") as fd:
                train_signatures = json.load(fd)
            train_ids = [x['signature_id'] for x in train_signatures]
            del train_signatures
            y[train_ids] = y_true[train_ids]
            test_ids = list(
                set([x['signature_id']
                     for x in signatures.values()]) - set(train_ids))
        else:
            y = y_true

    else:
        y = None

    clusterer = BlockClustering(blocking=block_function,
                                base_estimator=ScipyHierarchicalClustering(
                                    affinity=_affinity,
                                    threshold=clustering_threshold,
                                    method=clustering_method,
                                    supervised_scoring=b3_f_score),
                                verbose=verbose,
                                n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        with open(output_clusters, "w") as fd:
            json.dump(clusters, fd)

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))

        b3_overall = b3_precision_recall_fscore(y_true, labels)
        print("B^3 F-score (overall) =", b3_overall[2])

        if train_signatures_file:
            b3_train = b3_precision_recall_fscore(y_true[train_ids],
                                                  labels[train_ids])
            b3_test = b3_precision_recall_fscore(y_true[test_ids],
                                                 labels[test_ids])
            print("B^3 F-score (train) =", b3_train[2])
            print("B^3 F-score (test) =", b3_test[2])
            if results_file:
                paired_overall = paired_precision_recall_fscore(y_true, labels)
                paired_train = paired_precision_recall_fscore(
                    y_true[train_ids], labels[train_ids])
                paired_test = paired_precision_recall_fscore(
                    y_true[test_ids], labels[test_ids])

                with open(results_file, 'w') as fd:
                    json.dump(
                        {
                            "description": ["precision", "recall", "f_score"],
                            "b3": {
                                "overall": list(b3_overall),
                                "train": list(b3_train),
                                "test": list(b3_test)
                            },
                            "paired": {
                                "overall": list(paired_overall),
                                "train": list(paired_train),
                                "test": list(paired_test)
                            }
                        }, fd)
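
A hedged invocation sketch for the clustering function above; every file name is a placeholder, and the threshold simply reuses the 0.709 value quoted elsewhere in these examples:

# Hypothetical invocation; all file names below are placeholders.
clustering(input_signatures="signatures.json",
           input_records="records.json",
           distance_model="distance_model.pkl",
           input_clusters="input_clusters.json",
           output_clusters="predicted_clusters.json",
           clustering_method="average",
           clustering_threshold=0.709,
           blocking_function="block_phonetic",
           blocking_threshold=1,
           blocking_phonetic_alg="nysiis")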
Example #12
def test_single_signature(n_jobs):
    """Test clustering of a  single signature."""
    import numbers
    clusterer = BlockClustering(base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer.fit(np.array([X[0]]))
    assert isinstance(clusterer.predict(X[0])[0], numbers.Integral)
Example #13
                    test_size=args.clustering_test_size,
                    random_state=args.clustering_random_state)

                y = -np.ones(len(X), dtype=int)
                y[train] = y_true[train]

            else:
                y = y_true

        else:
            y = None

        clusterer = BlockClustering(blocking=blocking,
                                    base_estimator=ScipyHierarchicalClustering(
                                        affinity=affinity,
                                        threshold=args.clustering_threshold,
                                        method=args.clustering_method,
                                        scoring=b3_f_score),
                                    verbose=args.verbose,
                                    n_jobs=args.n_jobs).fit(X, y)

        labels = clusterer.labels_

        # Save predicted clusters
        if args.output_clusters:
            clusters = {}

            for label in np.unique(labels):
                mask = (labels == label)
                clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

            with open(args.output_clusters, "w") as fd:
                json.dump(clusters, fd)
Example #14
def test_fit(n_jobs):
    """Test fit."""
    # Single block
    clusterer = BlockClustering(
        blocking="single",
        base_estimator=AgglomerativeClustering(n_clusters=4,
                                               linkage="complete"),
        n_jobs=n_jobs)
    clusterer.fit(X)

    assert_equal(len(clusterer.clusterers_), 1)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    # Precomputed blocks
    clusterer = BlockClustering(
        blocking="precomputed",
        base_estimator=AgglomerativeClustering(n_clusters=2,
                                               linkage="complete"),
        n_jobs=n_jobs)
    clusterer.fit(X, blocks=(y <= 1))

    assert_equal(len(clusterer.clusterers_), 2)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    # Precomputed affinity
    clusterer = BlockClustering(
        affinity="precomputed",
        blocking="precomputed",
        base_estimator=ScipyHierarchicalClustering(affinity="precomputed",
                                                   n_clusters=2,
                                                   method="complete"),
        n_jobs=n_jobs)
    X_affinity = euclidean_distances(X)
    clusterer.fit(X_affinity, blocks=(y <= 1))

    assert_equal(len(clusterer.clusterers_), 2)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))

    # Custom blocking function
    X_ids = np.arange(len(X)).reshape((-1, 1))

    def _blocking(X_ids):
        return y[X_ids.ravel()] <= 1  # block labels into {0,1} and {2,3}

    clusterer = BlockClustering(
        blocking=_blocking,
        base_estimator=AgglomerativeClustering(n_clusters=2,
                                               linkage="complete",
                                               affinity=_distance))
    clusterer.fit(X_ids)

    assert_equal(len(clusterer.clusterers_), 2)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
Example #15
def test_validation():
    """Test the validation of hyper-parameters and input data."""
    with pytest.raises(ValueError):
        clusterer = BlockClustering(
            blocking="foobar",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X)

    with pytest.raises(ValueError):
        clusterer = BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X)

    with pytest.raises(ValueError):
        clusterer = BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X, blocks=(y <= 1))
        clusterer.predict(X)
Example #16
            distances[i, j] = 0.5

    distances += distances.T
    return distances


if __name__ == "__main__":
    # Load data
    data = np.load("data/author-disambiguation.npz")
    X = data["X"]
    truth = data["y"]

    # Block clustering with fixed threshold
    block_clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(
            threshold=0.5,
            affinity=affinity,
            method="complete"),
        verbose=3,
        n_jobs=-1)
    block_clusterer.fit(X)
    labels = block_clusterer.labels_

    # Print clusters
    for cluster in np.unique(labels):
        entries = set()

        for name, affiliation in X[labels == cluster]:
            entries.add((name, affiliation))

        print("Cluster #%d = %s" % (cluster, entries))
    print()
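
Example #16 opens mid-function: its first four lines are the tail of the affinity callable handed to ScipyHierarchicalClustering. A hedged sketch of a complete function with the same contract (a square, symmetric matrix of pairwise distances over the rows of X) follows; the loop and the 0.5 placeholder score are assumptions:

import numpy as np

def affinity(X):
    # Hypothetical sketch: fill the upper triangle with pairwise scores,
    # then mirror it, matching the `distances += distances.T` tail above.
    n = len(X)
    distances = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            # Placeholder score; a real implementation would compare the
            # (name, affiliation) rows X[i] and X[j] with a learned model.
            distances[i, j] = 0.5
    distances += distances.T
    return distances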
Example #17
    return distances


if __name__ == "__main__":
    # Load data
    data = np.load("data/author-disambiguation.npz")
    X = data["X"]
    truth = data["y"]

    print("hello")

    # Block clustering with fixed threshold
    block_clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(threshold=0.5,
                                                   affinity=affinity,
                                                   method="complete"),
        verbose=3,
        n_jobs=-1)
    block_clusterer.fit(X)
    labels = block_clusterer.labels_

    # Print clusters
    for cluster in np.unique(labels):
        entries = set()

        for name, affiliation in X[labels == cluster]:
            entries.add((name, affiliation))

        print("Cluster #%d = %s" % (cluster, entries))
    print()
Example #18
def test_validation():
    """Test the validation of hyper-parameters and input data."""
    with pytest.raises(ValueError):
        clusterer = BlockClustering(
            blocking="foobar", base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X)

    with pytest.raises(ValueError):
        clusterer = BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X)

    with pytest.raises(ValueError):
        clusterer = BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X, blocks=(y <= 1))
        clusterer.predict(X)
Example #19
class Clusterer(object):
    def __init__(self, estimator):
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            pass

        # threshold determines when to split blocks
        # into smaller ones by adding the first initial
        self.block_function = partial(block_phonetic,
                                      threshold=0,
                                      phonetic_algorithm="nysiis")

        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = "average"

    def load_data(self, signatures, input_clusters):
        """Loads data to the estimator vectors

        Args:
            signatures (iterable): Signatures which should be processed
            input_clusters (iterable): Input clusters built for provided signatures
                see: `inspire_disambiguation.core.es.readers.get_input_clusters`

        """
        signatures_by_uuid = load_signatures(signatures)

        self.X = np.empty((len(signatures_by_uuid), 1), dtype=object)
        self.y = -np.ones(len(self.X), dtype=int)

        i = 0
        for cluster in input_clusters:
            for signature_uuid in cluster["signature_uuids"]:
                if signature_uuid not in signatures_by_uuid:
                    continue  # TODO figure out how this can happen
                self.X[i, 0] = signatures_by_uuid[signature_uuid]
                self.y[i] = cluster["cluster_id"]
                i += 1

    def load_model(self, input_filename):
        """Loads model dumped by pickle

        Args:
            input_filename (str): path to file with the dumped clusterer model.
        """
        with open(input_filename, "rb") as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        """Dump object to a file

        Args:
            output_filename (str): Path where model will be dumped
        """
        with open_file_in_folder(output_filename, "wb") as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        """Fit data using the estimator"""
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=ScipyHierarchicalClustering(
                affinity=_affinity,
                threshold=self.clustering_threshold,
                method=self.clustering_method,
                supervised_scoring=b3_f_score,
            ),
            n_jobs=n_jobs,
            verbose=True,
        )
        self.clusterer.fit(self.X, self.y)

    def prepare_test_data(self, test_uuids, labels):
        """
        Returns the arrays used for scoring training and test datasets

        Args:
            test_uuids (set): signature uuids used for testing
            labels (list): true labels (author ids) for the test dataset
        """
        all_uuids = np.vectorize(lambda x: x.signature_uuid)(self.X).flatten()
        test_uuids_array = np.array(list(test_uuids))
        mask = np.isin(all_uuids, test_uuids_array)
        y_train = self.y[~mask]
        y_test = np.array(labels)
        labels_train = self.clusterer.labels_[~mask]
        labels_test = self.clusterer.labels_[mask]
        return labels_train, y_train, labels_test, y_test

    def nb_of_clusters_predicted_for_author(
            self, input_clusters_with_all_author_labels,
            test_signature_authors_ids):
        """Count the number of distinct predicted clusters per author."""
        author_ids = np.array([sample[0]['author_id'] for sample in self.X])
        # Elementwise comparison is intentional: fill missing (None) author
        # ids with the provided test labels.
        author_ids[author_ids == None] = test_signature_authors_ids
        signatures_per_author = {
            cluster['author_id']: set(cluster['signature_uuids'])
            for cluster in input_clusters_with_all_author_labels
        }
        nb_of_clusters_per_author = {}
        for author_id in signatures_per_author.keys():
            author_mask = author_ids == author_id
            signatures_predicted_in_one_cluster = self.clusterer.labels_[
                author_mask]
            nb_of_clusters_per_author[author_id] = np.unique(
                signatures_predicted_in_one_cluster).size
        return nb_of_clusters_per_author

    def score(self, labels_train, y_train, labels_test, y_test):
        """
        Return the clustering statistics (b3 precision, b3 recall, b3 f1 score)
        and wrongly clustered samples for training, test and the whole dataset.

        Args:
            labels_train - array of labels predicted for the training set
            y_train - array of true labels for the training set
            labels_test - array of labels predicted for the test set
            y_test - array of true labels for the test set
        """
        return (b3_precision_recall_fscore(self.y, self.clusterer.labels_),
                b3_precision_recall_fscore(y_train, labels_train)
                if labels_train.size != 0 else None,
                b3_precision_recall_fscore(y_test, labels_test)
                if labels_test.size != 0 else None)
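
A hedged sketch chaining the evaluation methods above; test_uuids and test_labels stand in for data produced elsewhere in the pipeline:

# Hypothetical evaluation flow after fit(); both inputs are placeholders.
labels_train, y_train, labels_test, y_test = clusterer.prepare_test_data(
    test_uuids, test_labels)
overall_b3, train_b3, test_b3 = clusterer.score(
    labels_train, y_train, labels_test, y_test)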
Example #20
def test_single_signature(n_jobs):
    """Test clustering of a  single signature."""
    import numbers
    clusterer = BlockClustering(base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer.fit(np.array([X[0]]))
    assert isinstance(clusterer.predict(X[0])[0], numbers.Integral)
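
All of the tests above reference module-level fixtures X and y that the snippets omit. A minimal sketch consistent with the assertions (100 samples in four balanced clusters of 25) could be:

from sklearn.datasets import make_blobs

# Hypothetical fixture: 100 points in 4 balanced blobs, so that
# np.bincount(clusterer.labels_) can equal [25, 25, 25, 25].
X, y = make_blobs(n_samples=100, centers=4, random_state=42)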