def test_shc_custom_affinity():
    """Check that SHC accepts a callable as its affinity."""
    data, _ = generate_data(supervised=False, affinity=False)
    model = ScipyHierarchicalClustering(affinity=euclidean_distances,
                                        n_clusters=4)
    cluster_sizes = np.bincount(model.fit_predict(data))
    # Four clusters of 25 samples each are expected.
    assert_array_equal([25, 25, 25, 25], cluster_sizes)
def hcluster(X, attrs):
    """Run hierarchical clustering on ``X`` and return its scatterplot.

    ``attrs`` supplies the settings: ``'kNumber'`` (converted to ``int`` and
    used as the number of clusters), ``'distance'`` (passed as ``method``)
    and ``'affinity'``. The fitted estimator is handed to
    ``save_clusterer`` and the function returns
    ``scatterplot(X, labels, n_clusters)``.
    """
    k = int(attrs['kNumber'])
    model = ScipyHierarchicalClustering(
        method=attrs['distance'],
        affinity=attrs['affinity'],
        n_clusters=k,
    )
    model.fit(X)
    assignments = model.labels_
    # NOTE: exporting the linkage as a nested {'children', 'name', 'value'}
    # tree (HClusterTree(model.linkage_).to_dict()) is currently disabled.
    save_clusterer(model)
    return scatterplot(X, assignments, k)
def hcluster(X, attrs):
    """Cluster ``X`` hierarchically and return the resulting scatterplot.

    Reads ``attrs['kNumber']`` (cast to ``int``, number of clusters),
    ``attrs['distance']`` (forwarded as ``method``) and
    ``attrs['affinity']``. The result is
    ``scatterplot(X, labels, n_clusters)``; the fitted estimator is not
    persisted in this variant.
    """
    cluster_count = int(attrs['kNumber'])
    estimator = ScipyHierarchicalClustering(
        method=attrs['distance'],
        affinity=attrs['affinity'],
        n_clusters=cluster_count,
    )
    estimator.fit(X)
    # NOTE: both the linkage-tree export (HClusterTree(...).to_dict()) and
    # the save_clusterer(...) call are currently disabled.
    return scatterplot(X, estimator.labels_, cluster_count)
def test_shc_precomputed_distance():
    """Check SHC with ``affinity="precomputed"``."""
    X_affinity, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(affinity="precomputed",
                                        n_clusters=4)
    predicted = model.fit_predict(X_affinity)
    assert_array_equal([25, 25, 25, 25], np.bincount(predicted))
def test_shc_unsupervised_scoring_data_raw():
    """Check unsupervised SHC when the scorer is given ``scoring_data='raw'``."""
    data, _ = generate_data(supervised=False, affinity=False)
    silhouette = partial(silhouette_score, metric="euclidean")
    model = ScipyHierarchicalClustering(
        affinity=euclidean_distances,
        scoring=silhouette,
        scoring_data="raw",
    )
    cluster_sizes = np.bincount(model.fit_predict(data))
    assert_array_equal([25, 25, 25, 25], cluster_sizes)
def test_shc_unsupervised_scoring_data_None():
    """Check unsupervised SHC when ``scoring_data`` is left unset."""
    data, _ = generate_data(supervised=False, affinity=False)

    def _constant_score(labels_pred):
        # Every candidate labelling gets the same (worst possible) score.
        return -np.inf

    model = ScipyHierarchicalClustering(affinity=euclidean_distances,
                                        scoring=_constant_score)
    predicted = model.fit_predict(data)
    # All 100 samples are expected to end up in a single cluster.
    assert_array_equal([100], np.bincount(predicted))
def test_shc_threshold():
    """Check that the SHC threshold can be changed after fitting."""
    X_affinity, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(affinity="precomputed",
                                        scoring_data="affinity",
                                        n_clusters=4)
    model.fit_predict(X_affinity)

    # Take the new threshold from the fourth-to-last row of the linkage.
    cut = model.linkage_[-4, 2]
    model.set_params(threshold=cut)

    assert_array_equal([25, 25, 25, 25], np.bincount(model.labels_))
def test_shc_n_clusters():
    """Check that ``n_clusters`` can be changed on a fitted SHC."""
    X_affinity, _ = generate_data(supervised=False, affinity=True)
    model = ScipyHierarchicalClustering(affinity="precomputed",
                                        n_clusters=4)

    initial = model.fit_predict(X_affinity)
    assert_equal(len(np.unique(initial)), 4)

    # Changing the parameter must be reflected in ``labels_`` without refit.
    model.set_params(n_clusters=10)
    updated = model.labels_
    assert_equal(len(np.unique(updated)), 10)
def test_shc_semi_supervised_scoring_data_none():
    """Check semi-supervised SHC when ``scoring_data`` is left unset."""
    data, targets = generate_data(supervised=True, affinity=False)

    def _b3(labels_true, labels_pred):
        return b3_f_score(labels_true, labels_pred)

    # All 4 clusters should be found.
    model = ScipyHierarchicalClustering(scoring=_b3)
    model.fit(data, targets)
    assert_array_equal([25, 25, 25, 25], np.bincount(model.labels_))
def test_shc_semi_supervised_scoring_data_raw():
    """Check semi-supervised SHC when ``scoring_data='raw'``."""
    X, targets = generate_data(supervised=True, affinity=False)

    def _b3_on_raw(X_raw, labels_true, labels_pred):
        # The scorer must receive data shaped like the input passed to fit.
        assert X_raw.shape == X.shape
        return b3_f_score(labels_true, labels_pred)

    model = ScipyHierarchicalClustering(scoring=_b3_on_raw,
                                        scoring_data="raw")
    model.fit(X, targets)
    assert_array_equal([25, 25, 25, 25], np.bincount(model.labels_))
def test_shc_semi_supervised_scoring_data_affinity():
    """Check semi-supervised SHC when ``scoring_data='affinity'``."""
    # Case 1: fit on the feature matrix, affinity computed by a callable.
    X1, y1 = generate_data(supervised=True, affinity=False)

    def _scoring1(X_affinity, labels_true, labels_pred):
        n_rows, n_cols = X_affinity.shape[0], X_affinity.shape[1]
        # The scorer must see a square matrix, not the raw features.
        assert n_rows == n_cols
        assert X_affinity.shape != X1.shape
        return b3_f_score(labels_true, labels_pred)

    from_features = ScipyHierarchicalClustering(
        scoring=_scoring1,
        scoring_data="affinity",
        affinity=euclidean_distances,
    )
    from_features.fit(X1, y1)
    assert_array_equal([25, 25, 25, 25],
                       np.bincount(from_features.labels_))

    # Case 2: fit directly on an affinity matrix.
    X2, y2 = generate_data(supervised=True, affinity=True)

    def _scoring2(X_affinity, labels_true, labels_pred):
        n_rows, n_cols = X_affinity.shape[0], X_affinity.shape[1]
        # The scorer must see the very matrix shape that was passed to fit.
        assert n_rows == n_cols
        assert X_affinity.shape == X2.shape
        return b3_f_score(labels_true, labels_pred)

    from_affinity = ScipyHierarchicalClustering(
        scoring=_scoring2,
        scoring_data="affinity",
        affinity="precomputed",
    )
    from_affinity.fit(X2, y2)
    assert_array_equal([25, 25, 25, 25],
                       np.bincount(from_affinity.labels_))
def test_onthefly_labels():
    """Check that BlockClustering labels follow changes to a sub-clusterer."""
    base = ScipyHierarchicalClustering(n_clusters=1, method="complete")
    blocked = BlockClustering(base_estimator=base)
    blocked.fit(X)
    assert_array_equal([100], np.bincount(blocked.labels_))

    # Re-parameterise the first fitted clusterer; ``labels_`` must reflect
    # the change without a new call to fit.
    first = blocked.clusterers_[0]
    first.set_params(n_clusters=4)
    assert_array_equal([25, 25, 25, 25], np.bincount(blocked.labels_))
def test_shc_validation():
    """Check that SHC rejects invalid hyper-parameters and input data."""
    X, _ = generate_data(supervised=False, affinity=False)

    # Each of these settings must make construction or fitting raise.
    invalid_settings = (
        {"n_clusters": len(X) + 1},
        {"n_clusters": -1},
        {"scoring_data": "affinity"},
    )
    for params in invalid_settings:
        with pytest.raises(ValueError):
            ScipyHierarchicalClustering(**params).fit_predict(X)
def fit(self, n_jobs=8):
    """Build a ``BlockClustering`` estimator and fit it on ``self.X``/``self.y``.

    ``n_jobs`` is forwarded to ``BlockClustering``. The fitted estimator is
    stored on ``self.clusterer``; nothing is returned.
    """
    blocking = self.block_function
    base = ScipyHierarchicalClustering(
        affinity=_affinity,
        threshold=self.clustering_threshold,
        method=self.clustering_method,
        supervised_scoring=b3_f_score,
    )
    self.clusterer = BlockClustering(blocking=blocking,
                                     base_estimator=base,
                                     n_jobs=n_jobs,
                                     verbose=True)
    self.clusterer.fit(self.X, self.y)
def test_shc_threshold():
    """Check how ``threshold`` interacts with ``n_clusters`` in SHC."""
    X_affinity, _ = generate_data(supervised=False, affinity=True)

    # While n_clusters is set it wins over any threshold.
    model = ScipyHierarchicalClustering(affinity="precomputed",
                                        n_clusters=2)
    before = model.fit_predict(X_affinity)
    model.set_params(threshold=model.linkage_[-4, 2])
    after = model.labels_
    assert_array_equal(before, after)
    assert_equal(len(np.unique(before)), 2)

    # With n_clusters cleared, the threshold drives the number of clusters.
    model.set_params(best_threshold_precedence=False)
    model.set_params(n_clusters=None, threshold=model.linkage_[-5, 2])
    assert_equal(len(np.unique(model.labels_)), 5)

    model.set_params(threshold=model.linkage_[-4, 2])
    assert_equal(len(np.unique(model.labels_)), 4)
def test_fit(n_jobs):
    """Check ``BlockClustering.fit`` under several blocking strategies."""
    balanced = [25, 25, 25, 25]

    # One block holding every sample.
    single = BlockClustering(
        blocking="single",
        base_estimator=AgglomerativeClustering(n_clusters=4,
                                               linkage="complete"),
        n_jobs=n_jobs,
    )
    single.fit(X)
    assert_equal(len(single.clusterers_), 1)
    assert_array_equal(balanced, np.bincount(single.labels_))

    # Blocks supplied by the caller.
    precomputed_blocks = BlockClustering(
        blocking="precomputed",
        base_estimator=AgglomerativeClustering(n_clusters=2,
                                               linkage="complete"),
        n_jobs=n_jobs,
    )
    precomputed_blocks.fit(X, blocks=(y <= 1))
    assert_equal(len(precomputed_blocks.clusterers_), 2)
    assert_array_equal(balanced, np.bincount(precomputed_blocks.labels_))

    # Caller-supplied blocks together with a precomputed affinity matrix.
    precomputed_affinity = BlockClustering(
        affinity="precomputed",
        blocking="precomputed",
        base_estimator=ScipyHierarchicalClustering(affinity="precomputed",
                                                   n_clusters=2,
                                                   method="complete"),
        n_jobs=n_jobs,
    )
    X_affinity = euclidean_distances(X)
    precomputed_affinity.fit(X_affinity, blocks=(y <= 1))
    assert_equal(len(precomputed_affinity.clusterers_), 2)
    assert_array_equal(balanced, np.bincount(precomputed_affinity.labels_))

    # Blocking computed by a user function over sample ids.
    sample_ids = np.arange(len(X)).reshape((-1, 1))

    def _blocking(X_ids):
        # block labels into {0,1} and {2,3}
        return y[X_ids.ravel()] <= 1

    custom = BlockClustering(
        blocking=_blocking,
        base_estimator=AgglomerativeClustering(n_clusters=2,
                                               linkage="complete",
                                               affinity=_distance),
    )
    custom.fit(sample_ids)
    assert_equal(len(custom.clusterers_), 2)
    assert_array_equal(balanced, np.bincount(custom.labels_))
def test_shc_threshold():
    """Check how ``threshold`` interacts with ``n_clusters`` in SHC."""
    X_affinity, _ = generate_data(supervised=False, affinity=True)

    # While n_clusters is set it wins over any threshold.
    model = ScipyHierarchicalClustering(affinity="precomputed",
                                        n_clusters=2)
    before = model.fit_predict(X_affinity)
    model.set_params(threshold=model.linkage_[-4, 2])
    after = model.labels_
    assert_array_equal(before, after)
    assert_equal(len(np.unique(before)), 2)

    # With n_clusters cleared, the threshold drives the number of clusters.
    model.set_params(n_clusters=None, threshold=model.linkage_[-5, 2])
    assert_equal(len(np.unique(model.labels_)), 5)

    model.set_params(threshold=model.linkage_[-4, 2])
    assert_equal(len(np.unique(model.labels_)), 4)
def test_shc_unsupervised_scoring_data_affinity():
    """Check unsupervised SHC when ``scoring_data='affinity'``."""
    # First fit on the feature matrix (affinity computed by a callable),
    # then fit directly on an affinity matrix.
    scenarios = (
        (False, euclidean_distances),
        (True, "precomputed"),
    )
    for as_affinity, affinity in scenarios:
        X, _ = generate_data(supervised=False, affinity=as_affinity)
        silhouette = partial(silhouette_score, metric="precomputed")
        model = ScipyHierarchicalClustering(affinity=affinity,
                                            scoring=silhouette,
                                            scoring_data="affinity")
        predicted = model.fit_predict(X)
        assert_array_equal([25, 25, 25, 25], np.bincount(predicted))
test_size=args.clustering_test_size, random_state=args.clustering_random_state) y = -np.ones(len(X), dtype=np.int) y[train] = y_true[train] else: y = y_true else: y = None clusterer = BlockClustering(blocking=blocking, base_estimator=ScipyHierarchicalClustering( affinity=affinity, threshold=args.clustering_threshold, method=args.clustering_method, scoring=b3_f_score), verbose=args.verbose, n_jobs=args.n_jobs).fit(X, y) labels = clusterer.labels_ # Save predicted clusters if args.output_clusters: clusters = {} for label in np.unique(labels): mask = (labels == label) clusters[label] = [r[0]["signature_id"] for r in X[mask]]
def test_shc_default_euclidean():
    """Check SHC with its default parameters (only ``n_clusters`` is set)."""
    data, _ = generate_data(supervised=False, affinity=False)
    model = ScipyHierarchicalClustering(n_clusters=4)
    cluster_sizes = np.bincount(model.fit_predict(data))
    assert_array_equal([25, 25, 25, 25], cluster_sizes)
return distances if __name__ == "__main__": # Load data data = np.load("data/author-disambiguation.npz") X = data["X"] truth = data["y"] print("hello") # Block clustering with fixed threshold block_clusterer = BlockClustering( blocking=block_last_name_first_initial, base_estimator=ScipyHierarchicalClustering(threshold=0.5, affinity=affinity, method="complete"), verbose=3, n_jobs=-1) block_clusterer.fit(X) labels = block_clusterer.labels_ # Print clusters for cluster in np.unique(labels): entries = set() for name, affiliation in X[labels == cluster]: entries.add((name, affiliation)) print("Cluster #%d = %s" % (cluster, entries)) print()
def clustering(input_signatures, input_records, distance_model,
               input_clusters=None, output_clusters=None, verbose=1,
               n_jobs=-1, clustering_method="average",
               train_signatures_file=None, clustering_threshold=None,
               results_file=None, blocking_function="block_phonetic",
               blocking_threshold=1, blocking_phonetic_alg="nysiis"):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script. It is unpickled, so it
        must come from a trusted source.

    :param input_clusters: string
        Path to the file with known clusters. The file should be a
        dictionary, where keys are cluster labels and values are the
        `signature_id` of the signatures grouped in the clusters. Signatures
        assigned to the cluster with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the file with output cluster. The file will be filled with
        clusters, using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, function will output scores on stdout. Also forwarded
        to ``BlockClustering``.

    :param n_jobs: int
        Number of parallel jobs, forwarded to ``BlockClustering``.

    :param clustering_method: string
        Passed as ``method`` to ``ScipyHierarchicalClustering``.

    :param train_signatures_file: str
        Path to the file with train set signatures. Format the same as in
        ``input_signatures``. Only used when ``input_clusters`` is given;
        ids that are not present in ``input_signatures`` are ignored.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    :param results_file: str
        Path to the file where the results will be output. It will give
        additional information about pairwise variant of scores. It is only
        written when ``verbose`` and ``input_clusters`` are set; the
        "train" and "test" entries are only present when
        ``train_signatures_file`` is given.

    :param blocking_function: string
        must be a defined blocking function. Defined functions are:
        - "block_last_name_first_initial"
        - "block_phonetic"
        Any value other than "block_last_name_first_initial" selects
        phonetic blocking.

    :param blocking_threshold: int or None
        It determines the maximum allowed size of blocking on the last name
        It can only be:
        - None; if the blocking function is block_last_name_first_initial
        - int; if the blocking function is block_phonetic
        please check the documentation of phonetic blocking in
        beard.clustering.blocking_funcs.py

    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        - "double_metaphone"
        - "nysiis" (only for Python 2)
        - "soundex" (only for Python 2)
    """
    # Assumes that 'distance_estimator' lives in global, making things fast
    global distance_estimator

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # distance models produced by a trusted source.
    with open(distance_model, "rb") as model_file:
        distance_estimator = pickle.load(model_file)

    try:
        # Best effort: keep the final pipeline step single-threaded, since
        # parallelism is handled by BlockClustering through ``n_jobs``.
        distance_estimator.steps[-1][1].set_params(n_jobs=1)
    except Exception:
        # Not every estimator is a pipeline whose last step takes n_jobs.
        pass

    signatures, records = load_signatures(input_signatures, input_records)

    # One row per signature, ordered by signature id; ``indices`` maps a
    # signature id to its row in X (and therefore in y / labels).
    indices = {}
    X = np.empty((len(signatures), 1), dtype=object)
    for i, signature in enumerate(
            sorted(signatures.values(), key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    if blocking_function == "block_last_name_first_initial":
        block_function = block_last_name_first_initial
    else:
        block_function = partial(block_phonetic,
                                 threshold=blocking_threshold,
                                 phonetic_algorithm=blocking_phonetic_alg)

    # Semi-supervised block clustering
    train_rows = []
    test_rows = []
    if input_clusters:
        with open(input_clusters, "r") as clusters_file:
            true_clusters = json.load(clusters_file)

        y_true = -np.ones(len(X), dtype=int)
        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        if train_signatures_file:
            with open(train_signatures_file, "r") as train_file:
                train_id_set = set(
                    s['signature_id'] for s in json.load(train_file))

            # Translate signature ids into row positions: y, y_true and
            # labels are indexed by row, not by signature id.
            for signature_id, row in indices.items():
                if signature_id in train_id_set:
                    train_rows.append(row)
                else:
                    test_rows.append(row)

            # Only the training rows expose their true label; -1 elsewhere.
            y = -np.ones(len(X), dtype=int)
            y[train_rows] = y_true[train_rows]
        else:
            y = y_true
    else:
        y = None

    clusterer = BlockClustering(
        blocking=block_function,
        base_estimator=ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=clustering_threshold,
            method=clustering_method,
            supervised_scoring=b3_f_score),
        verbose=verbose,
        n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}
        for label in np.unique(labels):
            mask = (labels == label)
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        with open(output_clusters, "w") as output_file:
            json.dump(clusters, output_file)

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))

        b3_overall = b3_precision_recall_fscore(y_true, labels)
        print("B^3 F-score (overall) =", b3_overall[2])

        if train_signatures_file:
            b3_train = b3_precision_recall_fscore(y_true[train_rows],
                                                  labels[train_rows])
            b3_test = b3_precision_recall_fscore(y_true[test_rows],
                                                 labels[test_rows])
            print("B^3 F-score (train) =", b3_train[2])
            print("B^3 F-score (test) =", b3_test[2])

        if results_file:
            paired_overall = paired_precision_recall_fscore(y_true, labels)
            results = {
                "description": ["precision", "recall", "f_score"],
                "b3": {"overall": list(b3_overall)},
                "paired": {"overall": list(paired_overall)},
            }

            # The train/test split only exists with a train signatures file.
            if train_signatures_file:
                paired_train = paired_precision_recall_fscore(
                    y_true[train_rows], labels[train_rows])
                paired_test = paired_precision_recall_fscore(
                    y_true[test_rows], labels[test_rows])
                results["b3"]["train"] = list(b3_train)
                results["b3"]["test"] = list(b3_test)
                results["paired"]["train"] = list(paired_train)
                results["paired"]["test"] = list(paired_test)

            with open(results_file, 'w') as results_output:
                json.dump(results, results_output)