def test_onthefly_labels():
    """Test that labels_ are recomputed on the fly after set_params."""
    clusterer = BlockClustering(
        base_estimator=ScipyHierarchicalClustering(n_clusters=1,
                                                   method="complete"))
    clusterer.fit(X)
    assert_array_equal([100], np.bincount(clusterer.labels_))

    clusterer.clusterers_[0].set_params(n_clusters=4)
    assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
class Clusterer(object):

    def __init__(self, estimator):
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            pass

        # threshold determines when to split blocks into smaller ones
        # adding first initial
        self.block_function = partial(block_phonetic,
                                      threshold=0,
                                      phonetic_algorithm='nysiis')
        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = 'average'

    def load_data(self, signatures_path, publications_path,
                  input_clusters_path):
        signatures_by_uuid = load_signatures(signatures_path,
                                             publications_path)

        self.X = np.empty((len(signatures_by_uuid), 1), dtype=np.object)
        self.y = -np.ones(len(self.X), dtype=np.int)

        i = 0
        with open(input_clusters_path, 'r') as fd:
            for line in fd:
                cluster = json.loads(line)
                for signature_uuid in cluster['signature_uuids']:
                    if signature_uuid not in signatures_by_uuid:
                        continue  # TODO figure out how this can happen
                    self.X[i, 0] = signatures_by_uuid[signature_uuid]
                    self.y[i] = cluster['cluster_id']
                    i += 1

    def load_model(self, input_filename):
        with open(input_filename, 'rb') as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        with open_file_in_folder(output_filename, 'wb') as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=ScipyHierarchicalClustering(
                affinity=_affinity,
                threshold=self.clustering_threshold,
                method=self.clustering_method,
                supervised_scoring=b3_f_score),
            n_jobs=n_jobs,
            verbose=True)
        self.clusterer.fit(self.X, self.y)
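# A minimal usage sketch for the file-based Clusterer above. It assumes an
# `estimator` object exposing a fitted `distance_estimator` attribute (as
# `__init__` expects); every path below is a hypothetical placeholder.
clusterer = Clusterer(estimator)
clusterer.load_data("signatures.json", "publications.json",
                    "input_clusters.jsonl")
clusterer.fit(n_jobs=8)
clusterer.save_model("clustering_model.pkl")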
def test_predict(): """Test predict.""" clusterer = BlockClustering(blocking="precomputed", base_estimator=MiniBatchKMeans(n_clusters=2)) clusterer.fit(X, blocks=(y <= 1)) pred = clusterer.predict(X, blocks=(y <= 1)) assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) pred = clusterer.predict(X, blocks=10 * np.ones(len(X))) assert_array_equal(-np.ones(len(X)), pred)
def test_partial_fit(): """Test partial_fit.""" blocks = (y <= 1) clusterer1 = BlockClustering(blocking="precomputed", base_estimator=MiniBatchKMeans(n_clusters=2)) clusterer1.partial_fit(X[y <= 1], blocks=blocks[y <= 1]) assert_equal(len(clusterer1.clusterers_), 1) clusterer1.partial_fit(X[y > 1], blocks=blocks[y > 1]) assert_equal(len(clusterer1.clusterers_), 2) clusterer2 = BlockClustering(blocking="precomputed", base_estimator=MiniBatchKMeans(n_clusters=2)) clusterer2.fit(X, blocks=blocks) c1 = clusterer1.predict(X, blocks=blocks) c2 = clusterer2.labels_ assert_equal(paired_f_score(c1, c2), 1.0)
def test_fit(n_jobs): """Test fit.""" # Single block clusterer = BlockClustering(blocking="single", base_estimator=AgglomerativeClustering( n_clusters=4, linkage="complete"), n_jobs=n_jobs) clusterer.fit(X) assert_equal(len(clusterer.clusterers_), 1) assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) # Precomputed blocks clusterer = BlockClustering(blocking="precomputed", base_estimator=AgglomerativeClustering( n_clusters=2, linkage="complete"), n_jobs=n_jobs) clusterer.fit(X, blocks=(y <= 1)) assert_equal(len(clusterer.clusterers_), 2) assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) # Precomputed affinity clusterer = BlockClustering(affinity="precomputed", blocking="precomputed", base_estimator=ScipyHierarchicalClustering( affinity="precomputed", n_clusters=2, method="complete"), n_jobs=n_jobs) X_affinity = euclidean_distances(X) clusterer.fit(X_affinity, blocks=(y <= 1)) assert_equal(len(clusterer.clusterers_), 2) assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) # Custom blocking function X_ids = np.arange(len(X)).reshape((-1, 1)) def _blocking(X_ids): return y[X_ids.ravel()] <= 1 # block labels into {0,1} and {2,3} clusterer = BlockClustering(blocking=_blocking, base_estimator=AgglomerativeClustering( n_clusters=2, linkage="complete", affinity=_distance)) clusterer.fit(X_ids) assert_equal(len(clusterer.clusterers_), 2) assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
def clustering(input_signatures, input_records, distance_model,
               input_clusters=None, output_clusters=None,
               verbose=1, n_jobs=-1, clustering_method="average",
               train_signatures_file=None, clustering_threshold=None,
               results_file=None, blocking_function="block_phonetic",
               blocking_threshold=1, blocking_phonetic_alg="nysiis"):
    """Cluster signatures using a pretrained distance model.

    Parameters
    ----------
    :param input_signatures: string
        Path to the file with signatures. The content should be a JSON array
        of dictionaries holding metadata about signatures.

        [{"signature_id": 0,
          "author_name": "Doe, John",
          "publication_id": 10, ...}, { ... }, ...]

    :param input_records: string
        Path to the file with records. The content should be a JSON array of
        dictionaries holding metadata about records.

        [{"publication_id": 0,
          "title": "Author disambiguation using Beard", ... }, { ... }, ...]

    :param distance_model: string
        Path to the file with the distance model. The file should be a pickle
        created using the ``distance.py`` script.

    :param input_clusters: string
        Path to the file with known clusters. The file should be a
        dictionary, where keys are cluster labels and values are the
        `signature_id` of the signatures grouped in the clusters. Signatures
        assigned to the cluster with label "-1" are not clustered.

        {"0": [0, 1, 3], "1": [2, 5], ...}

    :param output_clusters: string
        Path to the output file. The file will be filled with clusters,
        using the same format as ``input_clusters``.

    :param verbose: int
        If not zero, the function will output scores on stdout.

    :param n_jobs: int
        Parameter passed to joblib. Number of threads to be used.

    :param clustering_method: string
        Parameter passed to ``ScipyHierarchicalClustering``. Used only if
        ``clustering_test_size`` is specified.

    :param train_signatures_file: str
        Path to the file with train set signatures. Same format as
        ``input_signatures``.

    :param clustering_threshold: float
        Threshold passed to ``ScipyHierarchicalClustering``.

    :param results_file: str
        Path to the file where the results will be output. It will give
        additional information about the pairwise variant of the scores.

    :param blocking_function: string
        Must be a defined blocking function. Defined functions are:
        - "block_last_name_first_initial"
        - "block_phonetic"

    :param blocking_threshold: int or None
        Determines the maximum allowed size of blocking on the last name.
        It can only be:
        - None; if the blocking function is block_last_name_first_initial
        - int; if the blocking function is block_phonetic
        Please check the documentation of phonetic blocking in
        beard.clustering.blocking_funcs.py.

    :param blocking_phonetic_alg: string or None
        If not None, determines which phonetic algorithm is used. Options:
        - "double_metaphone"
        - "nysiis" (only for Python 2)
        - "soundex" (only for Python 2)
    """
    # Assumes that 'distance_estimator' lives in global, making things fast
    global distance_estimator
    distance_estimator = pickle.load(open(distance_model, "rb"))

    try:
        distance_estimator.steps[-1][1].set_params(n_jobs=1)
    except Exception:
        pass

    signatures, records = load_signatures(input_signatures, input_records)

    indices = {}
    X = np.empty((len(signatures), 1), dtype=np.object)
    for i, signature in enumerate(
            sorted(signatures.values(), key=lambda s: s["signature_id"])):
        X[i, 0] = signature
        indices[signature["signature_id"]] = i

    if blocking_function == "block_last_name_first_initial":
        block_function = block_last_name_first_initial
    else:
        block_function = partial(block_phonetic,
                                 threshold=blocking_threshold,
                                 phonetic_algorithm=blocking_phonetic_alg)

    # Semi-supervised block clustering
    if input_clusters:
        true_clusters = json.load(open(input_clusters, "r"))
        y_true = -np.ones(len(X), dtype=np.int)

        for label, signature_ids in true_clusters.items():
            for signature_id in signature_ids:
                y_true[indices[signature_id]] = label

        y = -np.ones(len(X), dtype=np.int)

        if train_signatures_file:
            train_signatures = json.load(open(train_signatures_file, "r"))
            train_ids = [x["signature_id"] for x in train_signatures]
            del train_signatures
            y[train_ids] = y_true[train_ids]
            test_ids = list(
                set([x["signature_id"] for _, x in signatures.items()]) -
                set(train_ids))
        else:
            y = y_true
    else:
        y = None

    clusterer = BlockClustering(
        blocking=block_function,
        base_estimator=ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=clustering_threshold,
            method=clustering_method,
            supervised_scoring=b3_f_score),
        verbose=verbose,
        n_jobs=n_jobs).fit(X, y)

    labels = clusterer.labels_

    # Save predicted clusters
    if output_clusters:
        clusters = {}

        for label in np.unique(labels):
            mask = (labels == label)
            clusters[str(label)] = [r[0]["signature_id"] for r in X[mask]]

        json.dump(clusters, open(output_clusters, "w"))

    # Statistics
    if verbose and input_clusters:
        print("Number of blocks =", len(clusterer.clusterers_))
        print("True number of clusters", len(np.unique(y_true)))
        print("Number of computed clusters", len(np.unique(labels)))

        b3_overall = b3_precision_recall_fscore(y_true, labels)
        print("B^3 F-score (overall) =", b3_overall[2])

        if train_signatures_file:
            b3_train = b3_precision_recall_fscore(y_true[train_ids],
                                                  labels[train_ids])
            b3_test = b3_precision_recall_fscore(y_true[test_ids],
                                                 labels[test_ids])
            print("B^3 F-score (train) =", b3_train[2])
            print("B^3 F-score (test) =", b3_test[2])
            if results_file:
                paired_overall = paired_precision_recall_fscore(y_true,
                                                                labels)
                paired_train = paired_precision_recall_fscore(
                    y_true[train_ids], labels[train_ids])
                paired_test = paired_precision_recall_fscore(
                    y_true[test_ids], labels[test_ids])

                json.dump({
                    "description": ["precision", "recall", "f_score"],
                    "b3": {"overall": list(b3_overall),
                           "train": list(b3_train),
                           "test": list(b3_test)},
                    "paired": {"overall": list(paired_overall),
                               "train": list(paired_train),
                               "test": list(paired_test)}
                }, open(results_file, "w"))
def test_single_signature(n_jobs):
    """Test clustering of a single signature."""
    import numbers

    clusterer = BlockClustering(base_estimator=MiniBatchKMeans(n_clusters=2))
    clusterer.fit(np.array([X[0]]))
    assert isinstance(clusterer.predict(X[0])[0], numbers.Integral)
            test_size=args.clustering_test_size,
            random_state=args.clustering_random_state)

        y = -np.ones(len(X), dtype=np.int)
        y[train] = y_true[train]
    else:
        y = y_true
else:
    y = None

clusterer = BlockClustering(
    blocking=blocking,
    base_estimator=ScipyHierarchicalClustering(
        affinity=affinity,
        threshold=args.clustering_threshold,
        method=args.clustering_method,
        scoring=b3_f_score),
    verbose=args.verbose,
    n_jobs=args.n_jobs).fit(X, y)

labels = clusterer.labels_

# Save predicted clusters
if args.output_clusters:
    clusters = {}

    for label in np.unique(labels):
        mask = (labels == label)
        clusters[label] = [r[0]["signature_id"] for r in X[mask]]

    json.dump(clusters, open(args.output_clusters, "w"))
def test_fit(n_jobs): """Test fit.""" # Single block clusterer = BlockClustering( blocking="single", base_estimator=AgglomerativeClustering(n_clusters=4, linkage="complete"), n_jobs=n_jobs) clusterer.fit(X) assert_equal(len(clusterer.clusterers_), 1) assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) # Precomputed blocks clusterer = BlockClustering( blocking="precomputed", base_estimator=AgglomerativeClustering(n_clusters=2, linkage="complete"), n_jobs=n_jobs) clusterer.fit(X, blocks=(y <= 1)) assert_equal(len(clusterer.clusterers_), 2) assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) # Precomputed affinity clusterer = BlockClustering( affinity="precomputed", blocking="precomputed", base_estimator=ScipyHierarchicalClustering(affinity="precomputed", n_clusters=2, method="complete"), n_jobs=n_jobs) X_affinity = euclidean_distances(X) clusterer.fit(X_affinity, blocks=(y <= 1)) assert_equal(len(clusterer.clusterers_), 2) assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_)) # Custom blocking function X_ids = np.arange(len(X)).reshape((-1, 1)) def _blocking(X_ids): return y[X_ids.ravel()] <= 1 # block labels into {0,1} and {2,3} clusterer = BlockClustering( blocking=_blocking, base_estimator=AgglomerativeClustering(n_clusters=2, linkage="complete", affinity=_distance)) clusterer.fit(X_ids) assert_equal(len(clusterer.clusterers_), 2) assert_array_equal([25, 25, 25, 25], np.bincount(clusterer.labels_))
def test_validation():
    """Test the validation of hyper-parameters and input data."""
    with pytest.raises(ValueError):
        clusterer = BlockClustering(
            blocking="foobar",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X)

    with pytest.raises(ValueError):
        clusterer = BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X)

    with pytest.raises(ValueError):
        clusterer = BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        clusterer.fit(X, blocks=(y <= 1))
        clusterer.predict(X)
            distances[i, j] = 0.5

    distances += distances.T

    return distances


if __name__ == "__main__":
    # Load data
    data = np.load("data/author-disambiguation.npz")
    X = data["X"]
    truth = data["y"]

    # Block clustering with fixed threshold
    block_clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(threshold=0.5,
                                                   affinity=affinity,
                                                   method="complete"),
        verbose=3,
        n_jobs=-1)
    block_clusterer.fit(X)
    labels = block_clusterer.labels_

    # Print clusters
    for cluster in np.unique(labels):
        entries = set()

        for name, affiliation in X[labels == cluster]:
            entries.add((name, affiliation))

        print("Cluster #%d = %s" % (cluster, entries))
        print()
class Clusterer(object):

    def __init__(self, estimator):
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            pass

        # threshold determines when to split blocks
        # into smaller ones adding first initial
        self.block_function = partial(block_phonetic,
                                      threshold=0,
                                      phonetic_algorithm="nysiis")
        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = "average"

    def load_data(self, signatures, input_clusters):
        """Load data into the estimator vectors.

        Args:
            signatures (iterable): Signatures which should be processed.
            input_clusters (iterable): Input clusters built for the provided
                signatures, see:
                `inspire_disambiguation.core.es.readers.get_input_clusters`.
        """
        signatures_by_uuid = load_signatures(signatures)

        self.X = np.empty((len(signatures_by_uuid), 1), dtype=np.object)
        self.y = -np.ones(len(self.X), dtype=np.int)

        i = 0
        for cluster in input_clusters:
            for signature_uuid in cluster["signature_uuids"]:
                if signature_uuid not in signatures_by_uuid:
                    continue  # TODO figure out how this can happen
                self.X[i, 0] = signatures_by_uuid[signature_uuid]
                self.y[i] = cluster["cluster_id"]
                i += 1

    def load_model(self, input_filename):
        """Load a model dumped by pickle.

        Args:
            input_filename (str): Path to the file with the dumped
                clustering model.
        """
        with open(input_filename, "rb") as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        """Dump the clusterer to a file.

        Args:
            output_filename (str): Path where the model will be dumped.
        """
        with open_file_in_folder(output_filename, "wb") as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        """Fit the estimator to the loaded data."""
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=ScipyHierarchicalClustering(
                affinity=_affinity,
                threshold=self.clustering_threshold,
                method=self.clustering_method,
                supervised_scoring=b3_f_score,
            ),
            n_jobs=n_jobs,
            verbose=True,
        )
        self.clusterer.fit(self.X, self.y)

    def prepare_test_data(self, test_uuids, labels):
        """Return the arrays used for scoring training and test datasets.

        Args:
            test_uuids (set): Signature uuids used for testing.
            labels (list): Labels (author ids) for the test dataset.
        """
        all_uuids = np.vectorize(lambda x: x.signature_uuid)(self.X).flatten()
        test_uuids_array = np.array(list(test_uuids))
        mask = np.isin(all_uuids, test_uuids_array)
        y_train = self.y[~mask]
        y_test = np.array(labels)
        labels_train = self.clusterer.labels_[~mask]
        labels_test = self.clusterer.labels_[mask]
        return labels_train, y_train, labels_test, y_test

    def nb_of_clusters_predicted_for_author(
            self, input_clusters_with_all_author_labels,
            test_signature_authors_ids):
        """Count the distinct predicted clusters per author."""
        author_ids = np.array([sample[0]["author_id"] for sample in self.X])
        # Elementwise comparison over an object array, so `==` (not `is`)
        # is intended here.
        author_ids[author_ids == None] = test_signature_authors_ids  # noqa: E711
        signatures_per_author = {
            cluster["author_id"]: set(cluster["signature_uuids"])
            for cluster in input_clusters_with_all_author_labels
        }
        nb_of_clusters_per_author = {}
        for author_id in signatures_per_author.keys():
            author_mask = author_ids == author_id
            signatures_predicted_in_one_cluster = \
                self.clusterer.labels_[author_mask]
            nb_of_clusters_per_author[author_id] = np.unique(
                signatures_predicted_in_one_cluster).size
        return nb_of_clusters_per_author

    def score(self, labels_train, y_train, labels_test, y_test):
        """Return the clustering statistics (b3 precision, b3 recall,
        b3 f1 score) for the whole dataset, the training set, and the
        test set.

        Args:
            labels_train: Array of labels predicted for the training set.
            y_train: Array of true labels for the training set.
            labels_test: Array of labels predicted for the test set.
            y_test: Array of true labels for the test set.
        """
        return (b3_precision_recall_fscore(self.y, self.clusterer.labels_),
                b3_precision_recall_fscore(y_train, labels_train)
                if labels_train.size != 0 else None,
                b3_precision_recall_fscore(y_test, labels_test)
                if labels_test.size != 0 else None)
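# A minimal end-to-end sketch for the class above. `estimator`,
# `signatures`, `input_clusters`, `test_uuids`, and `test_labels` are
# placeholders for objects produced elsewhere in the pipeline, and the
# output path is hypothetical.
clusterer = Clusterer(estimator)
clusterer.load_data(signatures, input_clusters)
clusterer.fit(n_jobs=8)
clusterer.save_model("clustering_model.pkl")

# Score against a held-out test split.
labels_train, y_train, labels_test, y_test = clusterer.prepare_test_data(
    test_uuids, test_labels)
overall, train, test = clusterer.score(labels_train, y_train,
                                       labels_test, y_test)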