def test_onthefly_labels():
    """Check that ``labels_`` follows parameter changes made after fit."""
    # Fit once, asking the base estimator for a single cluster.
    base = ScipyHierarchicalClustering(n_clusters=1, method="complete")
    model = BlockClustering(base_estimator=base)
    model.fit(X)
    counts = np.bincount(model.labels_)
    assert_array_equal([100], counts)

    # Change the parameter on the fitted per-block clusterer; ``labels_``
    # is read again without calling ``fit`` a second time.
    fitted = model.clusterers_[0]
    fitted.set_params(n_clusters=4)
    counts = np.bincount(model.labels_)
    assert_array_equal([25, 25, 25, 25], counts)
def test_onthefly_labels():
    """Labels must be recomputed when a fitted clusterer is re-parameterised.

    The model is fitted a single time; the second assertion only differs by
    a ``set_params`` call on the first entry of ``clusterers_``.
    """
    hierarchical = ScipyHierarchicalClustering(n_clusters=1,
                                               method="complete")
    block_model = BlockClustering(base_estimator=hierarchical)
    block_model.fit(X)

    # One cluster requested: every sample shares the same label.
    sizes_before = np.bincount(block_model.labels_)
    assert_array_equal([100], sizes_before)

    # Four clusters requested on the already fitted clusterer.
    block_model.clusterers_[0].set_params(n_clusters=4)
    sizes_after = np.bincount(block_model.labels_)
    assert_array_equal([25, 25, 25, 25], sizes_after)
class Clusterer(object):
    """Wrap a ``BlockClustering`` model: load data, fit and (un)pickle it."""

    def __init__(self, estimator):
        """Configure the blocking function and clustering settings.

        Args:
            estimator: object whose ``distance_estimator`` attribute is
                published as the module-level global ``distance_estimator``.
        """
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            # Best effort: force the last entry of ``steps`` to a single job.
            # NOTE(review): presumably avoids nested parallelism with
            # ``fit(n_jobs=...)`` -- confirm.
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            # Deliberately non-fatal: the estimator may have no ``steps``
            # or its last step may not accept ``n_jobs``.
            pass

        # threshold determines when to split blocks into smaller ones
        # adding first initial
        self.block_function = partial(block_phonetic, threshold=0,
                                      phonetic_algorithm='nysiis')

        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = 'average'

    def load_data(self, signatures_path, publications_path,
                  input_clusters_path):
        """Populate ``self.X`` and ``self.y`` from files.

        Args:
            signatures_path: forwarded to ``load_signatures``.
            publications_path: forwarded to ``load_signatures``.
            input_clusters_path (str): text file with one JSON object per
                line, each holding ``signature_uuids`` and ``cluster_id``.
        """
        signatures_by_uuid = load_signatures(signatures_path,
                                             publications_path)
        n_signatures = len(signatures_by_uuid)

        # Builtin ``object`` / ``int`` instead of ``np.object`` / ``np.int``:
        # those aliases were removed in NumPy 1.24.
        # Rows that never get filled below keep X = None and y = -1.
        self.X = np.empty((n_signatures, 1), dtype=object)
        self.y = -np.ones(n_signatures, dtype=int)

        i = 0
        with open(input_clusters_path, 'r') as fd:
            for line in fd:
                cluster = json.loads(line)
                for signature_uuid in cluster['signature_uuids']:
                    if signature_uuid not in signatures_by_uuid:
                        continue  # TODO figure out how this can happen
                    self.X[i, 0] = signatures_by_uuid[signature_uuid]
                    self.y[i] = cluster['cluster_id']
                    i += 1

    def load_model(self, input_filename):
        """Load a pickled clusterer into ``self.clusterer``.

        Args:
            input_filename (str): path to the pickle file.
        """
        # NOTE: unpickling can execute arbitrary code; only load files that
        # come from a trusted source.
        with open(input_filename, 'rb') as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        """Pickle ``self.clusterer`` to ``output_filename``.

        Args:
            output_filename (str): destination path, opened through
                ``open_file_in_folder``.
        """
        with open_file_in_folder(output_filename, 'wb') as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        """Build a ``BlockClustering`` and fit it on ``self.X`` / ``self.y``.

        ``load_data`` must have been called first since this reads
        ``self.X`` and ``self.y``.

        Args:
            n_jobs (int): forwarded to ``BlockClustering``.
        """
        base_estimator = ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=self.clustering_threshold,
            method=self.clustering_method,
            supervised_scoring=b3_f_score)
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=base_estimator,
            n_jobs=n_jobs,
            verbose=True)
        self.clusterer.fit(self.X, self.y)
def test_predict():
    """Test predict on the fitted blocks and on a block never seen in fit."""
    known_blocks = (y <= 1)
    model = BlockClustering(blocking="precomputed",
                            base_estimator=MiniBatchKMeans(n_clusters=2))
    model.fit(X, blocks=known_blocks)

    # Predict on the same blocks; the assertion below inspects ``labels_``.
    predicted = model.predict(X, blocks=known_blocks)
    label_counts = np.bincount(model.labels_)
    assert_array_equal([25, 25, 25, 25], label_counts)

    # Block id 10 was not part of fit: every prediction must be -1.
    n_samples = len(X)
    predicted = model.predict(X, blocks=np.full(n_samples, 10.0))
    assert_array_equal(np.full(n_samples, -1.0), predicted)
def test_partial_fit():
    """Test that two partial_fit calls match a single fit on all the data."""
    blocks = (y <= 1)
    first_half = y <= 1
    second_half = y > 1

    # Incremental model: one block per partial_fit call.
    incremental = BlockClustering(
        blocking="precomputed",
        base_estimator=MiniBatchKMeans(n_clusters=2))
    incremental.partial_fit(X[first_half], blocks=blocks[first_half])
    assert_equal(len(incremental.clusterers_), 1)
    incremental.partial_fit(X[second_half], blocks=blocks[second_half])
    assert_equal(len(incremental.clusterers_), 2)

    # Reference model fitted on everything at once.
    reference = BlockClustering(
        blocking="precomputed",
        base_estimator=MiniBatchKMeans(n_clusters=2))
    reference.fit(X, blocks=blocks)

    incremental_labels = incremental.predict(X, blocks=blocks)
    reference_labels = reference.labels_
    agreement = paired_f_score(incremental_labels, reference_labels)
    assert_equal(agreement, 1.0)
def test_fit(n_jobs):
    """Test fit with each supported way of defining blocks."""
    balanced = [25, 25, 25, 25]
    two_blocks = (y <= 1)

    # Single block
    estimator = AgglomerativeClustering(n_clusters=4, linkage="complete")
    model = BlockClustering(blocking="single",
                            base_estimator=estimator,
                            n_jobs=n_jobs)
    model.fit(X)
    assert_equal(len(model.clusterers_), 1)
    assert_array_equal(balanced, np.bincount(model.labels_))

    # Precomputed blocks
    estimator = AgglomerativeClustering(n_clusters=2, linkage="complete")
    model = BlockClustering(blocking="precomputed",
                            base_estimator=estimator,
                            n_jobs=n_jobs)
    model.fit(X, blocks=two_blocks)
    assert_equal(len(model.clusterers_), 2)
    assert_array_equal(balanced, np.bincount(model.labels_))

    # Precomputed affinity
    estimator = ScipyHierarchicalClustering(affinity="precomputed",
                                            n_clusters=2,
                                            method="complete")
    model = BlockClustering(affinity="precomputed",
                            blocking="precomputed",
                            base_estimator=estimator,
                            n_jobs=n_jobs)
    distances = euclidean_distances(X)
    model.fit(distances, blocks=two_blocks)
    assert_equal(len(model.clusterers_), 2)
    assert_array_equal(balanced, np.bincount(model.labels_))

    # Custom blocking function (``n_jobs`` is not passed in this case)
    sample_ids = np.arange(len(X)).reshape((-1, 1))

    def _blocking(X_ids):
        # block labels into {0,1} and {2,3}
        return y[X_ids.ravel()] <= 1

    estimator = AgglomerativeClustering(n_clusters=2,
                                        linkage="complete",
                                        affinity=_distance)
    model = BlockClustering(blocking=_blocking, base_estimator=estimator)
    model.fit(sample_ids)
    assert_equal(len(model.clusterers_), 2)
    assert_array_equal(balanced, np.bincount(model.labels_))
def test_fit(n_jobs):
    """Test fit for single, precomputed and callable blocking."""

    def _check(fitted, n_blocks):
        # Shared expectations: one clusterer per block, four equal clusters.
        assert_equal(len(fitted.clusterers_), n_blocks)
        assert_array_equal([25, 25, 25, 25], np.bincount(fitted.labels_))

    # Single block
    single = BlockClustering(
        blocking="single",
        base_estimator=AgglomerativeClustering(n_clusters=4,
                                               linkage="complete"),
        n_jobs=n_jobs)
    single.fit(X)
    _check(single, 1)

    # Precomputed blocks
    precomputed = BlockClustering(
        blocking="precomputed",
        base_estimator=AgglomerativeClustering(n_clusters=2,
                                               linkage="complete"),
        n_jobs=n_jobs)
    precomputed.fit(X, blocks=(y <= 1))
    _check(precomputed, 2)

    # Precomputed affinity
    with_affinity = BlockClustering(
        affinity="precomputed",
        blocking="precomputed",
        base_estimator=ScipyHierarchicalClustering(affinity="precomputed",
                                                   n_clusters=2,
                                                   method="complete"),
        n_jobs=n_jobs)
    X_affinity = euclidean_distances(X)
    with_affinity.fit(X_affinity, blocks=(y <= 1))
    _check(with_affinity, 2)

    # Custom blocking function
    row_ids = np.arange(len(X)).reshape((-1, 1))

    def _blocking(X_ids):
        flat_ids = X_ids.ravel()
        return y[flat_ids] <= 1  # block labels into {0,1} and {2,3}

    custom = BlockClustering(
        blocking=_blocking,
        base_estimator=AgglomerativeClustering(n_clusters=2,
                                               linkage="complete",
                                               affinity=_distance))
    custom.fit(row_ids)
    _check(custom, 2)
def test_validation():
    """Test the validation of hyper-parameters and input data."""
    # "foobar" is not a valid blocking value; "precomputed" is valid but
    # ``fit`` is called without ``blocks``. Both must raise ValueError.
    for blocking in ("foobar", "precomputed"):
        with pytest.raises(ValueError):
            model = BlockClustering(
                blocking=blocking,
                base_estimator=MiniBatchKMeans(n_clusters=2))
            model.fit(X)

    # Blocks are given to ``fit`` but omitted from ``predict``.
    with pytest.raises(ValueError):
        model = BlockClustering(
            blocking="precomputed",
            base_estimator=MiniBatchKMeans(n_clusters=2))
        model.fit(X, blocks=(y <= 1))
        model.predict(X)
if __name__ == "__main__":
    # Load data. ``np.load`` on an .npz returns an archive object that holds
    # an open file; the ``with`` block closes it once both arrays are read.
    with np.load("data/author-disambiguation.npz") as data:
        X = data["X"]
        truth = data["y"]

    # Block clustering with fixed threshold
    block_clusterer = BlockClustering(
        blocking=block_last_name_first_initial,
        base_estimator=ScipyHierarchicalClustering(
            threshold=0.5,
            affinity=affinity,
            method="complete"),
        verbose=3,
        n_jobs=-1)
    block_clusterer.fit(X)
    labels = block_clusterer.labels_

    # Print clusters: one line per label with its distinct
    # (name, affiliation) pairs.
    for cluster in np.unique(labels):
        entries = {(name, affiliation)
                   for name, affiliation in X[labels == cluster]}
        print("Cluster #%d = %s" % (cluster, entries))
    print()

    # Statistics
    print("Number of blocks =", len(block_clusterer.clusterers_))
    print("True number of clusters", len(np.unique(truth)))
class Clusterer(object):
    """Wrap a ``BlockClustering`` model: load data, fit and (un)pickle it."""

    def __init__(self, estimator):
        """Configure the blocking function and clustering settings.

        Args:
            estimator: object whose ``distance_estimator`` attribute is
                published as the module-level global ``distance_estimator``.
        """
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            # Best effort: force the last entry of ``steps`` to a single job.
            # NOTE(review): presumably avoids nested parallelism with
            # ``fit(n_jobs=...)`` -- confirm.
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            # Deliberately non-fatal: the estimator may have no ``steps``
            # or its last step may not accept ``n_jobs``.
            pass

        # threshold determines when to split blocks
        # into smaller ones adding first initial
        self.block_function = partial(
            block_phonetic, threshold=0, phonetic_algorithm="nysiis"
        )

        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = "average"

    def load_data(self, signatures, input_clusters):
        """Loads data to the estimator vectors

        Args:
            signatures (iterable): Signatures which should be processed
            input_clusters (iterable): Input clusters built for provided
                signatures; each item is indexed by ``signature_uuids`` and
                ``cluster_id``.
                see: `inspire_disambiguation.core.es.readers.get_input_clusters`
        """
        signatures_by_uuid = load_signatures(signatures)
        n_signatures = len(signatures_by_uuid)

        # Builtin ``object`` / ``int`` instead of ``np.object`` / ``np.int``:
        # those aliases were removed in NumPy 1.24.
        # Rows that never get filled below keep X = None and y = -1.
        self.X = np.empty((n_signatures, 1), dtype=object)
        self.y = -np.ones(n_signatures, dtype=int)

        i = 0
        for cluster in input_clusters:
            for signature_uuid in cluster["signature_uuids"]:
                if signature_uuid not in signatures_by_uuid:
                    continue  # TODO figure out how this can happen
                self.X[i, 0] = signatures_by_uuid[signature_uuid]
                self.y[i] = cluster["cluster_id"]
                i += 1

    def load_model(self, input_filename):
        """Loads model dumped by pickle

        Args:
            input_filename (str): path to file with the dumped clusterer.
        """
        # NOTE: unpickling can execute arbitrary code; only load files that
        # come from a trusted source.
        with open(input_filename, "rb") as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        """Dump object to a file

        Args:
            output_filename (str): Path where model will be dumped
        """
        with open_file_in_folder(output_filename, "wb") as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        """Fit data using the estimator

        Builds a fresh ``BlockClustering`` and fits it on ``self.X`` and
        ``self.y``, so ``load_data`` must have been called first.

        Args:
            n_jobs (int): forwarded to ``BlockClustering``.
        """
        base_estimator = ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=self.clustering_threshold,
            method=self.clustering_method,
            supervised_scoring=b3_f_score,
        )
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=base_estimator,
            n_jobs=n_jobs,
            verbose=True,
        )
        self.clusterer.fit(self.X, self.y)
def test_single_signature(n_jobs):
    """Test clustering of a single signature."""
    from numbers import Integral

    # Fit on a one-row dataset, then predict for that same row.
    sample = X[0]
    model = BlockClustering(base_estimator=MiniBatchKMeans(n_clusters=2))
    model.fit(np.array([sample]))

    first_label = model.predict(sample)[0]
    assert isinstance(first_label, Integral)
# Load data. ``np.load`` on an .npz returns an archive object that holds an
# open file; the ``with`` block closes it once both arrays are read.
with np.load("data/author-disambiguation.npz") as data:
    X = data["X"]
    truth = data["y"]

# Block clustering with fixed threshold
block_clusterer = BlockClustering(
    blocking=block_last_name_first_initial,
    base_estimator=ScipyHierarchicalClustering(
        threshold=0.5,
        affinity=affinity,
        method="complete"),
    verbose=3,
    n_jobs=-1)
block_clusterer.fit(X)
labels = block_clusterer.labels_

# Print clusters: one line per label with its distinct
# (name, affiliation) pairs.
for cluster in np.unique(labels):
    entries = {(name, affiliation)
               for name, affiliation in X[labels == cluster]}
    print("Cluster #%d = %s" % (cluster, entries))
print()

# Statistics
print("Number of blocks =", len(block_clusterer.clusterers_))
print("True number of clusters", len(np.unique(truth)))
class Clusterer(object):
    """Wrap a ``BlockClustering`` model: load data, fit, persist and score."""

    def __init__(self, estimator):
        """Configure the blocking function and clustering settings.

        Args:
            estimator: object whose ``distance_estimator`` attribute is
                published as the module-level global ``distance_estimator``.
        """
        # TODO get rid of this global
        global distance_estimator
        distance_estimator = estimator.distance_estimator
        try:
            # Best effort: force the last entry of ``steps`` to a single job.
            # NOTE(review): presumably avoids nested parallelism with
            # ``fit(n_jobs=...)`` -- confirm.
            distance_estimator.steps[-1][1].set_params(n_jobs=1)
        except Exception:
            # Deliberately non-fatal: the estimator may have no ``steps``
            # or its last step may not accept ``n_jobs``.
            pass

        # threshold determines when to split blocks
        # into smaller ones adding first initial
        self.block_function = partial(
            block_phonetic, threshold=0, phonetic_algorithm="nysiis"
        )

        self.clustering_threshold = 0.709  # magic value taken from BEARD example
        self.clustering_method = "average"

    def load_data(self, signatures, input_clusters):
        """Loads data to the estimator vectors

        Args:
            signatures (iterable): Signatures which should be processed
            input_clusters (iterable): Input clusters built for provided
                signatures; each item is indexed by ``signature_uuids`` and
                ``cluster_id``.
                see: `inspire_disambiguation.core.es.readers.get_input_clusters`
        """
        signatures_by_uuid = load_signatures(signatures)
        n_signatures = len(signatures_by_uuid)

        # Builtin ``object`` / ``int`` instead of ``np.object`` / ``np.int``:
        # those aliases were removed in NumPy 1.24.
        # Rows that never get filled below keep X = None and y = -1.
        self.X = np.empty((n_signatures, 1), dtype=object)
        self.y = -np.ones(n_signatures, dtype=int)

        i = 0
        for cluster in input_clusters:
            for signature_uuid in cluster["signature_uuids"]:
                if signature_uuid not in signatures_by_uuid:
                    continue  # TODO figure out how this can happen
                self.X[i, 0] = signatures_by_uuid[signature_uuid]
                self.y[i] = cluster["cluster_id"]
                i += 1

    def load_model(self, input_filename):
        """Loads model dumped by pickle

        Args:
            input_filename (str): path to file with the dumped clusterer.
        """
        # NOTE: unpickling can execute arbitrary code; only load files that
        # come from a trusted source.
        with open(input_filename, "rb") as fd:
            self.clusterer = pickle.load(fd)

    def save_model(self, output_filename):
        """Dump object to a file

        Args:
            output_filename (str): Path where model will be dumped
        """
        with open_file_in_folder(output_filename, "wb") as fd:
            pickle.dump(self.clusterer, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self, n_jobs=8):
        """Fit data using the estimator

        Builds a fresh ``BlockClustering`` and fits it on ``self.X`` and
        ``self.y``, so ``load_data`` must have been called first.

        Args:
            n_jobs (int): forwarded to ``BlockClustering``.
        """
        base_estimator = ScipyHierarchicalClustering(
            affinity=_affinity,
            threshold=self.clustering_threshold,
            method=self.clustering_method,
            supervised_scoring=b3_f_score,
        )
        self.clusterer = BlockClustering(
            blocking=self.block_function,
            base_estimator=base_estimator,
            n_jobs=n_jobs,
            verbose=True,
        )
        self.clusterer.fit(self.X, self.y)

    def prepare_test_data(self, test_uuids, labels):
        """Returns the arrays used for scoring training and test datasets

        Args:
            test_uuids - set of signatures uuids used for testing
            labels - list of labels (author id) for test dataset

        Returns:
            Tuple ``(labels_train, y_train, labels_test, y_test)``. The
            predicted labels and ``y_train`` are slices of
            ``self.clusterer.labels_`` / ``self.y``; ``y_test`` is simply
            ``labels`` converted to an array.
        """
        # A plain comprehension also works when ``self.X`` is empty, where
        # ``np.vectorize`` cannot infer an output type.
        all_uuids = np.array(
            [signature.signature_uuid for signature in self.X.ravel()]
        )
        test_uuids_array = np.array(list(test_uuids))
        mask = np.isin(all_uuids, test_uuids_array)

        predicted = self.clusterer.labels_
        labels_train = predicted[~mask]
        labels_test = predicted[mask]
        y_train = self.y[~mask]
        y_test = np.array(labels)
        return labels_train, y_train, labels_test, y_test

    def nb_of_clusters_predicted_for_author(
        self, input_clusters_with_all_author_labels, test_signature_authors_ids
    ):
        """Count the distinct predicted clusters for every known author.

        Args:
            input_clusters_with_all_author_labels (iterable): clusters, each
                indexed by ``author_id``.
            test_signature_authors_ids: author ids assigned, in order, to
                the samples of ``self.X`` whose ``author_id`` is ``None``.

        Returns:
            dict: author id -> number of distinct values of
            ``self.clusterer.labels_`` among that author's samples.
        """
        author_ids = np.array(
            [signature["author_id"] for signature in self.X.ravel()]
        )
        # Explicit identity test per element instead of ``array == None``.
        missing = np.array([author is None for author in author_ids],
                           dtype=bool)
        author_ids[missing] = test_signature_authors_ids

        # Only the author ids are needed; ``dict.fromkeys`` keeps
        # first-seen order and drops duplicates.
        known_authors = dict.fromkeys(
            cluster["author_id"]
            for cluster in input_clusters_with_all_author_labels
        )

        predicted = self.clusterer.labels_
        nb_of_clusters_per_author = {}
        for author_id in known_authors:
            author_mask = author_ids == author_id
            nb_of_clusters_per_author[author_id] = np.unique(
                predicted[author_mask]
            ).size
        return nb_of_clusters_per_author

    def score(self, labels_train, y_train, labels_test, y_test):
        """Score the whole dataset, the training set and the test set.

        Every entry is the return value of ``b3_precision_recall_fscore``
        (per its name, B3 precision / recall / F-score).

        Args:
            labels_train - array of labels predicted for training set
            y_train - array of true labels for training set
            labels_test - array of labels predicted for test set
            y_test - array of true labels for test set

        Returns:
            Tuple ``(all, train, test)``; ``train`` / ``test`` is ``None``
            when the corresponding predicted-label array is empty.
        """
        overall = b3_precision_recall_fscore(self.y, self.clusterer.labels_)
        train = (
            b3_precision_recall_fscore(y_train, labels_train)
            if labels_train.size != 0
            else None
        )
        test = (
            b3_precision_recall_fscore(y_test, labels_test)
            if labels_test.size != 0
            else None
        )
        return overall, train, test