def train_and_save_distance_model(ethnicity_model_path, save_distance_model_path, sampled_pairs_size):
    """Train the distance estimator model and save it to disk.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        save_distance_model_path (str): Full path where trained distance model will be saved.
        sampled_pairs_size (int): Number of pairs to be generated for the training.
            Note: Must be multiple of 12.
    """
    LOGGER.info("Pulling training data from ES")
    signatures = get_signatures(only_curated=True)
    clusters = get_input_clusters(signatures)

    LOGGER.info("Preparing %s pairs from sampled data for training.", sampled_pairs_size)
    sampled_pairs = list(sample_signature_pairs(signatures, clusters, sampled_pairs_size))

    # The distance estimator wraps the ethnicity model as a feature source.
    estimator = DistanceEstimator(EthnicityEstimator(ethnicity_model_path))
    estimator.load_data(signatures, sampled_pairs, sampled_pairs_size)

    LOGGER.info("Training DistanceEstimator...")
    estimator.fit()
    estimator.save_model(save_distance_model_path)
def cluster(ethnicity_model_path, distance_model_path, n_jobs, signature_block=None):
    """Train the clustering model and process the output.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        distance_model_path (str): Full path where distance model is saved.
        n_jobs (int): Number of processes to use.
        signature_block (str): Signature block indicating which block should be clustered.
            If set to None, clustering will run on all blocks.
    """
    LOGGER.info("Pulling signatures for block '%s' from ES", signature_block)
    block_signatures = get_signatures(signature_block=signature_block)
    clusters = get_input_clusters(block_signatures)
    LOGGER.debug(
        "Got %s signature_blocks and %s input_clusters",
        len(block_signatures),
        len(clusters),
    )

    model = Clusterer(DistanceEstimator.get(ethnicity_model_path, distance_model_path))
    model.load_data(block_signatures, clusters)

    LOGGER.info("Starting clustering")
    model.fit(n_jobs=n_jobs)
    return process_clustering_output(model)
def cluster(
    ethnicity_model_path,
    distance_model_path,
    n_jobs,
    signature_block=None,
):
    """Train the clustering model and process the output.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        distance_model_path (str): Full path where distance model is saved.
        n_jobs (int): Number of processes to use.
        signature_block (str): Signature block indicating which block should be clustered.
            If set to None, clustering will run on all blocks.
    """
    start_time = datetime.now()
    LOGGER.info("Preparing test dataset...")
    signatures = get_signatures(signature_block=signature_block)
    input_clusters = get_input_clusters(signatures)
    LOGGER.info(
        "Input data",
        signature_block=signature_block,
        signatures_count=len(signatures),
        curated_signatures_count=len(
            [sig for sig in signatures if sig.get("is_curated_author_id")]),
        input_clusters_count=len(input_clusters),
        input_clusters=input_clusters,
    )
    load_data_time = datetime.now()
    distance_estimator = DistanceEstimator.get(ethnicity_model_path,
                                               distance_model_path)
    clusterer = Clusterer(distance_estimator)
    clusterer.load_data(signatures, input_clusters)
    prepare_clusterer_time = datetime.now()
    LOGGER.info("Clustering", signature_block=signature_block)
    clusterer.fit(n_jobs=n_jobs)
    fit_time = datetime.now()
    # Log per-phonetic-block stats. NOTE: loop variable renamed from `cluster`,
    # which shadowed this very function.
    for phonetic_block, block_clusterer in clusterer.clusterer.clusterers_.items():
        LOGGER.info(
            "Clustering stats",
            load_data_runtime=str(load_data_time - start_time),
            prepare_clusterer_runtime=str(prepare_clusterer_time - load_data_time),
            clustering_runtime=str(fit_time - prepare_clusterer_time),
            total_runtime=str(fit_time - start_time),
            # Fall back to the base estimator's fixed threshold when the block
            # clusterer did not tune one itself.
            threshold=getattr(block_clusterer, "best_threshold_",
                              clusterer.clusterer.base_estimator.threshold),
            signature_block=phonetic_block,
            # supervised_scoring is only available on some clusterer variants.
            B3_f_score=block_clusterer.supervised_scoring(
                clusterer.y, block_clusterer.labels_)
            if hasattr(block_clusterer, "supervised_scoring") else None,
        )
    return process_clustering_output(clusterer)
def train_and_save_distance_model(ethnicity_model_path, save_distance_model_path, sampled_pairs_size):
    """Train the distance estimator model and save it to disk.

    Logs the runtime of every stage (input preparation, pair sampling,
    estimator construction, data loading, training, saving).

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        save_distance_model_path (str): Full path where trained distance model will be saved.
        sampled_pairs_size (int): Number of pairs to be generated for the training.
            Note: Must be multiple of 12.
    """
    LOGGER.info("Pulling training data from ES")
    start_time = datetime.now()
    curated_signatures = get_signatures(only_curated=True)
    input_clusters = get_input_clusters(curated_signatures)
    # Fixed typo: was `prepare_intput_time`.
    prepare_input_time = datetime.now()
    LOGGER.info(
        "Preparing pairs from sampled data for training.",
        pairs_count=sampled_pairs_size,
    )
    pairs = list(
        sample_signature_pairs(curated_signatures, input_clusters,
                               sampled_pairs_size))
    prepare_pairs_time = datetime.now()
    ethnicity_estimator = EthnicityEstimator(ethnicity_model_path)
    distance_estimator = DistanceEstimator(ethnicity_estimator)
    prepare_estimators_time = datetime.now()
    distance_estimator.load_data(curated_signatures, pairs, sampled_pairs_size)
    load_data_to_model_time = datetime.now()
    LOGGER.info("Training DistanceEstimator...")
    distance_estimator.fit()
    training_model_time = datetime.now()
    distance_estimator.save_model(save_distance_model_path)
    save_model_time = datetime.now()
    LOGGER.info(
        "Train distance model",
        prepare_input_runtime=str(prepare_input_time - start_time),
        prepare_pairs_runtime=str(prepare_pairs_time - prepare_input_time),
        prepare_estimators_runtime=str(prepare_estimators_time - prepare_pairs_time),
        load_data_runtime=str(load_data_to_model_time - prepare_estimators_time),
        training_model_runtime=str(training_model_time - load_data_to_model_time),
        save_model_runtime=str(save_model_time - training_model_time),
        total_runtime=str(save_model_time - start_time),
    )
def train_and_save_distance_model(
    ethnicity_model_path,
    save_distance_model_path,
    sampled_pairs_size,
    train_to_validation_split_fraction=0.8,
):
    """Train the distance estimator model and save it to disk.

    Splits curated signatures into train/test sets, trains on sampled training
    pairs, saves the model, then scores it on held-out pairs.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        save_distance_model_path (str): Full path where trained distance model will be saved.
        sampled_pairs_size (int): Number of pairs to be generated for the training.
            Note: Must be multiple of 4.
        train_to_validation_split_fraction (float): fraction of the data used for training.

    Returns:
        set: uuids of the held-out (test) signatures, so callers can evaluate
        downstream clustering on the same split.
    """
    start_time = datetime.now()
    curated_signatures = get_signatures(only_curated=True)
    LOGGER.info(
        "Splitting data into training and test set.",
        training_set_fraction=train_to_validation_split_fraction,
    )
    train_signatures_dict, test_signatures_dict = train_validation_split(
        curated_signatures, train_to_validation_split_fraction)
    train_signatures_list = train_signatures_dict.values()
    test_signatures_list = test_signatures_dict.values()
    input_clusters_train = get_input_clusters(train_signatures_list)
    input_clusters_test = get_input_clusters(test_signatures_list)
    # Fixed typo: was `prepare_intput_time`.
    prepare_input_time = datetime.now()
    LOGGER.info(
        "Preparing pairs from sampled data for training.",
        pairs_count=sampled_pairs_size,
    )
    pairs_train = list(
        sample_signature_pairs(train_signatures_list, input_clusters_train,
                               sampled_pairs_size))
    prepare_pairs_time = datetime.now()
    # Scale the number of test pairs by the squared test/train ratio, rounded
    # up to a multiple of 4 (required by the pair sampler).
    pair_size_test = 4 * math.ceil(
        (((1 - train_to_validation_split_fraction) /
          train_to_validation_split_fraction)**2 * sampled_pairs_size) / 4)
    pairs_test = list(
        sample_signature_pairs(test_signatures_list, input_clusters_test,
                               pair_size_test))
    LOGGER.info(
        "Pairs prepared.",
        n_training_pairs=len(pairs_train),
        n_test_pairs=len(pairs_test),
    )
    ethnicity_estimator = EthnicityEstimator(ethnicity_model_path)
    distance_estimator = DistanceEstimator(ethnicity_estimator)
    prepare_estimators_time = datetime.now()
    distance_estimator.load_data(train_signatures_list, pairs_train,
                                 sampled_pairs_size)
    load_data_to_model_time = datetime.now()
    distance_estimator.fit()
    training_model_time = datetime.now()
    distance_estimator.save_model(save_distance_model_path)
    save_model_time = datetime.now()
    # Re-load with the held-out pairs to compute a validation score.
    distance_estimator.load_data(test_signatures_list, pairs_test,
                                 pair_size_test)
    test_score = distance_estimator.score()
    LOGGER.info(
        "Train distance model",
        prepare_input_runtime=str(prepare_input_time - start_time),
        prepare_pairs_runtime=str(prepare_pairs_time - prepare_input_time),
        prepare_estimators_runtime=str(prepare_estimators_time - prepare_pairs_time),
        load_data_runtime=str(load_data_to_model_time - prepare_estimators_time),
        training_model_runtime=str(training_model_time - load_data_to_model_time),
        save_model_runtime=str(save_model_time - training_model_time),
        total_runtime=str(save_model_time - start_time),
        test_score=str(test_score),
    )
    return set(test_signatures_dict)
def cluster_with_evaluation(
    ethnicity_model_path,
    distance_model_path,
    n_jobs,
    test_signatures_uuids=None,
):
    """Cluster every curated signature block and evaluate against held-out signatures.

    For each block, the author ids of held-out signatures are hidden before
    clustering, then B3 precision/recall/F1 are computed per block and
    aggregated over all blocks.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        distance_model_path (str): Full path where distance model is saved.
        n_jobs (int): Number of processes to use.
        test_signatures_uuids (set): Signature uuids which will be used for model validation.
    """
    start_time = datetime.now()
    signature_blocks = get_curated_signature_blocks()
    # Accumulators for cross-block B3 scoring.
    labels_train, labels_test, y_train, y_test = (
        np.array([]),
        np.array([]),
        np.array([]),
        np.array([]),
    )
    statistics_names = ("precision", "recall", "f1")
    for clustered_blocks, block in enumerate(signature_blocks, 1):
        # enumerate starts at 1, so the last iteration is exactly
        # len(signature_blocks); previously `total` was off by one.
        LOGGER.info("Clustering a new block",
                    current=clustered_blocks,
                    total=len(signature_blocks))
        test_signatures = []
        test_authors_ids = []
        signatures = get_signatures(signature_block=block, only_curated=True)
        input_clusters_with_all_labels = get_input_clusters(signatures)
        # Hide the author_id of held-out signatures so clustering cannot use it.
        for signature in signatures:
            if signature.signature_uuid in test_signatures_uuids:
                test_authors_ids.append(signature.author_id)
                signature.author_id = None
                test_signatures.append(signature.signature_uuid)
        input_clusters = get_input_clusters(signatures)
        # Ground-truth cluster ids for the held-out signatures.
        # NOTE: loop variable renamed from `cluster`, which shadowed the
        # module-level cluster() function.
        test_labels = []
        for input_cluster in input_clusters:
            for signature_uuid in input_cluster["signature_uuids"]:
                if signature_uuid in test_signatures:
                    test_labels.append(input_cluster["cluster_id"])
        LOGGER.info(
            "Input data",
            signature_block=block,
            signatures_count=len(signatures),
            input_clusters_count=len(input_clusters),
            input_clusters=input_clusters,
        )
        load_data_time = datetime.now()
        distance_estimator = DistanceEstimator.get(ethnicity_model_path,
                                                   distance_model_path)
        clusterer = Clusterer(distance_estimator)
        clusterer.load_data(signatures, input_clusters)
        prepare_clusterer_time = datetime.now()
        LOGGER.info("Clustering", signature_block=block)
        clusterer.fit(n_jobs=n_jobs)
        fit_time = datetime.now()
        for phonetic_block, block_clusterer in clusterer.clusterer.clusterers_.items():
            LOGGER.info(
                "Clustering stats",
                load_data_runtime=str(load_data_time - start_time),
                prepare_clusterer_runtime=str(prepare_clusterer_time -
                                              load_data_time),
                clustering_runtime=str(fit_time - prepare_clusterer_time),
                total_runtime=str(fit_time - start_time),
                # Fall back to the base estimator's fixed threshold when the
                # block clusterer did not tune one itself.
                threshold=getattr(
                    block_clusterer,
                    "best_threshold_",
                    clusterer.clusterer.base_estimator.threshold,
                ),
                signature_block=phonetic_block,
            )
        (
            labels_train_per_block,
            y_train_per_block,
            labels_test_per_block,
            y_test_per_block,
        ) = clusterer.prepare_test_data(test_signatures_uuids, test_labels)
        (
            B3_statistics_all_per_block,
            B3_statistics_training_per_block,
            B3_statistics_test_per_block,
        ) = clusterer.score(
            labels_train_per_block,
            y_train_per_block,
            labels_test_per_block,
            y_test_per_block,
        )
        nb_of_clusters_per_author = clusterer.nb_of_clusters_predicted_for_author(
            input_clusters_with_all_labels, test_authors_ids)
        LOGGER.info(
            "Clustering results for block {}".format(block),
            train_dataset_size=y_train_per_block.size,
            test_dataset_size=y_test_per_block.size,
            true_number_of_clusters=np.unique(clusterer.y).size,
            predicted_number_of_clusters=np.unique(
                clusterer.clusterer.labels_).size,
            B3_precision_recall_f_score_all=dict(
                zip(statistics_names, B3_statistics_all_per_block)),
            B3_precision_recall_f_score_training=dict(
                zip(statistics_names, B3_statistics_training_per_block))
            if B3_statistics_training_per_block else None,
            B3_precision_recall_f_score_test=dict(
                zip(statistics_names, B3_statistics_test_per_block))
            if B3_statistics_test_per_block else None,
            nb_of_clusters_per_author=nb_of_clusters_per_author)
        labels_train = np.concatenate((labels_train, labels_train_per_block))
        y_train = np.concatenate((y_train, y_train_per_block))
        labels_test = np.concatenate((labels_test, labels_test_per_block))
        y_test = np.concatenate((y_test, y_test_per_block))
    # Aggregate B3 statistics across all blocks.
    B3_statistics_training = b3_precision_recall_fscore(y_train, labels_train)
    B3_statistics_test = b3_precision_recall_fscore(y_test, labels_test)
    B3_statistics_all = b3_precision_recall_fscore(
        np.append(y_train, y_test), np.append(labels_train, labels_test))
    LOGGER.info(
        "Clustering results for all the blocks",
        B3_precision_recall_f_score_all=B3_statistics_all,
        B3_statistics_training=B3_statistics_training,
        B3_statistics_test=B3_statistics_test,
    )
def test_distance_estimator_load_data(scan_mock, es_record_with_many_curated_authors):
    """Check that DistanceEstimator.load_data builds the (X, y) training arrays.

    X must contain the resolved Signature pair for each input pair, and y must
    be 0 for same-cluster pairs and 1 for different-cluster pairs.
    Fixtures: `scan_mock` stubs the ES scan; `es_record_with_many_curated_authors`
    is the single record all signatures come from (presumably pytest fixtures —
    confirm in conftest).
    """
    scan_mock.side_effect = [[es_record_with_many_curated_authors]]
    signatures = get_signatures()
    # Two same-cluster pairs followed by two different-cluster pairs.
    pairs = [
        {
            "same_cluster": True,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e53",
            ],
        },
        {
            "same_cluster": True,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e55",
            ],
        },
        {
            "same_cluster": False,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e56",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e57",
            ],
        },
        {
            "same_cluster": False,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
            ],
        },
    ]
    # No ethnicity estimator needed: load_data only resolves uuids to signatures.
    distance_estimator = DistanceEstimator(None)
    distance_estimator.load_data(signatures, pairs, 4)
    # Expected X: one row per pair, each row holding the two full Signature
    # objects resolved from the uuids above (all built from the same record).
    expected_X = array(
        [
            [
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=1,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                    is_curated_author_id=True,
                ),
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=1,
                    author_name="Doe, J",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e53",
                    is_curated_author_id=True),
            ],
            [
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=2,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e54",
                    is_curated_author_id=True),
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=2,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e55",
                    is_curated_author_id=True),
            ],
            [
                # Different-cluster pair: distinct author ids (6 vs 7) and,
                # for the second signature, a different signature block.
                Signature(
                    author_affiliation="",
                    author_id=6,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e56",
                    is_curated_author_id=True),
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=7,
                    author_name="Jamie",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="Jana",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e57",
                    is_curated_author_id=True),
            ],
            [
                # Different-cluster pair built from signatures already used
                # above (e52 / e54) but belonging to different authors (1 vs 2).
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=1,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                    is_curated_author_id=True),
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=2,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e54",
                    is_curated_author_id=True),
            ],
        ],
        dtype=object,
    )
    # y encodes same_cluster: 0 for the two True pairs, 1 for the two False pairs.
    expected_y = array([0, 0, 1, 1])
    assert (distance_estimator.X == expected_X).all()
    assert (distance_estimator.y == expected_y).all()