Example #1
def test_publication_cache_invalidated_after_get_signatures(
    scan_mock,
    es_record_with_2_curated_authors,
):
    scan_mock.side_effect = [[
        es_record_with_2_curated_authors,
    ]]
    get_signatures(only_curated=True)

    assert PublicationCache.cache == {}
Example #2
def cluster(ethnicity_model_path,
            distance_model_path,
            n_jobs,
            signature_block=None):
    """Train the clustering model and process the output.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        distance_model_path (str): Full path where distance model is saved.
        n_jobs (int): Number of processes to use.
        signature_block (str): Signature block indicating which block should be
            clustered. If set to None, clustering will run on all blocks.
    """
    LOGGER.info("Pulling signatures for block '%s' from ES", signature_block)

    signatures = get_signatures(signature_block=signature_block)
    input_clusters = get_input_clusters(signatures)
    LOGGER.debug(
        "Got %s signature_blocks and %s input_clusters",
        len(signatures),
        len(input_clusters),
    )
    distance_estimator = DistanceEstimator.get(ethnicity_model_path,
                                               distance_model_path)

    clusterer = Clusterer(distance_estimator)
    clusterer.load_data(signatures, input_clusters)
    LOGGER.info("Starting clustering")
    clusterer.fit(n_jobs=n_jobs)

    return process_clustering_output(clusterer)
Example #3
def train_and_save_distance_model(ethnicity_model_path,
                                  save_distance_model_path,
                                  sampled_pairs_size):
    """Train the distance estimator model and save it to disk.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        save_distance_model_path (str): Full path where trained distance model
            will be saved.
        sampled_pairs_size (int): Number of pairs to be generated for the training.
            Note:
                Must be multiple of 12.
    """
    LOGGER.info("Pulling training data from ES")
    curated_signatures = get_signatures(only_curated=True)
    input_clusters = get_input_clusters(curated_signatures)
    LOGGER.info("Preparing %s pairs from sampled data for training.",
                sampled_pairs_size)
    pairs = list(
        sample_signature_pairs(curated_signatures, input_clusters,
                               sampled_pairs_size))

    ethnicity_estimator = EthnicityEstimator(ethnicity_model_path)
    distance_estimator = DistanceEstimator(ethnicity_estimator)
    distance_estimator.load_data(curated_signatures, pairs, sampled_pairs_size)
    LOGGER.info("Training DistanceEstimator...")
    distance_estimator.fit()
    distance_estimator.save_model(save_distance_model_path)
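The two entry points above are normally run in sequence: train and persist the distance model once, then cluster with it. Below is a minimal usage sketch under stated assumptions: the import of these functions is omitted because it depends on the project layout, and the model file paths are hypothetical.

# Usage sketch (assumptions: the functions above are importable from the
# project; the model file paths are hypothetical).
ETHNICITY_MODEL_PATH = "/models/ethnicity_model.h5"   # assumed location of a pre-trained ethnicity model
DISTANCE_MODEL_PATH = "/models/distance_model.pkl"    # assumed location; written by the training step

# Train the distance estimator on curated signatures.
# sampled_pairs_size must be a multiple of 12 (see the docstring above).
train_and_save_distance_model(
    ethnicity_model_path=ETHNICITY_MODEL_PATH,
    save_distance_model_path=DISTANCE_MODEL_PATH,
    sampled_pairs_size=12000,
)

# Cluster a single phonetic block with the trained model, or pass
# signature_block=None to cluster all blocks.
output = cluster(
    ethnicity_model_path=ETHNICITY_MODEL_PATH,
    distance_model_path=DISTANCE_MODEL_PATH,
    n_jobs=4,
    signature_block="SABARGn",
)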
Example #4
def test_get_input_clusters_for_signatures_with_different_author_id(
        scan_mock, es_record_with_2_curated_authors,
        es_record_with_curated_author):
    scan_mock.side_effect = [[
        es_record_with_2_curated_authors, es_record_with_curated_author
    ]]
    signatures = get_signatures()
    clusters = get_input_clusters(signatures)

    expected_clusters = [
        {
            "author_id": 989440,
            "cluster_id": 0,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e55",
            ],
        },
        {
            "author_id": 989441,
            "cluster_id": 1,
            "signature_uuids": ["94fc2b0a-dc17-42c2-bae3-ca0024079e52"],
        },
    ]
    assert clusters == expected_clusters
Example #5
def test_get_signatures_for_signature_block_only_curated(
        scan_mock, es_record_with_2_curated_authors,
        es_record_with_non_curated_author):
    scan_mock.side_effect = [[
        es_record_with_2_curated_authors, es_record_with_non_curated_author
    ]]
    signatures = get_signatures(signature_block="SABARGn", only_curated=True)
    expected_signatures = [
        Signature(
            author_affiliation="Rutgers U., Piscataway",
            author_id=989440,
            author_name="Seiberg, N.",
            publication=Publication(
                abstract="2 curated authors with recid",
                authors=["Seiberg, N.", "Jimmy"],
                collaborations=[],
                keywords=["effective action", "approximation: semiclassical"],
                publication_id=374836,
                title="Title",
                topics=["Theory-HEP"],
            ),
            signature_block="SABARGn",
            signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
            is_curated_author_id=True)
    ]

    assert sorted(signatures, key=itemgetter("signature_uuid")) == sorted(
        expected_signatures, key=itemgetter("signature_uuid"))
Example #6
def test_sample_signature_pairs(scan_mock, random_choice,
                                es_record_with_many_curated_authors):
    scan_mock.side_effect = [[es_record_with_many_curated_authors]]
    signatures = get_signatures()
    clusters = get_input_clusters(signatures)
    choices = [
        # same cluster, different name
        ("JOhn", "94fc2b0a-dc17-42c2-bae3-ca0024079e52"),
        "94fc2b0a-dc17-42c2-bae3-ca0024079e53",
        # same cluster, same name
        ("JOhn", "94fc2b0a-dc17-42c2-bae3-ca0024079e54"),
        "94fc2b0a-dc17-42c2-bae3-ca0024079e55",
        # different cluster, different name
        ("JOhn", "94fc2b0a-dc17-42c2-bae3-ca0024079e56"),
        "94fc2b0a-dc17-42c2-bae3-ca0024079e57",
        # different cluster, same name
        ("JOhn", "94fc2b0a-dc17-42c2-bae3-ca0024079e52"),
        "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
    ]
    random_choice.side_effect = choices
    pairs = list(sample_signature_pairs(signatures, clusters, 4))
    expected_pairs = [
        {
            "same_cluster": True,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e53",
            ],
        },
        {
            "same_cluster": True,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e55",
            ],
        },
        {
            "same_cluster": False,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e56",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e57",
            ],
        },
        {
            "same_cluster": False,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
            ],
        },
    ]

    assert pairs == expected_pairs
Example #7
def cluster(
    ethnicity_model_path,
    distance_model_path,
    n_jobs,
    signature_block=None,
):
    """Train the clustering model and process the output.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        distance_model_path (str): Full path where distance model is saved.
        n_jobs (int): Number of processes to use.
        signature_block (str): Signature block indicating which block should be
            clustered. If set to None, clustering will run on all blocks.
    """
    start_time = datetime.now()
    LOGGER.info("Preparing test dataset...")
    signatures = get_signatures(signature_block=signature_block)
    input_clusters = get_input_clusters(signatures)
    LOGGER.info(
        "Input data",
        signature_block=signature_block,
        signatures_count=len(signatures),
        curated_signatures_count=len(
            [sig for sig in signatures if sig.get("is_curated_author_id")]),
        input_clusters_count=len(input_clusters),
        input_clusters=input_clusters,
    )
    load_data_time = datetime.now()

    distance_estimator = DistanceEstimator.get(ethnicity_model_path,
                                               distance_model_path)
    clusterer = Clusterer(distance_estimator)
    clusterer.load_data(signatures, input_clusters)
    prepare_clusterer_time = datetime.now()
    LOGGER.info("Clustering", signature_block=signature_block)
    clusterer.fit(n_jobs=n_jobs)
    fit_time = datetime.now()
    for phonetic_block, cluster in clusterer.clusterer.clusterers_.items():
        LOGGER.info(
            "Clustering stats",
            load_data_runtime=str(load_data_time - start_time),
            prepare_clusterer_runtime=str(prepare_clusterer_time -
                                          load_data_time),
            clustering_runtime=str(fit_time - prepare_clusterer_time),
            total_runtime=str(fit_time - start_time),
            threshold=getattr(cluster, "best_threshold_",
                              clusterer.clusterer.base_estimator.threshold),
            signature_block=phonetic_block,
            B3_f_score=cluster.supervised_scoring(clusterer.y, cluster.labels_)
            if hasattr(cluster, "supervised_scoring") else None,
        )
    return process_clustering_output(clusterer)
Example #8
def train_and_save_distance_model(ethnicity_model_path,
                                  save_distance_model_path,
                                  sampled_pairs_size):
    """Train the distance estimator model and save it to disk.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        save_distance_model_path (str): Full path where trained distance model
            will be saved.
        sampled_pairs_size (int): Number of pairs to be generated for the training.
            Note:
                Must be multiple of 12.
    """
    LOGGER.info("Pulling training data from ES")
    start_time = datetime.now()
    curated_signatures = get_signatures(only_curated=True)
    input_clusters = get_input_clusters(curated_signatures)
    prepare_input_time = datetime.now()
    LOGGER.info(
        "Preparing pairs from sampled data for training.",
        pairs_count=sampled_pairs_size,
    )
    pairs = list(
        sample_signature_pairs(curated_signatures, input_clusters,
                               sampled_pairs_size))
    prepare_pairs_time = datetime.now()

    ethnicity_estimator = EthnicityEstimator(ethnicity_model_path)
    distance_estimator = DistanceEstimator(ethnicity_estimator)
    prepare_estimators_time = datetime.now()
    distance_estimator.load_data(curated_signatures, pairs, sampled_pairs_size)
    load_data_to_model_time = datetime.now()
    LOGGER.info("Training DistanceEstimator...")
    distance_estimator.fit()
    training_model_time = datetime.now()
    distance_estimator.save_model(save_distance_model_path)
    save_model_time = datetime.now()
    LOGGER.info(
        "Train distance model",
        prepare_input_runtime=str(prepare_input_time - start_time),
        prepare_pairs_runtime=str(prepare_pairs_time - prepare_input_time),
        prepare_estimators_runtime=str(prepare_estimators_time -
                                       prepare_pairs_time),
        load_data_runtime=str(load_data_to_model_time -
                              prepare_estimators_time),
        training_model_runtime=str(training_model_time -
                                   load_data_to_model_time),
        save_model_runtime=str(save_model_time - training_model_time),
        total_runtime=str(save_model_time - start_time),
    )
Example #9
def test_sample_signature_pairs_could_not_generate_pairs(
        scan_mock, random_choice, es_record_with_many_curated_authors):
    scan_mock.side_effect = [[es_record_with_many_curated_authors]]
    signatures = get_signatures()
    clusters = get_input_clusters(signatures)
    # it won't be able to generate the needed pairs
    # because it can only find the same cluster, different name
    choices = [
        ("JOhn", "94fc2b0a-dc17-42c2-bae3-ca0024079e52"),
        "94fc2b0a-dc17-42c2-bae3-ca0024079e53",
    ] * 64
    random_choice.side_effect = choices
    with pytest.raises(IncompleteSamplingError):
        list(sample_signature_pairs(signatures, clusters, 4))
Example #10
def test_get_input_clusters_for_non_curated_and_curated_signatures(
        scan_mock, es_record_with_non_curated_author,
        es_record_with_curated_author):
    scan_mock.side_effect = [[
        es_record_with_non_curated_author, es_record_with_curated_author
    ]]
    signatures = get_signatures()
    clusters = get_input_clusters(signatures)

    expected_clusters = [
        {
            "author_id": 989441,
            "cluster_id": 0,
            "signature_uuids": ["94fc2b0a-dc17-42c2-bae3-ca0024079e52"],
        },
        {
            "author_id": None,
            "cluster_id": -1,
            "signature_uuids": ["94fc2b0a-dc17-42c2-bae3-ca0024079e51"],
        },
    ]

    assert clusters == expected_clusters
Example #11
def train_and_save_distance_model(
    ethnicity_model_path,
    save_distance_model_path,
    sampled_pairs_size,
    train_to_validation_split_fraction=0.8,
):
    """Train the distance estimator model and save it to disk.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        save_distance_model_path (str): Full path where trained distance model
            will be saved.
        sampled_pairs_size (int): Number of pairs to be generated for the training.
            Note:
                Must be multiple of 4.
        train_to_validation_split_fraction (float): fraction of the data
            used for training.
    """
    start_time = datetime.now()
    curated_signatures = get_signatures(only_curated=True)
    LOGGER.info(
        "Splitting data into training and test set.",
        training_set_fraction=train_to_validation_split_fraction,
    )
    train_signatures_dict, test_signatures_dict = train_validation_split(
        curated_signatures, train_to_validation_split_fraction)
    train_signatures_list = train_signatures_dict.values()
    test_signatures_list = test_signatures_dict.values()
    input_clusters_train = get_input_clusters(train_signatures_list)
    input_clusters_test = get_input_clusters(test_signatures_list)
    prepare_input_time = datetime.now()
    LOGGER.info(
        "Preparing pairs from sampled data for training.",
        pairs_count=sampled_pairs_size,
    )
    pairs_train = list(
        sample_signature_pairs(train_signatures_list, input_clusters_train,
                               sampled_pairs_size))
    prepare_pairs_time = datetime.now()
    # must be multiple of 4
    pair_size_test = 4 * math.ceil(
        (((1 - train_to_validation_split_fraction) /
          train_to_validation_split_fraction)**2 * sampled_pairs_size) / 4)
    pairs_test = list(
        sample_signature_pairs(test_signatures_list, input_clusters_test,
                               pair_size_test))
    LOGGER.info(
        "Pairs prepared.",
        n_training_pairs=len(pairs_train),
        n_test_pairs=len(pairs_test),
    )
    ethnicity_estimator = EthnicityEstimator(ethnicity_model_path)
    distance_estimator = DistanceEstimator(ethnicity_estimator)
    prepare_estimators_time = datetime.now()
    distance_estimator.load_data(train_signatures_list, pairs_train,
                                 sampled_pairs_size)
    load_data_to_model_time = datetime.now()
    distance_estimator.fit()
    training_model_time = datetime.now()
    distance_estimator.save_model(save_distance_model_path)
    save_model_time = datetime.now()
    distance_estimator.load_data(test_signatures_list, pairs_test,
                                 pair_size_test)
    test_score = distance_estimator.score()
    LOGGER.info(
        "Train distance model",
        prepare_input_runtime=str(prepare_input_time - start_time),
        prepare_pairs_runtime=str(prepare_pairs_time - prepare_input_time),
        prepare_estimators_runtime=str(prepare_estimators_time -
                                       prepare_pairs_time),
        load_data_runtime=str(load_data_to_model_time -
                              prepare_estimators_time),
        training_model_runtime=str(training_model_time -
                                   load_data_to_model_time),
        save_model_runtime=str(save_model_time - training_model_time),
        total_runtime=str(save_model_time - start_time),
        test_score=str(test_score),
    )
    return set(test_signatures_dict)
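In the variant above, pair_size_test scales the training pair count by the squared validation-to-training ratio and rounds the result up to the next multiple of 4, as the inline comment requires. A quick worked sketch of that arithmetic, with an assumed 0.8 split and 12,000 training pairs:

import math

# Assumed illustration values (not taken from the snippet above).
train_to_validation_split_fraction = 0.8
sampled_pairs_size = 12000

# ((1 - f) / f) ** 2 shrinks the pair count in proportion to the smaller
# validation split, then the result is rounded up to a multiple of 4.
ratio = (1 - train_to_validation_split_fraction) / train_to_validation_split_fraction
pair_size_test = 4 * math.ceil((ratio ** 2 * sampled_pairs_size) / 4)

print(pair_size_test)  # 752: 0.25 ** 2 * 12000 = 750, rounded up to the next multiple of 4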
Example #12
def cluster_with_evaluation(
    ethnicity_model_path,
    distance_model_path,
    n_jobs,
    test_signatures_uuids=None,
):
    """Train the clustering model and process the output.

    Args:
        ethnicity_model_path (str): Full path where ethnicity model is saved.
        distance_model_path (str): Full path where distance model is saved.
        n_jobs (int): Number of processes to use.
        test_signatures_uuids (set): Signature uuids which will be used
            for model validation.
    """
    start_time = datetime.now()
    signature_blocks = get_curated_signature_blocks()
    labels_train, labels_test, y_train, y_test = (
        np.array([]),
        np.array([]),
        np.array([]),
        np.array([]),
    )
    statistics_names = ("precision", "recall", "f1")
    for clustered_blocks, block in enumerate(signature_blocks, 1):
        LOGGER.info("Clustering a new block",
                    current=clustered_blocks,
                    total=len(signature_blocks))
        test_signatures = []
        test_authors_ids = []
        signatures = get_signatures(signature_block=block, only_curated=True)
        input_clusters_with_all_labels = get_input_clusters(signatures)
        for signature in signatures:
            if signature.signature_uuid in test_signatures_uuids:
                test_authors_ids.append(signature.author_id)
                signature.author_id = None
                test_signatures.append(signature.signature_uuid)
        input_clusters = get_input_clusters(signatures)
        test_labels = []
        for cluster in input_clusters:
            for signature in cluster["signature_uuids"]:
                if signature in test_signatures:
                    test_labels.append(cluster["cluster_id"])
        LOGGER.info(
            "Input data",
            signature_block=block,
            signatures_count=len(signatures),
            input_clusters_count=len(input_clusters),
            input_clusters=input_clusters,
        )
        load_data_time = datetime.now()

        distance_estimator = DistanceEstimator.get(ethnicity_model_path,
                                                   distance_model_path)
        clusterer = Clusterer(distance_estimator)
        clusterer.load_data(signatures, input_clusters)
        prepare_clusterer_time = datetime.now()
        LOGGER.info("Clustering", signature_block=block)
        clusterer.fit(n_jobs=n_jobs)
        fit_time = datetime.now()
        for phonetic_block, cluster in clusterer.clusterer.clusterers_.items():
            LOGGER.info(
                "Clustering stats",
                load_data_runtime=str(load_data_time - start_time),
                prepare_clusterer_runtime=str(prepare_clusterer_time -
                                              load_data_time),
                clustering_runtime=str(fit_time - prepare_clusterer_time),
                total_runtime=str(fit_time - start_time),
                threshold=getattr(
                    cluster,
                    "best_threshold_",
                    clusterer.clusterer.base_estimator.threshold,
                ),
                signature_block=phonetic_block,
            )
            (
                labels_train_per_block,
                y_train_per_block,
                labels_test_per_block,
                y_test_per_block,
            ) = clusterer.prepare_test_data(test_signatures_uuids, test_labels)
            (
                B3_statistics_all_per_block,
                B3_statistics_training_per_block,
                B3_statistics_test_per_block,
            ) = clusterer.score(
                labels_train_per_block,
                y_train_per_block,
                labels_test_per_block,
                y_test_per_block,
            )
            nb_of_clusters_per_author = clusterer.nb_of_clusters_predicted_for_author(
                input_clusters_with_all_labels, test_authors_ids)
            LOGGER.info(
                "Clustering results for block {}".format(block),
                train_dataset_size=y_train_per_block.size,
                test_dataset_size=y_test_per_block.size,
                true_number_of_clusters=np.unique(clusterer.y).size,
                predicted_number_of_clusters=np.unique(
                    clusterer.clusterer.labels_).size,
                B3_precision_recall_f_score_all=dict(
                    zip(statistics_names, B3_statistics_all_per_block)),
                B3_precision_recall_f_score_training=dict(
                    zip(statistics_names, B3_statistics_training_per_block))
                if B3_statistics_training_per_block else None,
                B3_precision_recall_f_score_test=dict(
                    zip(statistics_names, B3_statistics_test_per_block))
                if B3_statistics_test_per_block else None,
                nb_of_clusters_per_author=nb_of_clusters_per_author)
            labels_train = np.concatenate(
                (labels_train, labels_train_per_block))
            y_train = np.concatenate((y_train, y_train_per_block))
            labels_test = np.concatenate((labels_test, labels_test_per_block))
            y_test = np.concatenate((y_test, y_test_per_block))

    B3_statistics_training = b3_precision_recall_fscore(y_train, labels_train)
    B3_statistics_test = b3_precision_recall_fscore(y_test, labels_test)
    B3_statistics_all = b3_precision_recall_fscore(
        np.append(y_train, y_test), np.append(labels_train, labels_test))
    LOGGER.info(
        "Clustering results for all the blocks",
        B3_precision_recall_f_score_all=B3_statistics_all,
        B3_statistics_training=B3_statistics_training,
        B3_statistics_test=B3_statistics_test,
    )
Example #13
def test_distance_estimator_load_data(scan_mock,
                                      es_record_with_many_curated_authors):
    scan_mock.side_effect = [[es_record_with_many_curated_authors]]
    signatures = get_signatures()
    pairs = [
        {
            "same_cluster": True,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e53",
            ],
        },
        {
            "same_cluster": True,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e55",
            ],
        },
        {
            "same_cluster": False,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e56",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e57",
            ],
        },
        {
            "same_cluster": False,
            "signature_uuids": [
                "94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                "94fc2b0a-dc17-42c2-bae3-ca0024079e54",
            ],
        },
    ]
    distance_estimator = DistanceEstimator(None)
    distance_estimator.load_data(signatures, pairs, 4)
    expected_X = array(
        [
            [
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=1,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                    is_curated_author_id=True,
                ),
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=1,
                    author_name="Doe, J",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e53",
                    is_curated_author_id=True),
            ],
            [
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=2,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e54",
                    is_curated_author_id=True),
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=2,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e55",
                    is_curated_author_id=True),
            ],
            [
                Signature(
                    author_affiliation="",
                    author_id=6,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e56",
                    is_curated_author_id=True),
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=7,
                    author_name="Jamie",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="Jana",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e57",
                    is_curated_author_id=True),
            ],
            [
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=1,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                    is_curated_author_id=True),
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=2,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="Many curated authors",
                        authors=[
                            "Doe, John",
                            "Doe, J",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Doe, John",
                            "Jamie",
                            "Jamie",
                        ],
                        collaborations=[],
                        keywords=["keyword"],
                        publication_id=1,
                        title="Title",
                        topics=["category"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e54",
                    is_curated_author_id=True),
            ],
        ],
        dtype=object,
    )
    expected_y = array([0, 0, 1, 1])
    assert (distance_estimator.X == expected_X).all()
    assert (distance_estimator.y == expected_y).all()
Example #14
def test_clusterer_load_data(
    scan_mock,
    distance_estimator_mock,
    es_record_with_curated_author,
    es_record_with_non_curated_author,
):
    scan_mock.side_effect = [[
        es_record_with_curated_author, es_record_with_non_curated_author
    ]]
    signatures = get_signatures()
    input_clusters = get_input_clusters(signatures)
    clusterer = Clusterer(distance_estimator_mock)
    clusterer.load_data(signatures, input_clusters)
    expected_X = array(
        [
            [
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=989441,
                    author_name="Doe, John",
                    publication=Publication(
                        abstract="2 curated authors with recid",
                        authors=["Doe, John"],
                        collaborations=["ATLAS"],
                        keywords=[
                            "effective action", "approximation: semiclassical"
                        ],
                        publication_id=374836,
                        title="Title",
                        topics=["Theory-HEP"],
                    ),
                    signature_block="JOhn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
                    is_curated_author_id=True)
            ],
            [
                Signature(
                    author_affiliation="Rutgers U., Piscataway",
                    author_id=989443,
                    author_name="Seiberg, Nana.",
                    publication=Publication(
                        abstract="Author curated no recid",
                        authors=["Seiberg, Nana."],
                        collaborations=[],
                        keywords=["thesis", "string model"],
                        publication_id=421404,
                        title="Black holes in string theory",
                        topics=["Theory-HEP"],
                    ),
                    signature_block="SABARGn",
                    signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e51",
                    is_curated_author_id=False,
                )
            ],
        ],
        dtype=object,
    )

    expected_y = array([0, -1])
    assert (clusterer.X == expected_X).all()
    assert (clusterer.y == expected_y).all()
Example #15
def test_get_signatures_for_all(
    scan_mock,
    es_record_with_2_curated_authors,
    es_record_with_curated_author_and_no_recid,
    es_record_with_non_curated_author,
):
    scan_mock.side_effect = [[
        es_record_with_2_curated_authors,
        es_record_with_curated_author_and_no_recid,
        es_record_with_non_curated_author,
    ]]
    signatures = get_signatures()
    expected_signatures = [
        Signature(
            author_affiliation="Rutgers U., Piscataway",
            author_id=989440,
            author_name="Seiberg, N.",
            publication=Publication(
                abstract="2 curated authors with recid",
                authors=["Seiberg, N.", "Jimmy"],
                collaborations=[],
                keywords=["effective action", "approximation: semiclassical"],
                publication_id=374836,
                title="Title",
                topics=["Theory-HEP"],
            ),
            signature_block="SABARGn",
            signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e52",
            is_curated_author_id=True),
        Signature(
            author_affiliation="UAIC",
            author_id=989440,
            author_name="Jimmy",
            publication=Publication(
                abstract="2 curated authors with recid",
                authors=["Seiberg, N.", "Jimmy"],
                collaborations=[],
                keywords=["effective action", "approximation: semiclassical"],
                publication_id=374836,
                title="Title",
                topics=["Theory-HEP"],
            ),
            signature_block="JANa",
            signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e55",
            is_curated_author_id=True,
        ),
        Signature(
            author_affiliation="Texas U.",
            author_id=None,
            author_name="Weinberg, Steven",
            publication=Publication(
                abstract="Author not curated",
                authors=["Weinberg, Steven"],
                collaborations=[],
                keywords=["book"],
                publication_id=406190,
                title="The Quantum theory of fields. Vol. 1: Foundations",
                topics=["Theory-HEP", "General Physics"],
            ),
            signature_block="WANBARGs",
            signature_uuid="5e550ded-e955-4a22-b906-8af5aaa9f1e2",
            is_curated_author_id=False),
        Signature(
            author_affiliation="Rutgers U., Piscataway",
            author_id=989443,
            author_name="Seiberg, Nana.",
            publication=Publication(
                abstract="Author curated no recid",
                authors=["Seiberg, Nana."],
                collaborations=[],
                keywords=["thesis", "string model"],
                publication_id=421404,
                title="Black holes in string theory",
                topics=["Theory-HEP"],
            ),
            signature_block="SABARGn",
            signature_uuid="94fc2b0a-dc17-42c2-bae3-ca0024079e51",
            is_curated_author_id=False),
    ]
    assert sorted(signatures, key=itemgetter("signature_uuid")) == sorted(
        expected_signatures, key=itemgetter("signature_uuid"))