Example #1
0
class TestData(unittest.TestCase):
    def setUp(self):
        super().setUp()
        self.dummy_dataset = ANDData(
            "tests/dummy/signatures.json",
            "tests/dummy/papers.json",
            clusters="tests/dummy/clusters.json",
            name="dummy",
            load_name_counts=True,
        )

        features_to_use = [
            "name_similarity",
            "affiliation_similarity",
            "email_similarity",
            "coauthor_similarity",
            "venue_similarity",
            "year_diff",
            "title_similarity",
            "reference_features",
            "misc_features",
            "name_counts",
            "journal_similarity",
            "advanced_name_similarity",
        ]
        self.dummy_featurizer = FeaturizationInfo(features_to_use=features_to_use)

    def check_features_array_equal(self, array_1, array_2):
        assert len(array_1) == len(array_2)
        for i in range(len(array_1)):
            both_nan = np.isnan(array_1[i]) and np.isnan(array_2[i])
            if not both_nan:
                self.assertAlmostEqual(array_1[i], array_2[i], msg=i)

    def test_featurizer(self):
        test_pairs = [
            ("3", "0", 0),
            ("3", "1", 0),
            ("3", "2", 0),
            ("3", "2", -1),
        ]
        features, labels, _ = many_pairs_featurize(
            test_pairs, self.dummy_dataset, self.dummy_featurizer, 2, False, 1, nan_value=-1
        )

        expected_features_1 = [
            0.0,
            -1.0,
            -1.0,
            0.0,
            0.0,
            0.0,
            0.2,
            -1.0,
            -1.0,
            -1.0,
            -1.0,
            -1.0,
            0.0,
            4.0,
            0.0,
            0.03067484662576687,
            -1.0,
            -1.0,
            -1.0,
            -1.0,
            0.0,
            -1.0,
            1.0,
            2.0,
            2.0,
            1.0,
            2.0,
            82081.0,
            12.0,
            807.0,
            1.0,
            -1.0,
            -1.0,
            -1.0,
            0.7777777777777778,
            0.8,
            0.7777777777777778,
            0.5407407407407407,
        ]
        expected_features_2 = [
            0.0,
            -1.0,
            -1.0,
            0.0,
            0.0,
            0.0,
            0.2,
            -1.0,
            -1.0,
            -1.0,
            -1.0,
            -1.0,
            0.0,
            6.0,
            0.02857142857142857,
            0.09615384615384616,
            0.25757575757575757,
            0.34615384615384615,
            0.8181818181818182,
            0.2222222222222222,
            0.0,
            0.5,
            1.0,
            2.0,
            2.0,
            1.0,
            2.0,
            23425.0,
            12.0,
            807.0,
            1.0,
            82081.0,
            20.0,
            -1.0,
            0.7777777777777778,
            0.8,
            0.7777777777777778,
            0.5407407407407407,
        ]
        expected_features_3 = [
            0.0,
            -1.0,
            -1.0,
            0.0,
            0.0,
            0.0,
            0.2,
            -1.0,
            -1.0,
            -1.0,
            -1.0,
            -1.0,
            0.0,
            6.0,
            0.0,
            0.058823529411764705,
            -1.0,
            -1.0,
            -1.0,
            -1.0,
            1.0,
            -1.0,
            1.0,
            2.0,
            2.0,
            1.0,
            2.0,
            23425.0,
            12.0,
            807.0,
            1.0,
            82081.0,
            20.0,
            -1.0,
            0.7777777777777778,
            0.8,
            0.7777777777777778,
            0.5407407407407407,
        ]
        self.check_features_array_equal(list(features[0, :]), expected_features_1)
        self.check_features_array_equal(list(features[1, :]), expected_features_2)
        self.check_features_array_equal(list(features[2, :]), expected_features_3)
        self.assertEqual(features[3, 0], -LARGE_INTEGER)

    def test_get_constraint(self):
        first_constraint = self.dummy_dataset.get_constraint("0", "8", high_value=100)
        assert first_constraint == 100
        middle_constraint = self.dummy_dataset.get_constraint("6", "8", high_value=100)
        assert middle_constraint == 100
        no_constraint = self.dummy_dataset.get_constraint("0", "1")
        assert no_constraint is None
Example #2
0
class TestClusterer(unittest.TestCase):
    def setUp(self):
        super().setUp()
        self.dummy_dataset = ANDData(
            "tests/dummy/signatures.json",
            "tests/dummy/papers.json",
            clusters="tests/dummy/clusters.json",
            cluster_seeds="tests/dummy/cluster_seeds.json",
            name="dummy",
            load_name_counts=True,
        )

        features_to_use = [
            "year_diff",
            "misc_features",
        ]
        featurizer_info = FeaturizationInfo(features_to_use=features_to_use)
        np.random.seed(1)
        X_random = np.random.random((10, 6))
        y_random = np.random.randint(0, 6, 10)
        self.dummy_clusterer = Clusterer(
            featurizer_info=featurizer_info,
            classifier=lgb.LGBMClassifier(random_state=1,
                                          data_random_seed=1,
                                          feature_fraction_seed=1).fit(
                                              X_random, y_random),
            n_jobs=1,
            use_cache=False,
            use_default_constraints_as_supervision=False,
        )

    def test_get_constraints(self):
        block = {
            "a sattar": ["0", "1", "2"],
        }
        constraint_1 = self.dummy_dataset.get_constraint("0",
                                                         "1",
                                                         low_value=0,
                                                         high_value=2)
        constraint_2 = self.dummy_dataset.get_constraint("1",
                                                         "0",
                                                         low_value=0,
                                                         high_value=2)
        constraint_3 = self.dummy_dataset.get_constraint("1",
                                                         "2",
                                                         low_value=0,
                                                         high_value=2)
        constraint_4 = self.dummy_dataset.get_constraint("2",
                                                         "1",
                                                         low_value=0,
                                                         high_value=2)

        self.assertIs(constraint_1, LARGE_DISTANCE)
        self.assertIs(constraint_2, LARGE_DISTANCE)
        self.assertIs(constraint_3, 0)
        self.assertIs(constraint_4, 0)

    def test_make_distance_matrix_fastcluster(self):
        block = {
            "a sattar": ["0", "1", "2"],
        }
        partial_supervision = {("0", "1"): 1.1, ("1", "2"): 1e-6}
        distance_matrices = self.dummy_clusterer.make_distance_matrices(
            block_dict=block,
            dataset=self.dummy_dataset,
            partial_supervision=partial_supervision,
        )
        distance_matrix = distance_matrices["a sattar"]
        self.assertEqual(distance_matrix[0], np.float16(1.1))
        self.assertEqual(distance_matrix[1], np.float16(0.3))
        self.assertEqual(distance_matrix[2], np.float16(1e-6))

        distance_matrices = self.dummy_clusterer.make_distance_matrices(
            block_dict=block,
            dataset=self.dummy_dataset,
            partial_supervision={},
        )
        distance_matrix = distance_matrices["a sattar"]
        self.assertEqual(distance_matrix[0], np.float16(0.3))
        self.assertEqual(distance_matrix[1], np.float16(0.3))
        self.assertEqual(distance_matrix[2], np.float16(0.3))