コード例 #1
0
def test_krebs_dataset():
    """Check loading of single Krebsregister blocks.

    Verifies that blocks 1 and 10 come back non-empty, that asking for
    block 11 raises ``ValueError``, and that loading block 10 with
    ``missing_values=0`` yields data without any null entries.
    """
    first_block, _first_links = load_krebsregister(1)
    last_block, _last_links = load_krebsregister(10)

    for frame in (first_block, last_block):
        assert len(frame) > 0

    # block 11 is not an existing block, so the loader must reject it
    with pytest.raises(ValueError):
        load_krebsregister(11)

    # with missing_values=0 no null entries may remain
    filled_block, _filled_links = load_krebsregister(10, missing_values=0)
    null_total = filled_block.isnull().sum().sum()
    assert null_total == 0
コード例 #2
0
def test_krebs_dataset_environ(tmpdir):
    """Check that the dataset files land in the directory named by RL_DATA.

    Points the ``RL_DATA`` environment variable at a temporary directory,
    loads the full dataset, and verifies that ``block_1.zip`` through
    ``block_10.zip`` exist in that directory's ``krebsregister``
    subdirectory. The previous value of ``RL_DATA`` is restored afterwards.

    Parameters
    ----------
    tmpdir
        Temporary directory supplied by the test runner.
    """
    path = Path(str(tmpdir)).expanduser()

    # Remember the current setting so this test does not leak its temporary
    # data directory into tests that run after it.
    previous = environ.get('RL_DATA')
    environ['RL_DATA'] = str(path)
    try:
        load_krebsregister()

        for i in range(1, 11):
            block_file = Path(path, "krebsregister", "block_{}.zip".format(i))
            assert block_file.is_file()
    finally:
        if previous is None:
            environ.pop('RL_DATA', None)
        else:
            environ['RL_DATA'] = previous
コード例 #3
0
    def test_krebs_dataset(self):
        """Check the full Krebsregister dataset and single-block loading.

        Verifies the types and sizes of the full dataset, that blocks 1
        and 10 are non-empty, that requesting block 11 raises
        ``ValueError``, and that ``missing_values=0`` leaves no null
        entries in block 10.
        """
        krebs_data, krebs_matches = load_krebsregister()
        krebs_data_block1, _ = load_krebsregister(1)
        krebs_data_block10, _ = load_krebsregister(10)

        # check the types and count the number of records
        self.assertIsInstance(krebs_data, pandas.DataFrame)
        self.assertIsInstance(krebs_matches, pandas.MultiIndex)
        self.assertEqual(len(krebs_data), 5749132)
        self.assertEqual(len(krebs_matches), 20931)

        self.assertGreater(len(krebs_data_block1), 0)
        self.assertGreater(len(krebs_data_block10), 0)

        # loading a block that does not exist must fail
        with self.assertRaises(ValueError):
            load_krebsregister(11)

        # missing values
        filled_block10, _ = load_krebsregister(10, missing_values=0)
        self.assertEqual(filled_block10.isnull().sum().sum(), 0)
コード例 #4
0
def test_krebs_dataset_download():
    """Check that loading the dataset from a cleared data home works.

    Clears the data home, loads the full dataset, and verifies that
    ``block_1.zip`` through ``block_10.zip`` exist in the data home's
    ``krebsregister`` subdirectory and that the returned objects have the
    expected types and sizes.
    """
    # remove downloaded datasets
    clear_data_home()

    krebs_data, krebs_matches = load_krebsregister()

    for i in range(1, 11):
        assert Path(get_data_home(), "krebsregister",
                    "block_{}.zip".format(i)).is_file()

    # Check the types and count the number of records.
    # NOTE: `assert type(x), T` would always pass, because the second
    # operand of `assert` is only the failure message; use isinstance.
    assert isinstance(krebs_data, pandas.DataFrame)
    assert isinstance(krebs_matches, pandas.MultiIndex)
    assert len(krebs_data) == 5749132
    assert len(krebs_matches) == 20931
コード例 #5
0
    def test_krebs_dataset(self):
        """Check the full Krebsregister dataset and single-block loading.

        Verifies the types and sizes of the full dataset, that blocks 1
        and 10 are non-empty, that requesting block 11 raises
        ``ValueError``, and that ``missing_values=0`` leaves no null
        entries in block 10.
        """
        krebs_data, krebs_matches = load_krebsregister()
        krebs_data_block1, _ = load_krebsregister(1)
        krebs_data_block10, _ = load_krebsregister(10)

        # Check the types and count the number of records.
        # NOTE: `assert type(x), T` would always pass, because the second
        # operand of `assert` is only the failure message; use isinstance.
        assert isinstance(krebs_data, pandas.DataFrame)
        assert isinstance(krebs_matches, pandas.MultiIndex)
        assert len(krebs_data) == 5749132
        assert len(krebs_matches) == 20931

        assert len(krebs_data_block1) > 0
        assert len(krebs_data_block10) > 0

        # load not existing block
        with pytest.raises(ValueError):
            load_krebsregister(11)

        # missing values
        krebs_block10, _ = load_krebsregister(10, missing_values=0)
        assert krebs_block10.isnull().sum().sum() == 0
コード例 #6
0
def test_krebs_shuffle():
    """Smoke-test loading block 10 with ``shuffle=False``.

    Nothing is asserted about the contents; the test only requires that
    the call completes and its result unpacks into two values.
    """
    # shuffle=False is the option exercised here
    loaded = load_krebsregister(10, shuffle=False)
    unshuffled_block, true_links = loaded
コード例 #7
0
def test_krebs_missings():
    """Check that ``missing_values=0`` leaves no null entries in block 10."""
    filled_block, _links = load_krebsregister(10, missing_values=0)

    # total number of null cells over all columns
    null_total = filled_block.isnull().sum().sum()
    assert null_total == 0
コード例 #8
0
    def test_krebs_shuffle(self):
        """Smoke-test loading block 10 with ``shuffle=False``.

        Nothing is asserted about the contents; the test only requires
        that the call completes and its result unpacks into two values.
        """
        # shuffle=False is the option exercised here
        loaded = load_krebsregister(10, shuffle=False)
        unshuffled_block, true_links = loaded
コード例 #9
0
    def test_krebs_missings(self):
        """Check that ``missing_values=0`` leaves no null entries in block 10."""
        filled_block, _links = load_krebsregister(10, missing_values=0)

        # total number of null cells over all columns
        null_total = filled_block.isnull().sum().sum()
        self.assertEqual(null_total, 0)
コード例 #10
0
import recordlinkage as rl
from recordlinkage.datasets import load_krebsregister

# Load the Krebsregister data together with the index of true links,
# passing missing_values=0 to the loader.
krebs_X, krebs_true_links = load_krebsregister(missing_values=0)

print(krebs_true_links)

# Train the classifier and predict the links in one step
ecm = rl.ECMClassifier(binarize=0.8)
result_ecm = ecm.fit_predict(krebs_X)

# Number of predicted links. A bare `len(result_ecm)` expression only
# displays a value in an interactive session; print it so the script
# shows it as well.
print(len(result_ecm))

print(rl.confusion_matrix(krebs_true_links, result_ecm, len(krebs_X)))

# The F-score for this classification is
print(rl.fscore(krebs_true_links, result_ecm))

print(ecm.log_weights)