def test_krebs_dataset():
    """Check block loading, rejection of an unknown block, and null filling."""
    first_block, _ = load_krebsregister(1)
    last_block, _ = load_krebsregister(10)

    # Both individually loaded blocks must be non-empty.
    for block in (first_block, last_block):
        assert len(block) > 0

    # Block 11 is expected to be rejected with a ValueError.
    with pytest.raises(ValueError):
        load_krebsregister(11)

    # With missing_values=0 no null entries may remain in the data.
    filled_block, _ = load_krebsregister(10, missing_values=0)
    null_total = filled_block.isnull().sum().sum()
    assert null_total == 0
def test_krebs_dataset_environ(tmpdir):
    """Check that the dataset is stored under the directory named by RL_DATA.

    ``RL_DATA`` is pointed at the temporary directory for the duration of
    the test and restored afterwards, so the override cannot leak into
    other tests running in the same process.

    Parameters
    ----------
    tmpdir
        Temporary directory supplied by the test runner; used as data home.
    """
    path = Path(str(tmpdir)).expanduser()

    # Remember the previous setting (None means "was not set at all").
    previous = environ.get('RL_DATA')
    environ['RL_DATA'] = str(path)
    try:
        # Only the files written as a side effect matter here.
        load_krebsregister()

        for i in range(1, 11):
            block_file = Path(path, "krebsregister", "block_{}.zip".format(i))
            assert block_file.is_file()
    finally:
        # Put the environment back exactly as it was found.
        if previous is None:
            environ.pop('RL_DATA', None)
        else:
            environ['RL_DATA'] = previous
def test_krebs_dataset(self):
    """Check types and sizes of the full dataset and of single blocks."""
    full_data, full_matches = load_krebsregister()
    first_block, _ = load_krebsregister(1)
    last_block, _ = load_krebsregister(10)

    # The full load yields a DataFrame and a MultiIndex of known sizes.
    self.assertEqual(type(full_data), pandas.DataFrame)
    self.assertEqual(type(full_matches), pandas.MultiIndex)
    self.assertEqual(len(full_data), 5749132)
    self.assertEqual(len(full_matches), 20931)

    # Individually loaded blocks must be non-empty.
    self.assertGreater(len(first_block), 0)
    self.assertGreater(len(last_block), 0)

    # Block 11 is expected to be rejected with a ValueError.
    with self.assertRaises(ValueError):
        load_krebsregister(11)

    # With missing_values=0 no null entries may remain in the data.
    filled_block, _ = load_krebsregister(10, missing_values=0)
    null_total = filled_block.isnull().sum().sum()
    self.assertEqual(null_total, 0)
def test_krebs_dataset_download():
    """Check that loading after clearing the data home re-creates the files.

    The data home is cleared first; the test then verifies that all ten
    block archives exist afterwards and that the loaded objects have the
    expected types and sizes.
    """
    # Remove downloaded datasets so the load below starts from scratch.
    clear_data_home()

    krebs_data, krebs_matches = load_krebsregister()

    for i in range(1, 11):
        block_file = Path(
            get_data_home(), "krebsregister", "block_{}.zip".format(i)
        )
        assert block_file.is_file()

    # NOTE: ``assert type(x), T`` never fails, because ``T`` is only the
    # assertion message and ``type(x)`` is always truthy. Check the type
    # explicitly instead.
    assert isinstance(krebs_data, pandas.DataFrame)
    assert isinstance(krebs_matches, pandas.MultiIndex)

    # Count the number of records.
    assert len(krebs_data) == 5749132
    assert len(krebs_matches) == 20931
def test_krebs_dataset(self):
    """Check types and sizes of the full dataset and of single blocks.

    Also verifies that an unknown block number raises ``ValueError`` and
    that ``missing_values=0`` leaves no null entries in the data.
    """
    krebs_data, krebs_matches = load_krebsregister()
    krebs_data_block1, _ = load_krebsregister(1)
    krebs_data_block10, _ = load_krebsregister(10)

    # NOTE: ``assert type(x), T`` never fails, because ``T`` is only the
    # assertion message and ``type(x)`` is always truthy. Check the type
    # explicitly instead.
    assert isinstance(krebs_data, pandas.DataFrame)
    assert isinstance(krebs_matches, pandas.MultiIndex)

    # Count the number of records.
    assert len(krebs_data) == 5749132
    assert len(krebs_matches) == 20931
    assert len(krebs_data_block1) > 0
    assert len(krebs_data_block10) > 0

    # Loading a block that does not exist must fail.
    with pytest.raises(ValueError):
        load_krebsregister(11)

    # With missing_values=0 no null entries may remain in the data.
    krebs_block10, _ = load_krebsregister(10, missing_values=0)
    assert krebs_block10.isnull().sum().sum() == 0
def test_krebs_shuffle():
    """Smoke test: block 10 loads with ``shuffle=False`` and yields a pair."""
    # No assertions on the content; the call and the two-way unpacking
    # must simply succeed.
    _data, _links = load_krebsregister(10, shuffle=False)
def test_krebs_missings():
    """Block 10 loaded with ``missing_values=0`` must contain no nulls."""
    filled_block, _ = load_krebsregister(10, missing_values=0)

    # Sum the per-column null counts into one grand total.
    null_total = filled_block.isnull().sum().sum()
    assert null_total == 0
def test_krebs_shuffle(self):
    """Smoke test: block 10 loads with ``shuffle=False`` and yields a pair."""
    # No assertions on the content; the call and the two-way unpacking
    # must simply succeed.
    _data, _links = load_krebsregister(10, shuffle=False)
def test_krebs_missings(self):
    """Block 10 loaded with ``missing_values=0`` must contain no nulls."""
    filled_block, _ = load_krebsregister(10, missing_values=0)

    # Sum the per-column null counts into one grand total.
    null_total = filled_block.isnull().sum().sum()
    self.assertEqual(null_total, 0)
# Example: fit an ECM classifier on the Krebsregister data and compare its
# predictions with the known true links.
import recordlinkage as rl
from recordlinkage.datasets import load_krebsregister

# Load the data with missing_values=0, together with the true links.
krebs_X, krebs_true_links = load_krebsregister(missing_values=0)
print(krebs_true_links)

# Train the classifier and predict in a single step.
ecm = rl.ECMClassifier(binarize=0.8)
result_ecm = ecm.fit_predict(krebs_X)

# The value is discarded when run as a script (presumably kept from an
# interactive session where it would be echoed).
len(result_ecm)

# Confusion matrix of true links versus predictions over all loaded rows.
n_total = len(krebs_X)
conf_matrix = rl.confusion_matrix(krebs_true_links, result_ecm, n_total)
print(conf_matrix)

# The F-score for this classification.
f_score = rl.fscore(krebs_true_links, result_ecm)
print(f_score)

print(ecm.log_weights)