Example #1
0
    def test_binarize_input(self):
        """Smoke-test ``ECMClassifier(binarize=True)`` on scaled vectors.

        Both the train and the test comparison vectors are multiplied
        element-wise by ``np.random.rand`` noise before being handed to the
        classifier. The test passes when fitting and predicting raise
        nothing; no estimates are checked.
        """
        m = np.array([1, .81, .85, .81, .85, .81])
        u = np.array([1, .23, .50, .23, .30, 0.13])
        simulation = dict(m=m, u=u, random_state=535, return_links=True)

        # Train dataset, scaled by random noise.
        X_train, _ = binary_vectors(1000, 500, **simulation)
        train_noise = np.random.rand(*X_train.shape)
        X_train = X_train * train_noise

        # Test dataset, scaled by fresh random noise.
        X_test, _ = binary_vectors(1000, 500, **simulation)
        test_noise = np.random.rand(*X_test.shape)
        X_test = X_test * test_noise

        classifier = rl.ECMClassifier(binarize=True)
        classifier.fit(X_train)
        classifier.predict(X_test)
Example #2
0
    def test_ecm_atol_none(self):
        """Check the ECM estimates for a never-agreeing feature with ``atol=None``.

        The first feature is simulated with ``u = 0``. After fitting, the
        estimated u probability of level 1 for ``c_1`` must be close to 0
        and that of level 0 close to 1.
        """
        m = np.array([0.95, .81, .85, .81, .85, .81])
        u = np.array([0, .23, .50, .23, .30, 0.13])
        simulation = dict(m=m, u=u, random_state=535, return_links=True)

        # Train dataset: 10000 pairs, 500 of them matches.
        X_train, _ = binary_vectors(10000, 500, **simulation)

        # Test dataset: 1000 pairs, 500 of them matches.
        X_test, _ = binary_vectors(1000, 500, **simulation)

        classifier = rl.ECMClassifier(atol=None)
        classifier.fit(X_train)
        classifier.predict(X_test)

        u_first = classifier.u_probs['c_1']
        assert math.isclose(u_first[1], 0.0, abs_tol=1e-3)
        assert math.isclose(u_first[0], 1.0, abs_tol=1e-3)
Example #3
0
    def render_bin_test_data(cls,
                             n_pairs_train=5000,
                             n_matches_train=1000,
                             n_pairs_test=50000,
                             n_matches_test=10000):
        """Generate labelled binary train/test comparison data on ``cls``.

        Sets the following class attributes:

        - ``m``, ``u``: the per-feature probabilities used for simulation.
        - ``labels``: the seven feature names.
        - ``X_train``, ``y_train``: simulated comparison vectors and the
          links returned by ``binary_vectors`` for the train set.
        - ``X_test``, ``y_test``: the same for the test set.

        The columns of both ``X_train`` and ``X_test`` are renamed to
        ``labels``.

        Parameters
        ----------
        n_pairs_train, n_pairs_test : int
            Number of record pairs to simulate for each set.
        n_matches_train, n_matches_test : int
            Number of matches among those pairs.
        """
        cls.m = np.array([.92, .81, .85, .90, .99, .70, .56])
        cls.u = np.array([.19, .23, .50, .11, .20, .14, .50])

        cls.labels = [
            'name', 'second_name', 'surname', 'dob', 'street', 'state',
            'zipcode'
        ]

        # Create the train dataset.
        cls.X_train, cls.y_train = binary_vectors(n_pairs_train,
                                                  n_matches_train,
                                                  m=cls.m,
                                                  u=cls.u,
                                                  random_state=535,
                                                  return_links=True)

        cls.X_train.columns = cls.labels

        # Create the test dataset.
        cls.X_test, cls.y_test = binary_vectors(n_pairs_test,
                                                n_matches_test,
                                                m=cls.m,
                                                u=cls.u,
                                                random_state=535,
                                                return_links=True)

        # The feature labels belong on the comparison vectors (X), exactly
        # as for the train set; y_test holds the links, not the features.
        cls.X_test.columns = cls.labels
Example #4
0
    def test_ecm_init_jaro_1value(self):
        """Check ECM with ``init='jaro'`` when one feature has a single value.

        The first feature is simulated with ``m = u = 1.0``. The test
        expects level 0 to be absent from ``m_probs['c_1']`` (``KeyError``)
        and the remaining estimates to be close to the simulation
        parameters.
        """
        m = np.array([1.0, 0.85, .85, .81, .85, .81])
        u = np.array([1.0, .10, .50, .23, .30, 0.13])

        # Train dataset: 1000 pairs, 500 of them matches.
        X_train, _ = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        classifier = rl.ECMClassifier(init='jaro')
        classifier.fit(X_train)
        classifier.predict(X_train)

        # Level 0 of the first feature must not be present.
        with pytest.raises(KeyError):
            classifier.m_probs['c_1'][0]

        m_est = classifier.m_probs
        u_est = classifier.u_probs
        assert math.isclose(m_est['c_1'][1], 1.0, abs_tol=0.01)
        assert math.isclose(m_est['c_2'][1], 0.85, abs_tol=0.08)
        assert math.isclose(u_est['c_1'][1], 1.0, abs_tol=0.01)
        assert math.isclose(u_est['c_2'][1], 0.1, abs_tol=0.05)
        assert math.isclose(classifier.p, 0.5, abs_tol=0.05)
def make_fake_data(n1, n2, pM, pML, pUL, randState=113):
    """Simulate labelled comparison vectors for ``n1 * n2`` record pairs.

    The vectors are drawn with ``datasets.binary_vectors``. The resulting
    table is returned and also written to ``Gamma_nMatch<k>_L<L>.csv`` in
    the current working directory, where ``k = int(pM * n1 * n2)`` and
    ``L = len(pML)``.

    Parameters
    ----------
    n1, n2 : int
        Pair identifiers are generated as ``i in range(n2)`` (each repeated
        ``n1`` times) and ``j in range(n1)`` (cycled ``n2`` times).
    pM : float
        Fraction of pairs that are matches.
    pML, pUL : sequence of float
        Passed as ``m`` and ``u`` to ``binary_vectors``.
    randState : int, optional
        Passed as ``random_state`` to ``binary_vectors``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gamma`` (one array per pair), ``i``, ``j`` and ``match``,
        with a fresh integer index.
    """
    nPair = n1 * n2
    nMatch = int(pM * nPair)
    L = len(pML)

    # With return_links=True the call yields a (vectors, links) pair.
    # Unpack it directly: wrapping the pair in np.array() builds a ragged
    # array, and the column assignments below need the DataFrame itself.
    gamma, links = datasets.binary_vectors(nPair,
                                           nMatch,
                                           m=pML,
                                           u=pUL,
                                           random_state=randState,
                                           return_links=True)

    # Flag the pairs listed in ``links`` as matches.
    gamma['match'] = False
    gamma.loc[links, 'match'] = True
    matches = gamma['match']

    # Make pair identifiers.
    iVals = [i for i in range(n2) for _ in range(n1)]
    jVals = list(range(n1)) * n2

    # NOTE(review): only the first three features are stored in 'gamma',
    # although the file name advertises L features -- confirm intent.
    Gamma = pd.DataFrame({
        'gamma': list(gamma[['c_1', 'c_2', 'c_3']].values),
        'i': iVals,
        'j': jVals,
        'match': matches
    })
    Gamma = Gamma.reset_index(drop=True)

    ext = 'nMatch' + str(nMatch) + '_L' + str(L)
    Gamma.to_csv('Gamma_' + ext + '.csv', mode='w')
    return Gamma
Example #6
0
    def test_sklearn_preinit(self):
        """Check a ``LabelBinarizer`` with preset classes keeps both of them.

        ``classes_`` is assigned by hand instead of being learned with
        ``fit``. After transforming the second feature column, the
        binarizer must still report two classes.
        """
        m = np.array([1.0, .81, .85, .81, .85, .81])
        u = np.array([1.0, .23, .50, .23, .30, 0.13])

        # Train dataset: 1000 pairs, 500 of them matches.
        X_train, _ = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        # Pre-initialise the binarizer with the two expected classes.
        label_binarizer = LabelBinarizer()
        label_binarizer.classes_ = np.array([0, 1])

        second_feature = X_train.iloc[:, 1]
        label_binarizer.transform(second_feature)
        assert len(label_binarizer.classes_) == 2
Example #7
0
    def test_random_comparison_vectors(self):
        """Check the type, index type and length of a simulated dataset."""
        n_record_pairs, n_matches = 10000, 500
        n_features = 8

        vectors = binary_vectors(
            n_record_pairs,
            n_matches,
            m=[0.8] * n_features,
            u=[0.2] * n_features,
            random_state=535,
        )

        # The result must be a DataFrame indexed by a MultiIndex.
        self.assertIsInstance(vectors, pandas.DataFrame)
        self.assertIsInstance(vectors.index, pandas.MultiIndex)

        # One row per requested record pair.
        self.assertEqual(len(vectors), n_record_pairs)
Example #8
0
def make_Gamma(n1, n2, pM, pML, pUL):
    """Simulate comparison vectors for ``n1 * n2`` record pairs.

    The vectors are drawn with ``datasets.binary_vectors`` using a fixed
    ``random_state`` of 113 and ``int(pM * n1 * n2)`` matches.

    Parameters
    ----------
    n1, n2 : int
        Pair identifiers are generated as ``i in range(n2)`` (each repeated
        ``n1`` times) and ``j in range(n1)`` (cycled ``n2`` times).
    pM : float
        Fraction of pairs that are matches.
    pML, pUL : sequence of float
        Passed as ``m`` and ``u`` to ``binary_vectors``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gamma`` (one list per pair), ``i`` and ``j``.
    """
    nPair = n1 * n2
    gamma = np.array(
        datasets.binary_vectors(nPair,
                                int(pM * nPair),
                                m=pML,
                                u=pUL,
                                random_state=113))

    # Make pair identifiers.
    iVals = [i for i in range(n2) for _ in range(n1)]
    jVals = list(range(n1)) * n2

    return pd.DataFrame({
        'gamma': [list(row) for row in gamma],
        'i': iVals,
        'j': jVals
    })
Example #9
0
def test_random_comparison_vectors_1value_col():
    """Check that probabilities of 0 or 1 produce single-valued columns.

    Features simulated with ``m = u = 1`` must contain only 1, features
    with ``m = u = 0`` only 0, and the remaining features both values.
    """
    m = numpy.array([1, .81, .85, 0])
    u = numpy.array([1, .23, .50, 0])

    # Train dataset: 1000 pairs, 500 of them matches.
    X_train, y_train = binary_vectors(
        1000, 500, m=m, u=u, random_state=535, return_links=True)

    # Columns expected to hold one constant value: (position, value).
    for position, constant in ((0, 1), (3, 0)):
        observed = X_train.iloc[:, position].unique()
        assert len(observed) == 1
        assert observed[0] == constant

    # Columns expected to hold both 0 and 1.
    for position in (1, 2):
        assert len(X_train.iloc[:, position].unique()) == 2
Example #10
0
    def test_fs_column_labels(self, classifier):
        """Check that every probability table is keyed by the feature names.

        After fitting, the keys of ``m_probs``, ``u_probs``,
        ``log_m_probs`` and ``log_u_probs`` must equal the column labels of
        the training data.

        Parameters
        ----------
        classifier : callable
            Classifier class or factory; called without arguments.
        """
        m = np.array([0.95, .81, .85, .81, .85, .81])
        u = np.array([0, .23, .50, .23, .30, 0.13])

        # Create the train dataset.
        X_train, true_links = binary_vectors(1000,
                                             500,
                                             m=m,
                                             u=u,
                                             random_state=535,
                                             return_links=True)

        cl = classifier()
        # Unsupervised classifiers are fitted without the true links.
        if isinstance(cl, tuple(UNSUPERVISED_CLASSIFIERS)):
            cl.fit(X_train)
        else:
            cl.fit(X_train, true_links)

        expected_labels = set(X_train)

        assert set(cl.m_probs) == expected_labels
        assert set(cl.u_probs) == expected_labels
        assert set(cl.log_m_probs) == expected_labels
        # Was a second check of log_m_probs; log_u_probs went untested.
        assert set(cl.log_u_probs) == expected_labels
Example #11
0
    def test_ecm_init(self):
        """Check ECM with ``init='random'`` against one simulated m value.

        The fitted parameter tables are printed for inspection, then the
        estimated m probability of level 1 for ``c_2`` is compared with
        0.85 using an absolute tolerance of 0.08.
        """
        m = np.array([0.23, .81, .85, .81, .85, .81])
        u = np.array([0.34, .23, .50, .23, .30, 0.13])

        # Train dataset: 1000 pairs, 500 of them matches.
        X_train, _ = binary_vectors(
            1000, 500, m=m, u=u, random_state=535, return_links=True)

        classifier = rl.ECMClassifier(init='random')
        classifier.fit(X_train)
        classifier.predict(X_train)

        # Dump the fitted parameters to ease debugging on failure.
        print(classifier.m_probs)
        print(classifier.log_m_probs)
        print(classifier.u_probs)
        print(classifier.log_u_probs)

        estimate = classifier.m_probs['c_2'][1]
        assert math.isclose(estimate, 0.85, abs_tol=0.08)
Example #12
0
import numpy as np

import recordlinkage as rl
from recordlinkage.datasets import binary_vectors

# Simulation settings: the number of candidate links, the number of true
# links among them, and the per-feature m and u probabilities.
n_pairs, n_matches = 50000, 7000
m_simulate = np.array([.94, .81, .85, .90, .99, .70, .56, .92])
u_simulate = np.array([.19, .23, .50, .11, .20, .14, .50, .09])

# Simulate the comparison vectors with a fixed seed and keep the true links
# so that the classifier can be trained in a supervised way.
X_data, links_true = binary_vectors(n_pairs,
                                    n_matches,
                                    m=m_simulate,
                                    u=u_simulate,
                                    random_state=535,
                                    return_links=True)

# Train a NaiveBayesClassifier on the labelled data.
cl = rl.NaiveBayesClassifier()
cl.fit(X_data, links_true)

# Report the trained parameters (p, m and u, plus the log-scale tables).
# Note that the estimates are very good.
print("p probability P(Match):", cl.p)
print("m probabilities P(x_i=1|Match):", cl.m_probs)
print("u probabilities P(x_i=1|Non-Match):", cl.u_probs)
print("log m probabilities P(x_i=1|Match):", cl.log_m_probs)
print("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs)