    def test_geo(self):

        comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

        # Missing values
        result = comp.geo('age',
                          'age',
                          'age',
                          'age',
                          method='linear',
                          offset=2,
                          scale=2)

        self.assertFalse(result.isnull().all())

    def test_indexing_types(self):
        # test the two types of indexing

        # this test needs improvement

        A = DataFrame({'col': ['abc', 'abc', 'abc', 'abc', 'abc']})
        B = DataFrame({'col': ['abc', 'abc', 'abc', 'abc', 'abc']})
        B_reversed = B[::-1].copy()
        ix = MultiIndex.from_arrays([np.arange(5), np.arange(5)])

        # test with label indexing type
        comp_label = recordlinkage.Compare(indexing_type='label')
        comp_label.exact('col', 'col')
        result_label = comp_label.compute(ix, A, B_reversed)

        # test with position indexing type
        comp_position = recordlinkage.Compare(indexing_type='position')
        comp_position.exact('col', 'col')
        result_position = comp_position.compute(ix, A, B_reversed)

        assert (result_position.values == 1).all(axis=0)

        pdt.assert_frame_equal(result_label, result_position)
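
        # Editorial note (assumption, not from the source): per the
        # recordlinkage docs, 'label' resolves the MultiIndex values with
        # .loc, while 'position' treats them as .iloc positions, e.g.:
        #   B_reversed.loc[4, 'col']    # row whose index label is 4
        #   B_reversed.iloc[4]['col']   # row at integer position 4
        # The two results coincide here only because every row equals 'abc'.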

    def test_numeric_algorithms(self, alg):

        A = DataFrame({'col': [1, 1, 1, 1, 1]})
        B = DataFrame({'col': [1, 2, 3, 4, 5]})
        ix = MultiIndex.from_arrays([A.index.values, B.index.values])

        comp = recordlinkage.Compare()
        comp.numeric('col', 'col', method='step', offset=1, label='step')
        comp.numeric(
            'col', 'col', method='linear', offset=1, scale=2, label='linear')
        comp.numeric(
            'col', 'col', method='squared', offset=1, scale=2, label='squared')
        comp.numeric(
            'col', 'col', method='exp', offset=1, scale=2, label='exp')
        comp.numeric(
            'col', 'col', method='gauss', offset=1, scale=2, label='gauss')
        result_df = comp.compute(ix, A, B)

        result = result_df[alg]

        # All values between 0 and 1.
        assert (result >= 0.0).all()
        assert (result <= 1.0).all()

        if alg != 'step':

            print(alg)
            print(result)

            # sim(scale) = 0.5
            expected_bool = Series(
                [False, False, False, True, False], index=ix, name=alg)
            pdt.assert_series_equal(result == 0.5, expected_bool)

            # sim(offset) = 1
            expected_bool = Series(
                [True, True, False, False, False], index=ix, name=alg)
            pdt.assert_series_equal(result == 1.0, expected_bool)

            # sim(scale) larger than 0.5
            expected_bool = Series(
                [False, False, True, False, False], index=ix, name=alg)
            pdt.assert_series_equal((result > 0.5) & (result < 1.0),
                                    expected_bool)

            # sim(scale) smaller than 0.5
            expected_bool = Series(
                [False, False, False, False, True], index=ix, name=alg)
            pdt.assert_series_equal((result < 0.5) & (result >= 0.0),
                                    expected_bool)
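
        # Editorial note (assumption): `alg` is presumably supplied by a
        # pytest parametrize decorator over the numeric methods, e.g.
        #   @pytest.mark.parametrize(
        #       "alg", ["step", "linear", "squared", "exp", "gauss"])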

    def test_compare_custom_vectorized_dedup(self):

        A = DataFrame({'col': ['abc', 'abc', 'abc', 'abc', 'abc']})
        ix = MultiIndex.from_arrays([[1, 2, 3, 4, 5], [2, 3, 4, 5, 1]])

        # test without label
        comp = recordlinkage.Compare()
        comp.compare_vectorized(lambda s1, s2: np.ones(len(s1), dtype=int),
                                'col', 'col')
        result = comp.compute(ix, A)
        expected = DataFrame([1, 1, 1, 1, 1], index=ix)
        pdt.assert_frame_equal(result, expected)

        # test with label
        comp = recordlinkage.Compare()
        comp.compare_vectorized(
            lambda s1, s2: np.ones(len(s1), dtype=int),
            'col',
            'col',
            label='test')
        result = comp.compute(ix, A)
        expected = DataFrame([1, 1, 1, 1, 1], index=ix, columns=['test'])
        pdt.assert_frame_equal(result, expected)

    def test_parallel_comparing(self):

        # use single job
        comp = recordlinkage.Compare(n_jobs=1)
        comp.exact('given_name', 'given_name', label='my_feature_label')
        result_single = comp.compute(self.index_AB, self.A, self.B)
        result_single.sort_index(inplace=True)

        # use two jobs
        comp = recordlinkage.Compare(n_jobs=2)
        comp.exact('given_name', 'given_name', label='my_feature_label')
        result_2processes = comp.compute(self.index_AB, self.A, self.B)
        result_2processes.sort_index(inplace=True)

        # use four jobs
        comp = recordlinkage.Compare(n_jobs=4)
        comp.exact('given_name', 'given_name', label='my_feature_label')
        result_4processes = comp.compute(self.index_AB, self.A, self.B)
        result_4processes.sort_index(inplace=True)

        # compare results
        pdt.assert_frame_equal(result_single, result_2processes)
        pdt.assert_frame_equal(result_single, result_4processes)

    def test_numeric_does_not_exist(self):
        """
        Raise error if the algorithm doesn't exist.
        """

        A = DataFrame({'col': [1, 1, 1, nan, 0]})
        B = DataFrame({'col': [1, 1, 1, nan, nan]})
        ix = MultiIndex.from_arrays([A.index.values, B.index.values])

        comp = recordlinkage.Compare()

        with self.assertRaises(ValueError):
            comp.numeric('col', 'col', method='unknown_algorithm')
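            # the invalid method is registered without validation; the
            # ValueError is raised by compute() below (cf.
            # test_geo_does_not_exist, which asserts the error from compute)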
            comp.compute(ix, A, B)

    def test_repr(self):

        comp = recordlinkage.Compare()
        comp.exact('given_name', 'given_name')
        comp.string('given_name', 'given_name', method='jaro')
        comp.numeric('age', 'age', method='step', offset=3, origin=2)
        comp.numeric('age', 'age', method='step', offset=0, origin=2)

        c_str = str(comp)
        c_repr = repr(comp)
        assert c_str == c_repr

        start_str = '<{}'.format(comp.__class__.__name__)
        assert c_str.startswith(start_str)

def ProcessData(patientDataList, fetchedHospitalData):
    # Read from the directory
    filelist = pd.read_csv(
        '/home/bizzzzzzzzzzzzu/Music/MedicalPortal/MedicPortal DataProcessing/FetchedData/'
        + fetchedHospitalData)

    # Indexation step
    indexer = p.Index()
    indexer.add(Block(left_on='fatherName', right_on='fatherName'))
    candidate_links = indexer.index(patientDataList, filelist)

    # print((candidate_links))

    # Comparison step
    compare_cl = p.Compare()

    # compare_cl.exact('_id','_id',label='_id')
    compare_cl.exact('name', 'name', label='name')
    compare_cl.exact('fatherName', 'fatherName', label='fatherName')
    compare_cl.exact('grandFatherName',
                     'grandFatherName',
                     label='grandFatherName')
    compare_cl.exact('gender', 'gender', label='gender')
    compare_cl.exact('dateOfBirth', 'dateOfBirth', label='dateOfBirth')
    compare_cl.exact('dayOfBirth', 'dayOfBirth', label='dayOfBirth')
    compare_cl.exact('monthOfBirth', 'monthOfBirth', label='monthOfBirth')
    compare_cl.exact('yearOfBirth', 'yearOfBirth', label='yearOfBirth')
    compare_cl.exact('age', 'age', label='age')
    # compare_cl.exact('address','address',label='address')
    # compare_cl.exact('phoneNumber','phoneNumber',label='phoneNumber')

    features = compare_cl.compute(candidate_links, patientDataList, filelist)

    if features.empty:
        return None
    else:

        # Classification step
        '''
            Use the KMeans Classifier
            This classifier is equivalent to the Unsupervised record linkage approach
        '''

        # # classifier = p.LogisticRegressionClassifier(coefficients=coefficients,intercept=intercept)
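        # Editorial note (assumption): `golden_pairs` and
        # `golden_matches_index` are not defined in this snippet; they are
        # presumably module-level training data, i.e. feature vectors
        # computed over manually verified links together with the
        # MultiIndex of true matches.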
        classifier = p.LogisticRegressionClassifier()
        classifier.fit(golden_pairs, golden_matches_index)

        links = classifier.predict(features)

        return links

    def time_global_large(self):

        c_compare = rl.Compare(self.pairs_large, self.A, self.B)
        c_compare.string('given_name', 'given_name', method='jaro')
        c_compare.string('surname',
                         'surname',
                         method='jarowinkler',
                         threshold=0.85)
        c_compare.date('date_of_birth', 'date_of_birth')
        c_compare.exact('suburb', 'suburb')
        c_compare.exact('state', 'state')
        c_compare.string('address_1',
                         'address_1',
                         method='levenshtein',
                         threshold=0.85)

    def test_date_incorrect_dtype(self):

        A = DataFrame({
            'col':
            ['2005/11/23', nan, '2004/11/23', '2010/01/10', '2010/10/30']
        })
        B = DataFrame({
            'col': [
                '2005/11/23', '2010/12/31', '2005/11/23', '2010/10/01',
                '2010/9/30'
            ]
        })
        ix = MultiIndex.from_arrays([A.index.values, B.index.values])

        A['col1'] = to_datetime(A['col'])
        B['col1'] = to_datetime(B['col'])

        comp = recordlinkage.Compare()
        comp.date('col', 'col1')
        self.assertRaises(ValueError, comp.compute, ix, A, B)

        comp = recordlinkage.Compare()
        comp.date('col1', 'col')
        self.assertRaises(ValueError, comp.compute, ix, A, B)

    def test_memory_low(self):

        c = recordlinkage.Compare(self.index_AB,
                                  self.A,
                                  self.B,
                                  low_memory=True)

        c.numeric('age', 'age', method='linear', offset=0, scale=3)
        c.exact('given_name', 'given_name')
        c.string('lastname', 'lastname', method='levenshtein')
        c.string('street', 'street', method='levenshtein')

        nrows, ncols = c.vectors.shape
        self.assertEqual(nrows, len(self.index_AB))
        self.assertEqual(ncols, 4)

def compute_record_linkage(df_full):
    """Given the fully-concatenated table of records
    calculate which pairs are address-matches using
    fuzzy matching on street name, unit number and zipcode
    """

    print("Setting up blocking for pairwise comparisons")
    blocking_indices = [
        recordlinkage.BlockIndex(on="APN (int)"),
        recordlinkage.BlockIndex(
            on=["Address Number (float)", "Zip Code (int)"]),
    ]

    print("Finding blocked pairs")
    pairs = None
    for df_subset in tqdm(np.array_split(df_full, 10)):
        for bi in blocking_indices:
            _new_pairs = bi.index(df_full, df_subset)

            if pairs is not None:
                pairs = pairs.union(_new_pairs)
            else:
                pairs = _new_pairs

    print("Setting up similarity calculations")
    compare_cl = recordlinkage.Compare()

    compare_cl.exact('APN (int)', 'APN (int)', label='APN')
    compare_cl.exact('Zip Code (int)', 'Zip Code (int)', label='Zip')
    compare_cl.exact('Address Number (float)',
                     'Address Number (float)',
                     label='number')

    #compare_cl.numeric('Address Number (float)', 'Address Number (float)',
    #                   offset=3, scale=2,
    #                   label='number')

    compare_cl.string('Street Name',
                      'Street Name',
                      method='levenshtein',
                      threshold=0.9,
                      label='street')

    print("Calculating similarities")
    features = compare_cl.compute(pairs, df_full)
    features.to_pickle(FEATURES_OUTPUT_FILE)

    return features
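

# Editorial follow-up sketch (assumption, not part of the original): feature
# vectors like the ones returned above are typically classified by summing
# the per-field scores and keeping pairs that agree on most fields.
def classify_matches(features, min_score=3):
    """Return candidate pairs whose summed comparison score reaches min_score."""
    return features[features.sum(axis=1) >= min_score]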

    def __init__(self,
                 data,
                 new_data,
                 name,
                 n_sample=50000,
                 bin_size=500,
                 block=None,
                 method='jarowinkler',
                 threshold=0.93):
        super(Linkage, self).__init__()
        self.data = data
        self.new_data = pd.DataFrame(new_data)
        self.n_sample = n_sample
        self.bin_size = bin_size
        self.name = name
        self.block = block
        self.method = method
        self.threshold = threshold

        self.sample = data.sample(n=self.n_sample,
                                  replace=False,
                                  random_state=0)

        if self.block is not None:
            self.indexer = recordlinkage.BlockIndex(on=self.block)
        else:
            self.indexer = recordlinkage.FullIndex()
        self.compare_cl = recordlinkage.Compare(n_jobs=4)
        self.compare_cl.string(self.name,
                               self.name,
                               method=self.method,
                               threshold=self.threshold,
                               label=self.name)

        if self.new_data.empty:
            self.List = list(
                itertools.combinations(
                    np.split(self.sample,
                             indices_or_sections=round(
                                 self.n_sample / self.bin_size, 0)), 2))
        else:
            self.List = list(
                np.split(self.sample,
                         indices_or_sections=round(
                             self.n_sample / self.bin_size, 0)))
        self.results_df = pd.DataFrame(
            columns=['pairs', 'company_1', 'company_2'])
        self.results_tmp = None

    def test_dates_with_missings(self):
        """
        Test:
            - Default value
            - numeric value
            - numpy.nan
            - string value
        """

        self.A['test_dates'] = pandas.to_datetime(
            ['2005/11/23', np.nan, '2004/11/23', '2010/01/10', '2010/10/30'])
        self.B['test_dates'] = pandas.to_datetime([
            '2005/11/23', '2010/12/31', '2005/11/23', '2010/10/01', '2010/9/30'
        ])

        comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

        # Missing values as default
        print("Missing values as default")
        result = comp.date('test_dates', 'test_dates')
        expected = pandas.Series([1, 0, 0, 0.5, 0.5], index=self.index_AB)
        pdt.assert_series_equal(result, expected)

        # Missing values as 0
        print("Missing values as 0")
        result = comp.date('test_dates', 'test_dates', missing_value=0)
        expected = pandas.Series([1, 0, 0, 0.5, 0.5], index=self.index_AB)
        pdt.assert_series_equal(result, expected)

        # Missing values as 123.45
        print("Missing values as 123.45 (float)")
        result = comp.date('test_dates', 'test_dates', missing_value=123.45)
        expected = pandas.Series([1, 123.45, 0, 0.5, 0.5], index=self.index_AB)
        pdt.assert_series_equal(result, expected)

        # Missing values as nan
        print("Missing values as numpy.nan")
        result = comp.date('test_dates', 'test_dates', missing_value=nan)
        expected = pandas.Series([1, nan, 0, 0.5, 0.5], index=self.index_AB)
        pdt.assert_series_equal(result, expected)

        # Missing values as string
        print("Missing values as string")
        result = comp.date('test_dates', 'test_dates', missing_value='str')
        expected = pandas.Series([1, 'str', 0, 0.5, 0.5],
                                 index=self.index_AB,
                                 dtype=object)
        pdt.assert_series_equal(result, expected)

    def test_geo_does_not_exist(self):

        # Utrecht, Amsterdam, Rotterdam (Cities in The Netherlands)
        A = DataFrame({
            'lat': [52.0842455, 52.3747388, 51.9280573],
            'lng': [5.0124516, 4.7585305, 4.4203581]
        })
        B = DataFrame({
            'lat': [52.3747388, 51.9280573, 52.0842455],
            'lng': [4.7585305, 4.4203581, 5.0124516]
        })
        ix = MultiIndex.from_arrays([A.index.values, B.index.values])

        comp = recordlinkage.Compare()
        comp.geo('lat', 'lng', 'lat', 'lng', method='unknown')
        self.assertRaises(ValueError, comp.compute, ix, A, B)

    def compare_builder(self):
        compare_booths = recordlinkage.Compare()

        # Get Levenshtein score for booth name
        compare_booths.string(self.locality_col,self.locality_col, method='levenshtein', threshold=None, label='name_lv_score')

        # Get Jaro-Winkler Score for booth name
        compare_booths.string(self.locality_col, self.locality_col, method='jarowinkler', threshold=None, label='name_jw_score')

        # Get Levenshtein score for the metaphone encoding (how the name is
        # pronounced)
        compare_booths.string('metaphone', 'metaphone', method='levenshtein', threshold=None, label='metaphone')

        # Get score for how far apart the booth numbers are
        compare_booths.numeric(self.booth_num_col, self.booth_num_col, label='booth_number_score', method="gauss", offset=3, scale=5)

        self.compare_booths = compare_booths
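
        # Hypothetical usage (assumption, not from the source): once built,
        # the comparator computes feature vectors over candidate pairs, e.g.
        #   features = self.compare_booths.compute(pairs, booths_df)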

    def test_random_desc(self):

        df_a = pd.DataFrame({'v': list("abcde")})
        df_b = pd.DataFrame({'v': list("abcde")})

        pairs = Full().index(df_a, df_b)

        c = recordlinkage.Compare()
        c.exact("v", "v")
        c.add(RandomDiscrete(label='random'))
        cv = c.compute(pairs, df_a, df_b)

        assert isinstance(cv, pd.DataFrame)

        assert cv['random'].notnull().all()
        assert cv['random'].isin([0, 1]).all()

    def test_dates_with_missings(self):

        A = DataFrame({
            'col':
            to_datetime(
                ['2005/11/23', nan, '2004/11/23', '2010/01/10', '2010/10/30'])
        })
        B = DataFrame({
            'col':
            to_datetime([
                '2005/11/23', '2010/12/31', '2005/11/23', '2010/10/01',
                '2010/9/30'
            ])
        })
        ix = MultiIndex.from_arrays([A.index.values, B.index.values])

        comp = recordlinkage.Compare()
        comp.date('col', 'col', label='m_')
        comp.date('col', 'col', missing_value=0, label='m_0')
        comp.date('col', 'col', missing_value=123.45, label='m_float')
        comp.date('col', 'col', missing_value=nan, label='m_na')
        comp.date('col', 'col', missing_value='str', label='m_str')
        result = comp.compute(ix, A, B)

        # Missing values as default
        expected = Series([1, 0, 0, 0.5, 0.5], index=ix, name='m_')
        pdt.assert_series_equal(result['m_'], expected)

        # Missing values as 0
        expected = Series([1, 0, 0, 0.5, 0.5], index=ix, name='m_0')
        pdt.assert_series_equal(result['m_0'], expected)

        # Missing values as 123.45
        expected = Series([1, 123.45, 0, 0.5, 0.5], index=ix, name='m_float')
        pdt.assert_series_equal(result['m_float'], expected)

        # Missing values as nan
        expected = Series([1, nan, 0, 0.5, 0.5], index=ix, name='m_na')
        pdt.assert_series_equal(result['m_na'], expected)

        # Missing values as string
        expected = Series([1, 'str', 0, 0.5, 0.5],
                          index=ix,
                          dtype=object,
                          name='m_str')
        pdt.assert_series_equal(result['m_str'], expected)

def record_link_schools():
    """
	This function performs record linkage on two dataframes: the critical
	mass dataframe and the retention rates dataframe. The record linkage
	is condicted on the name of the school. 

	Input: None
	Output:
		- link: a tuple containing the indices of retention dataframe
                and the critical mass dataframe; AND the best matches qgram scores
	"""
    critical_mass_df = calculate_critical_mass_var()
    retention_df = import_cleaned_retention(RETENTION_CLEAN)

    # set thresholds for comparing strings using qgram method
    school_name_thresh = 0.85

    # initialize a Record Linkage comparison object
    compare = rl.Compare()
    indexer = rl.FullIndex()  # No blocking available
    compare.string('school',
                   'school',
                   method='qgram',
                   threshold=school_name_thresh,
                   label='school_name_score')

    # make pairs
    pairs = indexer.index(retention_df, critical_mass_df)

    # compute record linkage scores
    features = compare.compute(pairs, retention_df, critical_mass_df)

    # set classification threshold
    school_name_classif_thresh = 1.0

    # Classification & Final Filtering
    best_matches = features[(features['school_name_score'] >=
                             school_name_classif_thresh)]

    # obtain the index values from best_matches
    index_array = best_matches.index.values

    # create tuple of indices and best matches df
    link = (index_array, best_matches)

    return link
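
# Editorial usage sketch (assumption, not from the source):
#   index_array, best_matches = record_link_schools()
#   print(best_matches['school_name_score'].describe())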

def start_rl(df):
    """
    Detect duplicate records in the DataFrame and compute an overall score
    per record pair
    :param df: DataFrame with the input data
    :return: two DataFrames: the results of all comparisons performed, and
        the hits with a score >= .99
    """

    # Indexation step
    indexer = rl.index.SortedNeighbourhood('MESSZEIT',
                                           window=5,
                                           block_on=['KENNUNG'])
    pairs = indexer.index(df)

    # Comparison step
    comparer = rl.Compare()
    comparer.exact('MESSZEIT', 'MESSZEIT', label='messzeit')
    comparer.exact('KENNUNG', 'KENNUNG', label='kennung')
    comparer.numeric('GEOGR_BREITE',
                     'GEOGR_BREITE',
                     method=u'gauss',
                     offset=0.0,
                     label='geogr_breite')
    comparer.numeric('GEOGR_LAENGE',
                     'GEOGR_LAENGE',
                     method=u'gauss',
                     offset=0.0,
                     label='geogr_laenge')
    comparer.numeric('HORIZONTALE_SICHT',
                     'HORIZONTALE_SICHT',
                     method='linear',
                     offset=10.0,
                     missing_value=1,
                     label='horizontale_sicht')
    comparer.add(CompareWetter('WETTER', 'WETTER', label='wetter2'))

    compared = comparer.compute(pairs, df)

    # overall score per record pair, as a percentage
    pcmax = compared.shape[1]  # col_count, 100%
    compared.loc[:, 'Score'] = 1 - (abs(compared.sum(axis=1) - pcmax) / pcmax)

    # Classification step
    matches = compared[(compared.messzeit == 1) & (compared.kennung == 1) &
                       (compared.Score >= .991)]

    return compared, matches
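
# Editorial usage sketch (assumption, not from the source):
#   compared, matches = start_rl(df)
#   print('{} pairs compared, {} classified as matches'.format(
#       len(compared), len(matches)))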

    def test_random_cont(self):

        df_a = pd.DataFrame({'v': list("abcde")})
        df_b = pd.DataFrame({'v': list("abcde")})

        pairs = Full().index(df_a, df_b)

        c = recordlinkage.Compare()
        c.exact("v", "v")
        c.add(RandomContinuous(label='random'))
        cv = c.compute(pairs, df_a, df_b)

        assert isinstance(cv, pd.DataFrame)

        assert cv['random'].notnull().all()
        assert cv['random'].min() >= 0.0
        assert cv['random'].max() <= 1.0

    def _test_logistic_transh(self, dataset, params):
        """Note: Zero aligned pairs are returned, require fixation."""
        model = dataset()
        logger = get_logger('RL.Test.LogisticTransH.' + str(model))
        entity, relation, triples, entity_pairs, true_pairs = model.get_er_model(
        )
        transh = TransH(entity,
                        relation,
                        triples,
                        entity_pairs,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'])
        loss = transh.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transh.get_ent_embeddings()
        ent_embeddings = [
            np.array(ent_embeddings[i]) for i in range(ent_embeddings.shape[0])
        ]
        trainDataA = pd.DataFrame(data=ent_embeddings)
        trainDataB = pd.DataFrame(data=ent_embeddings)

        compare_cl = recordlinkage.Compare()
        for i in range(0, params['dimension']):
            compare_cl.numeric(i, i, label=str(i), method='gauss')

        candidate_links = pd.MultiIndex.from_tuples(entity_pairs)
        features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
        logger.info("Features %s", str(features.describe()))

        logrg = recordlinkage.LogisticRegressionClassifier()
        logrg.fit(features, true_pairs)

        result = logrg.predict(features)
        log_quality_results(logger, result, true_pairs, len(entity_pairs))

        prob_series = logrg.prob(features)
        prob = [(1 - p) for p in prob_series.tolist()]
        result_prob = [(entity_pairs[i][0], entity_pairs[i][1], prob[i])
                       for i in range(0, len(prob))]
        ir_metrics = InformationRetrievalMetrics(result_prob, true_pairs)
        ir_metrics.log_metrics(logger, params)

def processing(df, sourceid):
    if sourceid == 1:
        postal_indexer = Block('PostCodeKey')
        postal_pairs = postal_indexer.index(df)
        for i in [20, 40, 60, 80, 100]:
            if (len(postal_pairs) / i) < 1000000:
                intervalparts = i
                break
            else:
                intervalparts = 100
        # Get interval parts
        inter = intervals(intervalparts, len(postal_pairs))

        comp_postal = recordlinkage.Compare(n_jobs=20)
        comp_postal.string('BusinessNameKey',
                           'BusinessNameKey',
                           method='jarowinkler',
                           label='BusinesNameCompare')
        comp_postal.string('TradestyleKey',
                           'BusinessNameKey',
                           method='jarowinkler',
                           label='BNTSCompare')
        comp_postal.string('AddressKey',
                           'AddressKey',
                           method='jarowinkler',
                           label='AddressCompare')

        cv_full = comp_postal.compute(postal_pairs[0:inter[1]], df)
        cv_full = cv_full[
            ((cv_full.BusinesNameCompare.between(0.95, 1, inclusive=True))
             | (cv_full.BNTSCompare.between(0.95, 1, inclusive=True)))
            & (cv_full.AddressCompare.between(0.95, 1, inclusive=True))]
        for i in range(1, len(inter) - 1):
            cv = comp_postal.compute(postal_pairs[inter[i] + 1:inter[i + 1]],
                                     df)
            cv = cv[((cv.BusinesNameCompare.between(0.95, 1, inclusive=True))
                     | (cv.BNTSCompare.between(0.95, 1, inclusive=True)))
                    & (cv.AddressCompare.between(0.95, 1, inclusive=True))]
            frames = [cv_full, cv]
            cv_full = pd.concat(frames)
            del cv

        # print(df.columns)
        # print(cv_full.columns)
        return df, cv_full

    def test_instance_dedup(self):

        comp = recordlinkage.Compare()
        comp.string('given_name', 'given_name', method='jaro')
        comp.numeric('age', 'age', method='step', offset=3, origin=2)
        comp.numeric('age', 'age', method='step', offset=0, origin=2)
        result = comp.compute(self.index_AB, self.A)

        # returns a DataFrame
        assert isinstance(result, DataFrame)

        # resulting frame has a MultiIndex
        assert isinstance(result.index, MultiIndex)

        # index names are ok
        assert result.index.names == [self.A.index.name, self.B.index.name]

        assert len(result) == len(self.index_AB)

def generating_pairs_linkage_data():
    indexer = recordlinkage.Index()
    indexer.block('cuisine_type')
    pairs = indexer.index(restaurants, restaurants_new)

    comp_cl = recordlinkage.Compare()
    comp_cl.exact('city', 'city', label='city')
    comp_cl.exact('cuisine_type', 'cuisine_type', label='cuisine_type')
    comp_cl.string('name', 'name', label='name', threshold=0.8)

    potential_matches = comp_cl.compute(pairs, restaurants, restaurants_new)
    print(potential_matches)

    matches = potential_matches[potential_matches.sum(axis=1) >= 3]
    matching_indices = matches.index.get_level_values(1)
    non_dup = restaurants_new[~restaurants_new.index.isin(matching_indices)]
    full_restaurants = restaurants.append(non_dup)
    print(full_restaurants)

    def test_string_algorithms(self, alg):

        A = DataFrame(
            {'col': [u'str_abc', u'str_abc', u'str_abc', nan, u'hsdkf']})
        B = DataFrame({'col': [u'str_abc', u'str_abd', u'jaskdfsd', nan, nan]})
        ix = MultiIndex.from_arrays([A.index.values, B.index.values])

        comp = recordlinkage.Compare()
        comp.string('col', 'col', method=alg, missing_value=0)
        result = comp.compute(ix, A, B)[0]

        self.assertFalse(result.isnull().any())

        self.assertTrue((result >= 0).all())
        self.assertTrue((result <= 1).all())

        self.assertTrue((result > 0).any())
        self.assertTrue((result < 1).any())

    def get_comparision_object(self):
        """
            Builds the Comparison Object for six fields.
            JaroWinkler Distance for Name, Surname & relation.
            Exact Match for YOB, Civil status and occupation.
            :return : compare_cl
            :rtype : recordlinkage.Compare
        """
        compare_cl = recordlinkage.Compare()

        fname = census_field_map[self.census_location][CensusFields.FIRST_NAME]
        compare_cl.string(fname,
                          fname,
                          method='jarowinkler',
                          threshold=0.85,
                          label='normalizedName')

        sname1 = census_field_map[self.census_location][CensusFields.SURNAME_1]
        compare_cl.string(sname1,
                          sname1,
                          method='jarowinkler',
                          threshold=0.85,
                          label='normalizedSurname1')

        yob = census_field_map[self.census_location][CensusFields.YOB]
        compare_cl.exact(yob, yob, label='yearOfBirth')

        civil = census_field_map[self.census_location][
            CensusFields.CIVIL_STATUS]
        compare_cl.exact(civil, civil, label='civilStatus')

        relation = census_field_map[self.census_location][
            CensusFields.RELATION]
        compare_cl.string(relation,
                          relation,
                          method='jarowinkler',
                          threshold=0.85,
                          label='normalizedRelation')

        occupation = census_field_map[self.census_location][
            CensusFields.OCCUPATION]
        compare_cl.exact(occupation, occupation, label='normalizedOccupation')

        return compare_cl
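
    # Hypothetical usage (assumption, not from the source): pair the returned
    # comparator with an indexer and compute the feature vectors, e.g.
    #   pairs = recordlinkage.BlockIndex(on=yob).index(census_a, census_b)
    #   features = self.get_comparision_object().compute(pairs, census_a, census_b)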

    def test_instance_linking(self):

        comp = recordlinkage.Compare()
        comp.string('given_name', 'given_name', method='jaro')
        comp.numeric('age', 'age', method='step', offset=3, origin=2)
        comp.numeric('age', 'age', method='step', offset=0, origin=2)
        result = comp.compute(self.index_AB, self.A, self.B)

        # returns a DataFrame
        self.assertIsInstance(result, DataFrame)

        # resulting frame has a MultiIndex
        self.assertIsInstance(result.index, MultiIndex)

        # index names are ok
        self.assertEqual(result.index.names,
                         [self.A.index.name, self.B.index.name])

        self.assertEqual(len(result), len(self.index_AB))

def build(m):
    ## Read datasets A and B
    dfA = pd.read_csv(R"dataset-A.csv", index_col=0, encoding="utf_8")
    dfB = pd.read_csv(R"dataset-B.csv", index_col=0, encoding="utf_8")
    links_true = pd.read_csv("true-matches.csv", header=None)
    arrays = [links_true[0].values.tolist(), links_true[1].values.tolist()]
    links_true = pd.MultiIndex.from_arrays(arrays)
    # Indexation step
    indexer = recordlinkage.Index()
    ## This should be the "block" step from your document; I used the two
    ## keys below, which can be changed
    # rec_id,first_name,middle_name,last_name,gender,current_age,birth_date,street_address,suburb,postcode,state,phone,email

    indexer.block(("first_name", "last_name"))
    candidate_links = indexer.index(dfA, dfB)
    print("candidate_links:{}".format(candidate_links))
    """
    # 1  blocking:
            simpleBlocking, phoneticBlocking, slkBlocking, 可以选择simpleBlocking。
            	choice	of blocking	keys: 
    """
    # Comparison step
    compare_cl = recordlinkage.Compare()
    # Below are the matching conditions; see point 4 of the reference
    # document for the full list of options
    compare_cl.string('first_name', 'first_name', method=m, threshold=0.85, label='first_name')
    compare_cl.string('middle_name', 'middle_name', method=m, threshold=0.85, label='middle_name')
    compare_cl.string('last_name', 'last_name', label='last_name', method=m, threshold=0.85)
    compare_cl.string('street_address', 'street_address', label='street_address', method=m, threshold=0.85)
    compare_cl.exact('gender', 'gender', label='gender')
    compare_cl.string('birth_date', 'birth_date', label='birth_date', method=m, threshold=0.85)

    if not os.path.exists("./results"):
        os.makedirs("./results")

    features = compare_cl.compute(candidate_links, dfA, dfB)
    features.to_csv("./results/features_{}.csv".format(m))
    # Choose the number of fields that must match; pairs meeting this
    # threshold are classified as successful matches
    num = 3
    matches = features[features.sum(axis=1) >= num]

    # Output step: directly calls the code in the folder; the output file is out.csv
    save_linkage_set("./results/out_{}.csv".format(m), matches.index)
    evalution(X_data=matches, links_true=links_true)
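
# Editorial usage sketch (assumption): run the pipeline for one string
# comparison method, e.g.
#   build('jarowinkler')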

    def test_fuzzy_different_labels(self):

        comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

        for alg in STRING_SIM_ALGORITHMS:

            print('The {} algorithm'.format(alg))

            # Missing values
            # Change in future (should work without method)
            result = comp.string('given_name',
                                 'given_name',
                                 method=alg,
                                 missing_value=0)

            print(result)

            self.assertFalse(result.isnull().all())
            self.assertTrue((result[result.notnull()] >= 0).all())
            self.assertTrue((result[result.notnull()] <= 1).all())