def test_geo(self):

    comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

    # Missing values
    result = comp.geo('age', 'age', 'age', 'age',
                      method='linear', offset=2, scale=2)
    self.assertFalse(result.isnull().all())

def test_indexing_types(self):
    # test the two types of indexing
    # this test needs improvement

    A = DataFrame({'col': ['abc', 'abc', 'abc', 'abc', 'abc']})
    B = DataFrame({'col': ['abc', 'abc', 'abc', 'abc', 'abc']})
    B_reversed = B[::-1].copy()
    ix = MultiIndex.from_arrays([np.arange(5), np.arange(5)])

    # test with label indexing type
    comp_label = recordlinkage.Compare(indexing_type='label')
    comp_label.exact('col', 'col')
    result_label = comp_label.compute(ix, A, B_reversed)

    # test with position indexing type
    comp_position = recordlinkage.Compare(indexing_type='position')
    comp_position.exact('col', 'col')
    result_position = comp_position.compute(ix, A, B_reversed)

    assert (result_position.values == 1).all(axis=0)

    pdt.assert_frame_equal(result_label, result_position)

def test_numeric_algorithms(self, alg):

    A = DataFrame({'col': [1, 1, 1, 1, 1]})
    B = DataFrame({'col': [1, 2, 3, 4, 5]})
    ix = MultiIndex.from_arrays([A.index.values, B.index.values])

    comp = recordlinkage.Compare()
    comp.numeric('col', 'col', method='step', offset=1, label='step')
    comp.numeric(
        'col', 'col', method='linear', offset=1, scale=2, label='linear')
    comp.numeric(
        'col', 'col', method='squared', offset=1, scale=2, label='squared')
    comp.numeric(
        'col', 'col', method='exp', offset=1, scale=2, label='exp')
    comp.numeric(
        'col', 'col', method='gauss', offset=1, scale=2, label='gauss')
    result_df = comp.compute(ix, A, B)

    result = result_df[alg]

    # All values between 0 and 1.
    assert (result >= 0.0).all()
    assert (result <= 1.0).all()

    if alg != 'step':

        print(alg)
        print(result)

        # sim(scale) = 0.5
        expected_bool = Series(
            [False, False, False, True, False], index=ix, name=alg)
        pdt.assert_series_equal(result == 0.5, expected_bool)

        # sim(offset) = 1
        expected_bool = Series(
            [True, True, False, False, False], index=ix, name=alg)
        pdt.assert_series_equal(result == 1.0, expected_bool)

        # sim(scale) larger than 0.5
        expected_bool = Series(
            [False, False, True, False, False], index=ix, name=alg)
        pdt.assert_series_equal((result > 0.5) & (result < 1.0),
                                expected_bool)

        # sim(scale) smaller than 0.5
        expected_bool = Series(
            [False, False, False, False, True], index=ix, name=alg)
        pdt.assert_series_equal((result < 0.5) & (result >= 0.0),
                                expected_bool)

def test_compare_custom_vectorized_dedup(self):

    A = DataFrame({'col': ['abc', 'abc', 'abc', 'abc', 'abc']})
    ix = MultiIndex.from_arrays([[1, 2, 3, 4, 5], [2, 3, 4, 5, 1]])

    # test without label
    comp = recordlinkage.Compare()
    comp.compare_vectorized(lambda s1, s2: np.ones(len(s1), dtype=int),
                            'col', 'col')
    result = comp.compute(ix, A)

    expected = DataFrame([1, 1, 1, 1, 1], index=ix)
    pdt.assert_frame_equal(result, expected)

    # test with label
    comp = recordlinkage.Compare()
    comp.compare_vectorized(
        lambda s1, s2: np.ones(len(s1), dtype=int),
        'col', 'col', label='test')
    result = comp.compute(ix, A)

    expected = DataFrame([1, 1, 1, 1, 1], index=ix, columns=['test'])
    pdt.assert_frame_equal(result, expected)

def test_parallel_comparing(self):

    # use a single job
    comp = recordlinkage.Compare(n_jobs=1)
    comp.exact('given_name', 'given_name', label='my_feature_label')
    result_single = comp.compute(self.index_AB, self.A, self.B)
    result_single.sort_index(inplace=True)

    # use two jobs
    comp = recordlinkage.Compare(n_jobs=2)
    comp.exact('given_name', 'given_name', label='my_feature_label')
    result_2processes = comp.compute(self.index_AB, self.A, self.B)
    result_2processes.sort_index(inplace=True)

    # use four jobs
    comp = recordlinkage.Compare(n_jobs=4)
    comp.exact('given_name', 'given_name', label='my_feature_label')
    result_4processes = comp.compute(self.index_AB, self.A, self.B)
    result_4processes.sort_index(inplace=True)

    # compare results
    pdt.assert_frame_equal(result_single, result_2processes)
    pdt.assert_frame_equal(result_single, result_4processes)

def test_numeric_does_not_exist(self):
    """Raise an error if the algorithm doesn't exist."""

    A = DataFrame({'col': [1, 1, 1, nan, 0]})
    B = DataFrame({'col': [1, 1, 1, nan, nan]})
    ix = MultiIndex.from_arrays([A.index.values, B.index.values])

    comp = recordlinkage.Compare()

    with self.assertRaises(ValueError):
        comp.numeric('col', 'col', method='unknown_algorithm')
        comp.compute(ix, A, B)

def test_repr(self):

    comp = recordlinkage.Compare()
    comp.exact('given_name', 'given_name')
    comp.string('given_name', 'given_name', method='jaro')
    comp.numeric('age', 'age', method='step', offset=3, origin=2)
    comp.numeric('age', 'age', method='step', offset=0, origin=2)

    c_str = str(comp)
    c_repr = repr(comp)
    assert c_str == c_repr

    start_str = '<{}'.format(comp.__class__.__name__)
    assert c_str.startswith(start_str)

def ProcessData(patientDataList, fetchedHospitalData):
    # Read from the directory
    filelist = pd.read_csv(
        '/home/bizzzzzzzzzzzzu/Music/MedicalPortal/MedicPortal DataProcessing/FetchedData/'
        + fetchedHospitalData)

    # Indexation step
    indexer = p.Index()
    indexer.add(Block(left_on='fatherName', right_on='fatherName'))
    candidate_links = indexer.index(patientDataList, filelist)
    # print(candidate_links)

    # Comparison step
    compare_cl = p.Compare()
    # compare_cl.exact('_id', '_id', label='_id')
    compare_cl.exact('name', 'name', label='name')
    compare_cl.exact('fatherName', 'fatherName', label='fatherName')
    compare_cl.exact('grandFatherName', 'grandFatherName',
                     label='grandFatherName')
    compare_cl.exact('gender', 'gender', label='gender')
    compare_cl.exact('dateOfBirth', 'dateOfBirth', label='dateOfBirth')
    compare_cl.exact('dayOfBirth', 'dayOfBirth', label='dayOfBirth')
    compare_cl.exact('monthOfBirth', 'monthOfBirth', label='monthOfBirth')
    compare_cl.exact('yearOfBirth', 'yearOfBirth', label='yearOfBirth')
    compare_cl.exact('age', 'age', label='age')
    # compare_cl.exact('address', 'address', label='address')
    # compare_cl.exact('phoneNumber', 'phoneNumber', label='phoneNumber')

    features = compare_cl.compute(candidate_links, patientDataList, filelist)

    if features.empty:
        return None
    else:
        # Classification step
        # (a KMeans classifier would give an unsupervised record linkage
        #  approach; here a supervised LogisticRegressionClassifier is used,
        #  fitted on golden_pairs / golden_matches_index defined elsewhere)
        # classifier = p.LogisticRegressionClassifier(
        #     coefficients=coefficients, intercept=intercept)
        classifier = p.LogisticRegressionClassifier()
        classifier.fit(golden_pairs, golden_matches_index)
        links = classifier.predict(features)
        return links

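# The function above relies on golden_pairs and golden_matches_index, which
# are defined elsewhere in the original module. The snippet below is a
# minimal, hypothetical sketch (not taken from the source) of how such
# training data is typically prepared for recordlinkage's
# LogisticRegressionClassifier: a small hand-labelled sample of comparison
# vectors plus the MultiIndex of its known true matches.
import pandas as pd
import recordlinkage

# Comparison vectors for a hand-labelled sample of candidate pairs
# (column names and values here are purely illustrative).
golden_pairs = pd.DataFrame(
    {'name': [1, 1, 0, 0], 'fatherName': [1, 0, 1, 0], 'age': [1, 1, 0, 0]},
    index=pd.MultiIndex.from_tuples([(0, 0), (1, 5), (2, 7), (3, 9)]))

# MultiIndex of the pairs in that sample known to be true matches.
golden_matches_index = golden_pairs.index[:2]

classifier = recordlinkage.LogisticRegressionClassifier()
classifier.fit(golden_pairs, golden_matches_index)
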
def time_global_large(self):

    c_compare = rl.Compare(self.pairs_large, self.A, self.B)
    c_compare.string('given_name', 'given_name', method='jaro')
    c_compare.string('surname', 'surname', method='jarowinkler',
                     threshold=0.85)
    c_compare.date('date_of_birth', 'date_of_birth')
    c_compare.exact('suburb', 'suburb')
    c_compare.exact('state', 'state')
    c_compare.string('address_1', 'address_1', method='levenshtein',
                     threshold=0.85)

def test_date_incorrect_dtype(self):

    A = DataFrame({
        'col': ['2005/11/23', nan, '2004/11/23', '2010/01/10', '2010/10/30']
    })
    B = DataFrame({
        'col': [
            '2005/11/23', '2010/12/31', '2005/11/23', '2010/10/01',
            '2010/9/30'
        ]
    })
    ix = MultiIndex.from_arrays([A.index.values, B.index.values])

    A['col1'] = to_datetime(A['col'])
    B['col1'] = to_datetime(B['col'])

    comp = recordlinkage.Compare()
    comp.date('col', 'col1')
    self.assertRaises(ValueError, comp.compute, ix, A, B)

    comp = recordlinkage.Compare()
    comp.date('col1', 'col')
    self.assertRaises(ValueError, comp.compute, ix, A, B)

def test_memory_low(self):

    c = recordlinkage.Compare(self.index_AB, self.A, self.B,
                              low_memory=True)

    c.numeric('age', 'age', method='linear', offset=0, scale=3)
    c.exact('given_name', 'given_name')
    c.string('lastname', 'lastname', method='levenshtein')
    c.string('street', 'street', method='levenshtein')

    nrows, ncols = c.vectors.shape

    self.assertEqual(nrows, len(self.index_AB))
    self.assertEqual(ncols, 4)

def compute_record_linkage(df_full):
    """Given the fully-concatenated table of records, calculate which
    pairs are address matches, using fuzzy matching on street name,
    unit number and zip code.
    """
    print("Setting up blocking for pairwise comparisons")
    blocking_indices = [
        recordlinkage.BlockIndex(on="APN (int)"),
        recordlinkage.BlockIndex(
            on=["Address Number (float)", "Zip Code (int)"]),
    ]

    print("Finding blocked pairs")
    pairs = None
    for df_subset in tqdm(np.array_split(df_full, 10)):
        for bi in blocking_indices:
            _new_pairs = bi.index(df_full, df_subset)
            if pairs is not None:
                pairs = pairs.union(_new_pairs)
            else:
                pairs = _new_pairs

    print("Setting up similarity calculations")
    compare_cl = recordlinkage.Compare()
    compare_cl.exact('APN (int)', 'APN (int)', label='APN')
    compare_cl.exact('Zip Code (int)', 'Zip Code (int)', label='Zip')
    compare_cl.exact('Address Number (float)', 'Address Number (float)',
                     label='number')
    # compare_cl.numeric('Address Number (float)', 'Address Number (float)',
    #                    offset=3, scale=2,
    #                    label='number')
    compare_cl.string('Street Name', 'Street Name', method='levenshtein',
                      threshold=0.9, label='street')

    print("Calculating similarities")
    features = compare_cl.compute(pairs, df_full)
    features.to_pickle(FEATURES_OUTPUT_FILE)
    return features

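# A minimal follow-up sketch (not part of the original source) showing how
# the feature vectors returned above could be reduced to matched pairs once
# df_full is available; the threshold of three agreeing comparisons is an
# illustrative assumption, not a value taken from the source.
features = compute_record_linkage(df_full)
matches = features[features.sum(axis=1) >= 3]
print("Kept {} candidate address matches".format(len(matches)))
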
def __init__(self, data, new_data, name, n_sample=50000, bin_size=500,
             block=None, method='jarowinkler', threshold=0.93):
    super(Linkage, self).__init__()
    self.data = data
    self.new_data = pd.DataFrame(new_data)
    self.n_sample = n_sample
    self.bin_size = bin_size
    self.name = name
    self.block = block
    self.method = method
    self.threshold = threshold
    self.sample = data.sample(n=self.n_sample, replace=False,
                              random_state=0)

    if self.block is not None:
        self.indexer = recordlinkage.BlockIndex(on=self.block)
    else:
        self.indexer = recordlinkage.FullIndex()

    self.compare_cl = recordlinkage.Compare(n_jobs=4)
    self.compare_cl.string(self.name, self.name, method=self.method,
                           threshold=self.threshold, label=self.name)

    if self.new_data.empty:
        self.List = list(
            itertools.combinations(
                np.split(self.sample,
                         indices_or_sections=round(
                             self.n_sample / self.bin_size, 0)), 2))
    else:
        self.Linst = list(
            np.split(self.sample,
                     indices_or_sections=round(
                         self.n_sample / self.bin_size, 0)))

    self.results_df = pd.DataFrame(
        columns=['pairs', 'company_1', 'company_2'])
    self.results_tmp = None

def test_dates_with_missings(self):
    """Test:
    - Default value
    - numeric value
    - numpy.nan
    - string value
    """
    self.A['test_dates'] = pandas.to_datetime(
        ['2005/11/23', np.nan, '2004/11/23', '2010/01/10', '2010/10/30'])
    self.B['test_dates'] = pandas.to_datetime([
        '2005/11/23', '2010/12/31', '2005/11/23', '2010/10/01', '2010/9/30'
    ])

    comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

    # Missing values as default
    print("Missing values as default")
    result = comp.date('test_dates', 'test_dates')
    expected = pandas.Series([1, 0, 0, 0.5, 0.5], index=self.index_AB)
    pdt.assert_series_equal(result, expected)

    # Missing values as 0
    print("Missing values as 0")
    result = comp.date('test_dates', 'test_dates', missing_value=0)
    expected = pandas.Series([1, 0, 0, 0.5, 0.5], index=self.index_AB)
    pdt.assert_series_equal(result, expected)

    # Missing values as 123.45
    print("Missing values as 123.45 (float)")
    result = comp.date('test_dates', 'test_dates', missing_value=123.45)
    expected = pandas.Series([1, 123.45, 0, 0.5, 0.5], index=self.index_AB)
    pdt.assert_series_equal(result, expected)

    # Missing values as nan
    print("Missing values as numpy.nan")
    result = comp.date('test_dates', 'test_dates', missing_value=nan)
    expected = pandas.Series([1, nan, 0, 0.5, 0.5], index=self.index_AB)
    pdt.assert_series_equal(result, expected)

    # Missing values as string
    print("Missing values as string")
    result = comp.date('test_dates', 'test_dates', missing_value='str')
    expected = pandas.Series([1, 'str', 0, 0.5, 0.5],
                             index=self.index_AB, dtype=object)
    pdt.assert_series_equal(result, expected)

def test_geo_does_not_exist(self):

    # Utrecht, Amsterdam, Rotterdam (cities in The Netherlands)
    A = DataFrame({
        'lat': [52.0842455, 52.3747388, 51.9280573],
        'lng': [5.0124516, 4.7585305, 4.4203581]
    })
    B = DataFrame({
        'lat': [52.3747388, 51.9280573, 52.0842455],
        'lng': [4.7585305, 4.4203581, 5.0124516]
    })
    ix = MultiIndex.from_arrays([A.index.values, B.index.values])

    comp = recordlinkage.Compare()
    comp.geo('lat', 'lng', 'lat', 'lng', method='unknown')
    self.assertRaises(ValueError, comp.compute, ix, A, B)

def compare_builder(self):
    compare_booths = recordlinkage.Compare()

    # Levenshtein score for the booth name
    compare_booths.string(self.locality_col, self.locality_col,
                          method='levenshtein', threshold=None,
                          label='name_lv_score')

    # Jaro-Winkler score for the booth name
    compare_booths.string(self.locality_col, self.locality_col,
                          method='jarowinkler', threshold=None,
                          label='name_jw_score')

    # Levenshtein score on the metaphone encoding
    # (i.e. on how the name is pronounced)
    compare_booths.string('metaphone', 'metaphone', method='levenshtein',
                          threshold=None, label='metaphone')

    # Score for how far apart the booth numbers are
    compare_booths.numeric(self.booth_num_col, self.booth_num_col,
                           label='booth_number_score', method="gauss",
                           offset=3, scale=5)

    self.compare_booths = compare_booths

def test_random_desc(self):
    df_a = pd.DataFrame({'v': list("abcde")})
    df_b = pd.DataFrame({'v': list("abcde")})
    pairs = Full().index(df_a, df_b)

    c = recordlinkage.Compare()
    c.exact("v", "v")
    c.add(RandomDiscrete(label='random'))
    cv = c.compute(pairs, df_a, df_b)

    assert isinstance(cv, pd.DataFrame)
    assert cv['random'].notnull().all()
    assert cv['random'].isin([0, 1]).all()

def test_dates_with_missings(self):

    A = DataFrame({
        'col': to_datetime(
            ['2005/11/23', nan, '2004/11/23', '2010/01/10', '2010/10/30'])
    })
    B = DataFrame({
        'col': to_datetime([
            '2005/11/23', '2010/12/31', '2005/11/23', '2010/10/01',
            '2010/9/30'
        ])
    })
    ix = MultiIndex.from_arrays([A.index.values, B.index.values])

    comp = recordlinkage.Compare()
    comp.date('col', 'col', label='m_')
    comp.date('col', 'col', missing_value=0, label='m_0')
    comp.date('col', 'col', missing_value=123.45, label='m_float')
    comp.date('col', 'col', missing_value=nan, label='m_na')
    comp.date('col', 'col', missing_value='str', label='m_str')
    result = comp.compute(ix, A, B)

    # Missing values as default
    expected = Series([1, 0, 0, 0.5, 0.5], index=ix, name='m_')
    pdt.assert_series_equal(result['m_'], expected)

    # Missing values as 0
    expected = Series([1, 0, 0, 0.5, 0.5], index=ix, name='m_0')
    pdt.assert_series_equal(result['m_0'], expected)

    # Missing values as 123.45
    expected = Series([1, 123.45, 0, 0.5, 0.5], index=ix, name='m_float')
    pdt.assert_series_equal(result['m_float'], expected)

    # Missing values as nan
    expected = Series([1, nan, 0, 0.5, 0.5], index=ix, name='m_na')
    pdt.assert_series_equal(result['m_na'], expected)

    # Missing values as string
    expected = Series([1, 'str', 0, 0.5, 0.5], index=ix,
                      dtype=object, name='m_str')
    pdt.assert_series_equal(result['m_str'], expected)

def record_link_schools():
    """
    Perform record linkage on two dataframes: the critical mass
    dataframe and the retention rates dataframe. The record linkage is
    conducted on the name of the school.

    Input:
        None
    Output:
        - link: a tuple containing the indices of the retention dataframe
          and the critical mass dataframe, AND the best matches' qgram
          scores
    """
    critical_mass_df = calculate_critical_mass_var()
    retention_df = import_cleaned_retention(RETENTION_CLEAN)

    # set threshold for comparing strings using the qgram method
    school_name_thresh = 0.85

    # initialize a Record Linkage comparison object
    compare = rl.Compare()
    indexer = rl.FullIndex()  # no blocking available

    compare.string('school', 'school', method='qgram',
                   threshold=school_name_thresh, label='school_name_score')

    # make pairs
    pairs = indexer.index(retention_df, critical_mass_df)

    # compute record linkage scores
    features = compare.compute(pairs, retention_df, critical_mass_df)

    # set classification threshold
    school_name_classif_thresh = 1.0

    # Classification & Final Filtering
    best_matches = features[
        features['school_name_score'] >= school_name_classif_thresh]

    # obtain the index values from best_matches
    index_array = best_matches.index.values

    # create tuple of indices and best matches df
    link = (index_array, best_matches)

    return link

def start_rl(df):
    """
    Detect duplicate records in the DataFrame and compute an overall
    score per record.

    :param df: DataFrame with the input data
    :return: two DataFrames: the result of all comparisons performed,
             and the matches with a score >= .99
    """
    # Indexation step
    indexer = rl.index.SortedNeighbourhood('MESSZEIT', window=5,
                                           block_on=['KENNUNG'])
    pairs = indexer.index(df)

    # Comparison step
    comparer = rl.Compare()
    comparer.exact('MESSZEIT', 'MESSZEIT', label='messzeit')
    comparer.exact('KENNUNG', 'KENNUNG', label='kennung')
    comparer.numeric('GEOGR_BREITE', 'GEOGR_BREITE', method=u'gauss',
                     offset=0.0, label='geogr_breite')
    comparer.numeric('GEOGR_LAENGE', 'GEOGR_LAENGE', method=u'gauss',
                     offset=0.0, label='geogr_laenge')
    comparer.numeric('HORIZONTALE_SICHT', 'HORIZONTALE_SICHT',
                     method=u'linear', offset=10.0, missing_value=1,
                     label='horizontale_sicht')
    comparer.add(CompareWetter('WETTER', 'WETTER', label='wetter2'))
    compared = comparer.compute(pairs, df)

    # overall score per record, as a fraction of the maximum
    pcmax = compared.shape[1]  # column count, i.e. 100%
    compared.loc[:, 'Score'] = 1 - (abs(compared.sum(axis=1) - pcmax) / pcmax)

    # Classification step
    matches = compared[(compared.messzeit == 1) & (compared.kennung == 1)
                       & (compared.Score >= .991)]

    return compared, matches

def test_random_cont(self):
    df_a = pd.DataFrame({'v': list("abcde")})
    df_b = pd.DataFrame({'v': list("abcde")})
    pairs = Full().index(df_a, df_b)

    c = recordlinkage.Compare()
    c.exact("v", "v")
    c.add(RandomContinuous(label='random'))
    cv = c.compute(pairs, df_a, df_b)

    assert isinstance(cv, pd.DataFrame)
    assert cv['random'].notnull().all()
    assert cv['random'].min() >= 0.0
    assert cv['random'].max() <= 1.0

def _test_logistic_transh(self, dataset, params):
    """Note: zero aligned pairs are returned; this still needs fixing."""
    model = dataset()
    logger = get_logger('RL.Test.LogisticTransH.' + str(model))

    entity, relation, triples, entity_pairs, true_pairs = \
        model.get_er_model()
    transh = TransH(entity, relation, triples, entity_pairs,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = transh.get_ent_embeddings()
    ent_embeddings = [
        np.array(ent_embeddings[i]) for i in range(ent_embeddings.shape[0])
    ]

    trainDataA = pd.DataFrame(data=ent_embeddings)
    trainDataB = pd.DataFrame(data=ent_embeddings)

    compare_cl = recordlinkage.Compare()
    for i in range(0, params['dimension']):
        compare_cl.numeric(i, i, label=str(i), method='gauss')

    candidate_links = pd.MultiIndex.from_tuples(entity_pairs)
    features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
    logger.info("Features %s", str(features.describe()))

    logrg = recordlinkage.LogisticRegressionClassifier()
    logrg.fit(features, true_pairs)

    result = logrg.predict(features)
    log_quality_results(logger, result, true_pairs, len(entity_pairs))

    prob_series = logrg.prob(features)
    prob = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(entity_pairs[i][0], entity_pairs[i][1], prob[i])
                   for i in range(0, len(prob))]
    ir_metrics = InformationRetrievalMetrics(result_prob, true_pairs)
    ir_metrics.log_metrics(logger, params)

def processing(df, sourceid):
    if sourceid == 1:
        postal_indexer = Block('PostCodeKey')
        postal_pairs = postal_indexer.index(df)

        for i in [20, 40, 60, 80, 100]:
            if (len(postal_pairs) / i) < 1000000:
                intervalparts = i
                break
        else:
            intervalparts = 100

        # Get Interval Parts
        inter = intervals(intervalparts, len(postal_pairs))

        comp_postal = recordlinkage.Compare(n_jobs=20)
        comp_postal.string('BusinessNameKey', 'BusinessNameKey',
                           method='jarowinkler', label='BusinesNameCompare')
        comp_postal.string('TradestyleKey', 'BusinessNameKey',
                           method='jarowinkler', label='BNTSCompare')
        comp_postal.string('AddressKey', 'AddressKey',
                           method='jarowinkler', label='AddressCompare')

        cv_full = comp_postal.compute(postal_pairs[0:inter[1]], df)
        cv_full = cv_full[
            ((cv_full.BusinesNameCompare.between(0.95, 1, inclusive=True))
             | (cv_full.BNTSCompare.between(0.95, 1, inclusive=True)))
            & (cv_full.AddressCompare.between(0.95, 1, inclusive=True))]

        for i in range(1, len(inter) - 1):
            cv = comp_postal.compute(
                postal_pairs[inter[i] + 1:inter[i + 1]], df)
            cv = cv[((cv.BusinesNameCompare.between(0.95, 1, inclusive=True))
                     | (cv.BNTSCompare.between(0.95, 1, inclusive=True)))
                    & (cv.AddressCompare.between(0.95, 1, inclusive=True))]
            frames = [cv_full, cv]
            cv_full = pd.concat(frames)
            del cv

        # print(df.columns)
        # print(cv_full.columns)
        return df, cv_full

def test_instance_dedup(self):

    comp = recordlinkage.Compare()
    comp.string('given_name', 'given_name', method='jaro')
    comp.numeric('age', 'age', method='step', offset=3, origin=2)
    comp.numeric('age', 'age', method='step', offset=0, origin=2)

    result = comp.compute(self.index_AB, self.A)

    # returns a DataFrame
    assert isinstance(result, DataFrame)

    # resulting frame has a MultiIndex
    assert isinstance(result.index, MultiIndex)

    # index names are ok
    assert result.index.names == [self.A.index.name, self.B.index.name]

    assert len(result) == len(self.index_AB)

def generating_pairs_linkage_data():
    indexer = recordlinkage.Index()
    indexer.block('cuisine_type')
    pairs = indexer.index(restaurants, restaurants_new)

    comp_cl = recordlinkage.Compare()
    comp_cl.exact('city', 'city', label='city')
    comp_cl.exact('cuisine_type', 'cuisine_type', label='cuisine_type')
    comp_cl.string('name', 'name', label='name', threshold=0.8)

    potential_matches = comp_cl.compute(pairs, restaurants, restaurants_new)
    print(potential_matches)

    matches = potential_matches[potential_matches.sum(axis=1) >= 3]
    matching_indices = matches.index.get_level_values(1)

    non_dup = restaurants_new[~restaurants_new.index.isin(matching_indices)]
    full_restaurants = restaurants.append(non_dup)
    print(full_restaurants)

def test_string_algorithms(self, alg):

    A = DataFrame(
        {'col': [u'str_abc', u'str_abc', u'str_abc', nan, u'hsdkf']})
    B = DataFrame({'col': [u'str_abc', u'str_abd', u'jaskdfsd', nan, nan]})
    ix = MultiIndex.from_arrays([A.index.values, B.index.values])

    comp = recordlinkage.Compare()
    comp.string('col', 'col', method=alg, missing_value=0)
    result = comp.compute(ix, A, B)[0]

    self.assertFalse(result.isnull().any())

    self.assertTrue((result >= 0).all())
    self.assertTrue((result <= 1).all())

    self.assertTrue((result > 0).any())
    self.assertTrue((result < 1).any())

def get_comparision_object(self):
    """
    Build the comparison object for six fields: Jaro-Winkler distance
    for first name, surname and relation; exact match for year of
    birth, civil status and occupation.

    :return: compare_cl
    :rtype: recordlinkage.Compare
    """
    compare_cl = recordlinkage.Compare()

    fname = census_field_map[self.census_location][CensusFields.FIRST_NAME]
    compare_cl.string(fname, fname, method='jarowinkler', threshold=0.85,
                      label='normalizedName')

    sname1 = census_field_map[self.census_location][CensusFields.SURNAME_1]
    compare_cl.string(sname1, sname1, method='jarowinkler', threshold=0.85,
                      label='normalizedSurname1')

    yob = census_field_map[self.census_location][CensusFields.YOB]
    compare_cl.exact(yob, yob, label='yearOfBirth')

    civil = census_field_map[self.census_location][
        CensusFields.CIVIL_STATUS]
    compare_cl.exact(civil, civil, label='civilStatus')

    relation = census_field_map[self.census_location][
        CensusFields.RELATION]
    compare_cl.string(relation, relation, method='jarowinkler',
                      threshold=0.85, label='normalizedRelation')

    occupation = census_field_map[self.census_location][
        CensusFields.OCCUPATION]
    compare_cl.exact(occupation, occupation, label='normalizedOccupation')

    return compare_cl

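# A minimal, self-contained usage sketch for a Compare object like the one
# built above: block candidate pairs, then compute the comparison vectors.
# The two toy DataFrames, their column names and the blocking key are
# illustrative assumptions, not data from the original census source.
import pandas as pd
import recordlinkage

census_a = pd.DataFrame({'normalizedName': ['maria', 'josep'],
                         'yearOfBirth': [1890, 1885]})
census_b = pd.DataFrame({'normalizedName': ['maria', 'pere'],
                         'yearOfBirth': [1890, 1880]})

# Indexation step: only compare records sharing the same year of birth.
indexer = recordlinkage.Index()
indexer.block('yearOfBirth')
candidate_pairs = indexer.index(census_a, census_b)

# Comparison step, mirroring the pattern used in get_comparision_object.
compare_cl = recordlinkage.Compare()
compare_cl.string('normalizedName', 'normalizedName',
                  method='jarowinkler', threshold=0.85, label='name')
compare_cl.exact('yearOfBirth', 'yearOfBirth', label='yob')

features = compare_cl.compute(candidate_pairs, census_a, census_b)
print(features)
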
def test_instance_linking(self):

    comp = recordlinkage.Compare()
    comp.string('given_name', 'given_name', method='jaro')
    comp.numeric('age', 'age', method='step', offset=3, origin=2)
    comp.numeric('age', 'age', method='step', offset=0, origin=2)

    result = comp.compute(self.index_AB, self.A, self.B)

    # returns a DataFrame
    self.assertIsInstance(result, DataFrame)

    # resulting frame has a MultiIndex
    self.assertIsInstance(result.index, MultiIndex)

    # index names are ok
    self.assertEqual(result.index.names,
                     [self.A.index.name, self.B.index.name])

    self.assertEqual(len(result), len(self.index_AB))

def build(m):
    ## Read datasets A and B
    dfA = pd.read_csv(R"dataset-A.csv", index_col=0, encoding="utf_8")
    dfB = pd.read_csv(R"dataset-B.csv", index_col=0, encoding="utf_8")

    links_true = pd.read_csv("true-matches.csv", header=None)
    arrays = [links_true[0].values.tolist(), links_true[1].values.tolist()]
    links_true = pd.MultiIndex.from_arrays(arrays)

    # Indexation step
    indexer = recordlinkage.Index()
    ## This is the blocking step described in the document; the two keys
    ## below are used here and can be changed.
    # rec_id,first_name,middle_name,last_name,gender,current_age,birth_date,street_address,suburb,postcode,state,phone,email
    indexer.block(("first_name", "last_name"))
    candidate_links = indexer.index(dfA, dfB)
    print("candidate_links:{}".format(candidate_links))
    """
    # 1 blocking: simpleBlocking, phoneticBlocking, slkBlocking;
    #   simpleBlocking can be chosen.
    choice of blocking keys:
    """

    # Comparison step
    compare_cl = recordlinkage.Compare()
    # The comparison rules are listed below; see point 4 of the document
    # for the available options.
    compare_cl.string('first_name', 'first_name', method=m, threshold=0.85,
                      label='first_name')
    compare_cl.string('middle_name', 'middle_name', method=m, threshold=0.85,
                      label='middle_name')
    compare_cl.string('last_name', 'last_name', label='last_name', method=m,
                      threshold=0.85)
    compare_cl.string('street_address', 'street_address',
                      label='street_address', method=m, threshold=0.85)
    compare_cl.exact('gender', 'gender', label='gender')
    compare_cl.string('birth_date', 'birth_date', label='birth_date',
                      method=m, threshold=0.85)

    if not os.path.exists("./results"):
        os.makedirs("./results")

    features = compare_cl.compute(candidate_links, dfA, dfB)
    features.to_csv("./results/features_{}.csv".format(m))

    # Number of agreeing comparisons required; pairs meeting this threshold
    # are classified as matches.
    num = 3
    matches = features[features.sum(axis=1) >= num]

    # Write the result by calling the code shipped in the folder; the output
    # file is out.csv.
    save_linkage_set("./results/out_{}.csv".format(m), matches.index)

    evalution(X_data=matches, links_true=links_true)

def test_fuzzy_different_labels(self):

    comp = recordlinkage.Compare(self.index_AB, self.A, self.B)

    for alg in STRING_SIM_ALGORITHMS:

        print('The {} algorithm'.format(alg))

        # Missing values
        # Change in future (should work without method)
        result = comp.string('given_name', 'given_name', method=alg,
                             missing_value=0)

        print(result)

        self.assertFalse(result.isnull().all())
        self.assertTrue((result[result.notnull()] >= 0).all())
        self.assertTrue((result[result.notnull()] <= 1).all())