def test_census(self):
    """Fit K-Means on the Census training pairs and score train and test pairs."""
    log = get_logger('RL.Test.KmeansClustering.CENSUS')
    dataset = Census()

    # Comparison vectors for the training candidate pairs.
    train_vectors = dataset.get_comparision_object().compute(
        dataset.candidate_links, dataset.trainDataA, dataset.trainDataB)
    log.info("Features %s", str(train_vectors.describe()))

    # Train the K-Means classifier and report quality on the training pairs.
    model = recordlinkage.KMeansClassifier(
        algorithm='full', max_iter=1000, random_state=42)
    model.fit(train_vectors)
    train_links = model.predict(train_vectors)
    log_quality_results(
        log, train_links, dataset.true_links, len(dataset.candidate_links))

    # Apply the already-fitted classifier to the test pairs.
    test_vectors = dataset.get_comparision_object().compute(
        dataset.test_links, dataset.testDataA, dataset.testDataB)
    log.info("Features %s", str(test_vectors.describe()))
    test_links = model.predict(test_vectors)
    log_quality_results(
        log, test_links, dataset.true_test_links, len(dataset.test_links))
def test_febrl(self):
    """Fit K-Means on the FEBRL training pairs and score train and test pairs."""
    log = get_logger('RL.Test.KmeansClustering.FEBRL')
    dataset = FEBRL()

    # Comparison vectors for the training candidate pairs.
    train_vectors = dataset.get_comparision_object().compute(
        dataset.candidate_links, dataset.trainDataA, dataset.trainDataB)
    log.info("Features %s", str(train_vectors.describe()))

    # Train the K-Means classifier and report quality on the training pairs.
    model = recordlinkage.KMeansClassifier()
    model.fit(train_vectors)
    train_links = model.predict(train_vectors)
    log_quality_results(
        log, train_links, dataset.true_links, len(dataset.candidate_links))

    # Apply the already-fitted classifier to the test pairs.
    test_vectors = dataset.get_comparision_object().compute(
        dataset.test_links, dataset.testDataA, dataset.testDataB)
    log.info("Features %s", str(test_vectors.describe()))
    test_links = model.predict(test_vectors)
    log_quality_results(
        log, test_links, dataset.true_test_links, len(dataset.test_links))
def test_kmeans_manual(self):
    """KMeansClassifier with manual cluster centers"""
    # Draw two reproducible random centers, one value per feature column.
    np.random.seed(535)
    n_features = self.X_train.shape[1]
    manual_mcc = list(np.random.randn(n_features))
    manual_nmcc = list(np.random.randn(n_features))

    classifier = rl.KMeansClassifier()

    # A fresh classifier must not expose cluster-center attributes yet.
    for attr_name in ('match_cluster_center', 'nonmatch_cluster_center'):
        assert not hasattr(classifier, attr_name)

    # Install the centers by hand instead of fitting.
    classifier.match_cluster_center = manual_mcc
    classifier.nonmatch_cluster_center = manual_nmcc

    # Predicting must work with the manual centers ...
    classifier.predict(self.X_test)

    # ... and must leave them (almost) exactly as they were set.
    assert_almost_equal(classifier.match_cluster_center, manual_mcc)
    assert_almost_equal(classifier.nonmatch_cluster_center, manual_nmcc)
def create_and_train_kmeans():
    """Create a KMeansClassifier and train it on ``features``.

    ``features`` is not a parameter; it is looked up in the enclosing scope.

    Returns:
        The classifier instance after ``learn`` has been called on it.
    """
    model = rl.KMeansClassifier()
    # Whatever learn() returns is intentionally discarded; only the
    # trained classifier is handed back.
    model.learn(features)
    return model
def test_cora(self):
    """Fit K-Means on the Cora training pairs and score train and test pairs."""
    log = get_logger('RL.Test.KmeansClustering.CORA')

    # Load the Cora data sets (train data for A and B).
    dataset = Cora()

    # Extract features for the training candidate pairs.
    train_vectors = dataset.get_comparision_object().compute(
        dataset.candidate_links, dataset.trainDataA, dataset.trainDataB)
    log.info("Features %s", str(train_vectors.describe()))

    # Train the K-Means classifier and report quality on the training pairs.
    model = recordlinkage.KMeansClassifier()
    model.fit(train_vectors)
    train_links = model.predict(train_vectors)
    log_quality_results(
        log, train_links, dataset.true_links, len(dataset.candidate_links))

    # Apply the already-fitted classifier to the test pairs.
    test_vectors = dataset.get_comparision_object().compute(
        dataset.test_links, dataset.testDataA, dataset.testDataB)
    log.info("Features %s", str(test_vectors.describe()))
    test_links = model.predict(test_vectors)
    log_quality_results(
        log, test_links, dataset.true_test_links, len(dataset.test_links))
def test_kmeans_no_training_data(self):
    """Learning from a frame with columns but no rows must raise ValueError."""
    classifier = recordlinkage.KMeansClassifier()

    with pytest.raises(ValueError):
        # Same columns as the training data, zero rows.
        empty_frame = pandas.DataFrame(columns=self.y_train.columns)
        classifier.learn(empty_frame)
def test_kmeans_manual(self):
    """KMeansClassifier with manual cluster centers"""
    # Draw two reproducible random centers, one value per feature column.
    numpy.random.seed(535)
    n_features = self.y_train.shape[1]
    manual_mcc = list(numpy.random.randn(n_features))
    manual_nmcc = list(numpy.random.randn(n_features))

    classifier = recordlinkage.KMeansClassifier()

    # Before any training both centers are expected to be None.
    assert classifier.match_cluster_center is None
    assert classifier.nonmatch_cluster_center is None

    # Install the centers by hand instead of fitting.
    classifier.match_cluster_center = manual_mcc
    classifier.nonmatch_cluster_center = manual_nmcc

    # Predicting must work with the manual centers ...
    classifier.predict(self.y)

    # ... and must leave them exactly as they were set.
    assert classifier.match_cluster_center == manual_mcc
    assert classifier.nonmatch_cluster_center == manual_nmcc
def test_kmeans_error(self):
    """Asking a fitted K-Means classifier for probabilities must fail."""
    classifier = rl.KMeansClassifier()
    classifier.fit(self.X_train)

    # There are no probabilities
    with pytest.raises(AttributeError):
        classifier.prob(self.X_train)
def test_kmeans(self):
    """Fit on the training vectors and check type and size of the prediction."""
    classifier = rl.KMeansClassifier()
    classifier.fit(self.X_train)

    predicted_links = classifier.predict(self.X_test)

    assert isinstance(predicted_links, pd.MultiIndex)
    n_links = predicted_links.shape[0]
    assert n_links == 11670
def test_kmeans(self):
    """Learn and predict with K-Means, then check that ``prob`` is unavailable."""
    classifier = recordlinkage.KMeansClassifier()
    classifier.learn(self.y_train)
    classifier.predict(self.y)

    # There are no probabilities
    with pytest.raises(AttributeError):
        classifier.prob(self.y)
def test_kmeans_not_trained(self):
    """Predicting with a classifier that was never trained must raise."""
    untrained = recordlinkage.KMeansClassifier()

    with pytest.raises(Exception):
        untrained.predict(self.y)
def test_kmean_parameters(self):
    """After fitting, both cluster centers are 1-D arrays with one entry per feature."""
    classifier = rl.KMeansClassifier()
    classifier.fit(self.X_train)

    _n_rows, width = self.X_train.shape
    expected_shape = (width, )

    # Attributes are fetched by name inside the loop so the checks run in
    # the order: match center first, non-match center second.
    for attr_name in ('match_cluster_center', 'nonmatch_cluster_center'):
        center = getattr(classifier, attr_name)
        assert isinstance(center, np.ndarray)
        assert center.shape == expected_shape
def link_reduce(from_rest: str, dfs: dict, window: int, th: float,
                classifier: str, thFusion: float) -> pd.DataFrame:
    """Link every table against the others and drop the duplicates found.

    Each table is compared with every table that has not yet been used as
    the left side.  Pairs predicted as matches and confirmed by ``combine``
    are removed from the right-hand table, so only one copy survives.

    Args:
        from_rest: Key in ``dfs`` of the table that is processed first.
        dfs: Mapping of name -> DataFrame.  Every frame is accessed through
            the columns ``restaurant``, ``addressGoogle`` and
            ``neighborhood``.
        window: Window size for the sorted-neighbourhood indexer.
        th: Threshold for the Jaro-Winkler string comparison; also
            forwarded to ``combine``.
        classifier: ``"ecm"`` or ``"kmeans"``.
        thFusion: Fusion threshold forwarded to ``combine``.

    Returns:
        One DataFrame with all reduced tables concatenated, rows without
        ``addressGoogle`` removed and exact duplicate rows dropped.

    Raises:
        ValueError: If ``classifier`` is not one of the supported names.
            (Previously this surfaced as a TypeError while iterating
            ``None``.)

    Note:
        ``dfs`` is only shallow-copied, so the in-place ``drop`` below can
        remove rows from the frames the caller passed in.
    """
    if classifier not in ("ecm", "kmeans"):
        raise ValueError(
            f"Unknown classifier {classifier!r}; expected 'ecm' or 'kmeans'")

    # Working copy of ``dfs`` with ``from_rest`` moved to the front.
    dfs_copy = {from_rest: dfs[from_rest]}
    for rest, df in dfs.items():
        if rest != from_rest:
            dfs_copy[rest] = df.copy()

    # Tables still eligible as right-hand side; shrinks by one per pass.
    dfs_reduce = dfs.copy()

    for rest, df in dfs_copy.items():
        for rr, ddf in dfs_reduce.items():
            if rr == rest:
                continue

            print(f"{rest} -> {rr}")

            # An optional column is only usable when it holds at least one
            # non-null value on both sides.
            columns_to_check = ['restaurant']
            for optional in ('addressGoogle', 'neighborhood'):
                if df[optional].notna().any() and ddf[optional].notna().any():
                    columns_to_check.append(optional)

            # 1 - INDEXING
            indexer = recordlinkage.Index()
            for col in columns_to_check:
                indexer.sortedneighbourhood(
                    left_on=col, right_on=col, window=window)
            candidate_links = indexer.index(df, ddf)

            # 2 - COMPARISON
            compare_cl = recordlinkage.Compare(n_jobs=-1)
            for col in columns_to_check:
                if col == 'addressGoogle':
                    compare_cl.exact(col, col)
                else:
                    compare_cl.string(col, col, label=col, threshold=th,
                                      method='jarowinkler')
            features = compare_cl.compute(candidate_links, df, ddf)

            # 3 - CLASSIFICATION
            if classifier == "ecm":
                model = recordlinkage.ECMClassifier(
                    init='jaro', binarize=None, max_iter=100, atol=0.0001,
                    use_col_names=True)
            else:
                model = recordlinkage.KMeansClassifier()
            # fit_predict already returns the predicted links, so no second
            # predict() pass over the same features is needed.
            matches = model.fit_predict(features)

            # 4 - COMBINE INFORMATION
            # Iterates the original index object; ``matches`` is rebound to
            # a new, smaller index for every rejected pair.
            for left, right in matches:
                if not combine(df.loc[left], ddf.loc[right], thFusion, th):
                    matches = matches.drop((left, right))
            print(f"\tmatches: {len(matches)}")
            dfs_copy[rest] = df.copy()

            # 5 - DROP RIGHT ON MATCHES INDEX
            index_to_drop = set(matches.get_level_values(1))
            print(f"\t{rr} before drop: {len(ddf.index)}")
            ddf.drop(index_to_drop, inplace=True)
            dfs_copy[rr] = ddf.copy()
            dfs_reduce[rr] = ddf.copy()
            print(f"\t{rr} after drop: {len(dfs_reduce[rr].index)}\n")

        # ``rest`` has been compared with everything; never use it as a
        # right-hand side again.
        del dfs_reduce[rest]

    final_df = pd.concat(list(dfs_copy.values()))
    final_df.dropna(subset=['addressGoogle'], inplace=True)
    final_df.drop_duplicates(inplace=True)
    return final_df
def collect_identical_rows_alg(self, schema_id, table_name, sorting_key,
                               fixed_column_names, var_column_names, alg):
    """Cluster probable duplicate rows of a table and store the clusters.

    The table is loaded into a DataFrame, string columns are cleaned,
    candidate pairs are produced by a sorted-neighbourhood index on
    ``sorting_key``, compared column by column, and classified with
    K-Means.  Matching pairs are grouped and written through
    ``self.create_duplicate_table``.

    Args:
        schema_id: Id used to build the schema name ``schema-<id>``.
        table_name: Name of the table to deduplicate.
        sorting_key: Column the sorted-neighbourhood index is built on; it
            is always compared exactly as well.
        fixed_column_names: Columns that must match exactly.  The caller's
            list is not modified.
        var_column_names: Columns compared with a tolerant method chosen by
            dtype (numeric / date / string).
        alg: String comparison method for the string columns.

    Returns:
        ``True`` if duplicate groups were stored, ``False`` if the
        classifier predicted no matches.

    Raises:
        Exception: Any error is logged and re-raised unchanged.
    """
    schema_name = 'schema-' + str(schema_id)
    dedup_table_name = '_dedup_' + table_name + "_grouped"

    # TODO When user selects rows to remove, collect in table.
    # Afterwards when finished selecting rows of all clusters, delete those rows (UNDO)
    try:
        # Remove complete duplicates before full dedup
        self.remove_identical_rows(
            schema_id,
            table_name,
        )

        # SELECT * FROM "schema_name"."table";
        data_query = 'SELECT * FROM {}.{}'.format(
            *_ci(schema_name, table_name))
        df = pd.read_sql(data_query, con=db.engine)
        df = df.set_index('id')

        # The sorting key takes part in the exact comparison too.  Build a
        # new list rather than appending, so the caller's list stays as is.
        if sorting_key not in fixed_column_names:
            fixed_column_names = list(fixed_column_names) + [sorting_key]

        # Classify columns by dtype to pick a comparison method later.
        string_columns = list(df.select_dtypes(include=['object']).columns)
        numerical_columns = list(
            df.select_dtypes(include=['int64']).columns)
        numerical_columns.extend(
            list(df.select_dtypes(include=['float64']).columns))
        date_columns = list(
            df.select_dtypes(include=['datetime64[ns]']).columns)

        # Clean string values
        for column_name in string_columns:
            df[column_name] = clean(df[column_name])

        # Indexation step
        indexer = recordlinkage.SortedNeighbourhoodIndex(on=sorting_key,
                                                         window=3)
        pairs = indexer.index(df)

        # Comparison step
        compare_cl = recordlinkage.Compare()

        # Exact matches
        for column_name in fixed_column_names:
            compare_cl.exact(column_name, column_name, label=column_name)

        # Variable matches calculated using an alg
        # (levenshtein / numerical / date); columns of any other dtype are
        # not compared.
        for column_name in var_column_names:
            if column_name in numerical_columns:
                compare_cl.numeric(column_name, column_name,
                                   method='linear', offset=10, scale=10)
            elif column_name in date_columns:
                compare_cl.date(column_name, column_name)
            elif column_name in string_columns:
                compare_cl.string(column_name, column_name, method=alg,
                                  threshold=0.75, label=column_name)

        potential_pairs = compare_cl.compute(pairs, df)

        # Classification step
        kmeans = recordlinkage.KMeansClassifier()
        kmeans.learn(potential_pairs)
        matches = kmeans.predict(potential_pairs)

        if len(matches) == 0:
            return False

        # Grouping step
        # Group matches (A,B), (B,C) into (A,B,C)
        groups = self.group_matches(matches)

        # TODO Create table _dedup_table_groups
        self.create_duplicate_table(schema_id, table_name, groups)
        return True

    except Exception as e:
        app.logger.error(
            "[ERROR] Unable to generate clusters of duplicate rows from table '{}'"
            .format(dedup_table_name))
        app.logger.exception(e)
        # Bare raise keeps the original exception and traceback.
        raise
del data["Total_Score"] del data["rec_id"] del data["rec_id.1"] ###calculate known matches, then delete for classification del data['rec_num'] data.to_csv('feature_vectors_clean.csv', sep=",", encoding='utf-8') return data prepData() ####Evaluate vector scoring methodology########################################################################################### known_matches = knownMatches() missed_matches = missedMatches() false_positives = findFalsePos() pairs = ScoreRecords() print "number of comparison pairs in index:", len(pairs) print "number of matching pairs:", known_matches print "number of missed matches:", missed_matches print "number of false positives:", false_positives #supervised####################################################################################################### #unsupervised methods############################################################################################# ###k-means#################################################### data = prepData() kmeans = rl.KMeansClassifier() result_kmeans = kmeans.learn(data) print 'number of predicted pairs using K-means clustering:', len(result_kmeans) ###ECM Maximization########################################### ecm = rl.ECMClassifier() result_ecm = ecm.learn((data > 0.8).astype(int)) print 'the number of predicted pairs using ECM Maximization:', len(result_ecm)
def linkDB(df1, df2, type, classifier):
    """Link two restaurant tables and merge matched rows into one table.

    Args:
        df1: Left DataFrame; columns are accessed with a ``0_`` prefix
            (``0_restaurant``, ``0_neighborhood``, ``0_addressGoogle``).
        df2: Right DataFrame; same columns with a ``1_`` prefix.
        type: Indexing strategy: ``"sortedneighbourhood"``, ``"full"`` or
            ``"block"``.  (The name shadows the builtin but is kept for
            callers that pass it by keyword.)
        classifier: ``"ecm"`` or ``"kmeans"``.

    Returns:
        A DataFrame holding the unmatched rows of ``df1``, the unmatched
        rows of ``df2`` and one combined row per matched pair, with a fresh
        0..n-1 index.

    Raises:
        ValueError: If ``type`` or ``classifier`` is not a supported name.

    Side effects:
        Reads ``result.json`` (manual annotations for the first candidate
        pairs) and prints the confusion matrix and accuracy measured
        against it.
    """
    # Fail early on bad arguments instead of part-way through the pipeline.
    if type not in ("sortedneighbourhood", "full", "block"):
        raise ValueError("Unknown indexing type {!r}".format(type))
    if classifier not in ("ecm", "kmeans"):
        raise ValueError("Unknown classifier {!r}".format(classifier))

    # 1 - INDEXING
    indexer = recordlinkage.Index()
    if type == "sortedneighbourhood":
        indexer.sortedneighbourhood(left_on="0_restaurant",
                                    right_on="1_restaurant")
    elif type == "full":
        indexer.full()
    else:
        indexer.block(left_on="0_addressGoogle", right_on="1_addressGoogle")
    candidate_links = indexer.index(df1, df2)
    test_pairs = candidate_links[0:100]

    # The annotation file was produced once with the snippet below, see
    # https://recordlinkage.readthedocs.io/en/latest/annotation.html
    #
    #   df1.columns = df1.columns.str.replace(r'0_', '')
    #   df2.columns = df2.columns.str.replace(r'1_', '')
    #   recordlinkage.write_annotation_file(
    #       "check_matches.json", candidate_links[0:100], df1, df2,
    #       dataset_a_name="firstDF", dataset_b_name="secondDF")
    #   df1 = df1.add_prefix('0_')
    #   df2 = df2.add_prefix('1_')
    annotations = recordlinkage.read_annotation_file('result.json')

    # 2 - COMPARISON
    comp = recordlinkage.Compare()
    comp.string('0_restaurant', '1_restaurant', threshold=0.95,
                method='jarowinkler', label='ristorante')
    comp.string('0_neighborhood', '1_neighborhood', method='jarowinkler',
                threshold=0.85, label='quartiere')
    comp.exact('0_addressGoogle', '1_addressGoogle', label='indirizzoGoogle')
    features = comp.compute(candidate_links, df1, df2)
    test_features = comp.compute(test_pairs, df1, df2)

    # 3 - CLASSIFICATION
    # https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html#unsupervised
    if classifier == "ecm":
        model = recordlinkage.ECMClassifier(init='jaro', binarize=None,
                                            max_iter=100, atol=0.0001,
                                            use_col_names=True)
    else:
        model = recordlinkage.KMeansClassifier()
    # fit_predict trains the classifier and returns the predicted links.
    links = model.fit_predict(features)

    # Each matched pair becomes one wide row: left values then right values.
    matches = [tuple(df1.loc[i]) + tuple(df2.loc[j]) for i, j in links]
    # Labels of rows that take part in at least one match, de-duplicated.
    drop1 = links.get_level_values(0).unique()
    drop2 = links.get_level_values(1).unique()

    # Passing the columns to the constructor also works with zero matches.
    head = tuple(df1.columns) + tuple(df2.columns)
    matches_result = pd.DataFrame(matches, columns=head)

    # pd.concat replaces DataFrame.append (removed in pandas 2.0);
    # ignore_index gives the 0..n-1 index the old code assigned by hand.
    result = pd.concat(
        [df1.drop(drop1, axis=0), df2.drop(drop2, axis=0), matches_result],
        ignore_index=True)

    # 4 - EVALUATION
    # Both classifiers are evaluated with the model fitted on all features.
    # (The k-means branch used to re-fit on the test pairs only.)
    test_matches = model.predict(test_features)
    total = len(test_pairs)  # 100 unless there are fewer candidate pairs
    cm = recordlinkage.confusion_matrix(annotations.links, test_matches,
                                        total=total)
    acc = recordlinkage.accuracy(annotations.links, test_matches,
                                 total=total)
    print(cm, acc)

    return result
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing / preprocessing / comparison setup through the classifiers.

    Uses names that are not parameters and must exist in the enclosing
    scope: ``dataDBLP_train``, ``dataScholar_train``, ``dataDBLP_test``,
    ``dataScholar_test``, ``links_train``, ``debug``, ``compare`` and
    ``print_experiment_evaluation``.

    Args:
        win_len: ``0`` selects blocking on ``year``; a positive value is
            the window of a sorted-neighbourhood index on ``year``.
        preproc: Preprocessing variant 0-5; selects the field-name suffix.
        comparison_variant: 0 for exact comparison, 1-7 for a string
            similarity method.
        run_only: If given, only the classifier with this name is run.

    Returns:
        A list with one tuple per classifier that was run:
        ``(index, preprocessing, comparison, classifier, matches,
        compare_ms, train_ms, classify_ms)``.

    Raises:
        ValueError: For an invalid window length or an unknown
            preprocessing / comparison variant.
    """
    # --- indexing -------------------------------------------------------
    if win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    elif win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    else:
        raise ValueError(f"Invalid window length {win_len}")

    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # --- preprocessing --------------------------------------------------
    # variant -> (message, field suffix, description)
    preproc_variants = {
        0: ("No preprocesing", "", "none"),
        1: ("Cleaned fields", "_clean", "clean"),
        2: ("Soundex encoding", "_soundex", "soundex"),
        3: ("Nysiis encoding", "_nysiis", "nysiis"),
        4: ("Metaphone encoding", "_metaphone", "metaphone"),
        5: ("Match-rating encoding", "_match_rating", "match_rating"),
    }
    if preproc not in preproc_variants:
        raise ValueError(f"Unknown preprocessing variant {preproc}")
    preproc_message, field_suffix, preproc_description = \
        preproc_variants[preproc]
    print(preproc_message)
    print(f"Preprocessing used: {preproc_description}")

    # --- comparator -----------------------------------------------------
    # Variants 1-7 differ only in the string similarity method.
    string_methods = {
        1: "levenshtein",
        2: "damerau_levenshtein",
        3: "jaro",
        4: "jarowinkler",
        5: "qgram",
        6: "cosine",
        7: "smith_waterman",
    }
    compared_fields = [
        name + field_suffix for name in ('title', 'authors', 'venue')
    ]

    comp = recordlinkage.Compare()
    if comparison_variant == 0:
        comp_description = "exact"
        for field in compared_fields:
            comp.add(compare.Exact(field, field))
    elif comparison_variant in string_methods:
        comp_description = string_methods[comparison_variant]
        for field in compared_fields:
            comp.add(compare.String(field, field, method=comp_description))
    else:
        raise ValueError(f"Unknown comparison variant {comparison_variant}")
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    started = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train,
                                dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - started))

    print("Start compare for test data set")
    started = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - started
    print("Compare on test data took %.2fs" % (time_compare))

    # --- classification -------------------------------------------------
    # (name, message, factory, supervised) in the order they are run.  The
    # factories are only called after the message has been printed.
    classifier_variants = (
        ('logreg', "Logistic Regression classifier",
         lambda: recordlinkage.LogisticRegressionClassifier(), True),
        ('bayes', "Naive Bayes classifier",
         lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75), True),
        ('svm', "Support Vector Machine classifier",
         lambda: recordlinkage.SVMClassifier(), True),
        ('kmeans', "KMeans classifier",
         lambda: recordlinkage.KMeansClassifier(), False),
        ('ecm', "ECM classifier",
         lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
    )

    matches = []
    for classifier_description, message, make_classifier, supervised \
            in classifier_variants:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue

        print(message)
        classifier = make_classifier()

        if supervised:
            # Train on the labelled training pairs, classify the test pairs.
            started = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - started

            started = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - started
        else:
            # Unsupervised: fit and predict directly on the test pairs.
            started = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - started
            time_train = 0

        # Timings are stored in milliseconds.
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        # NOTE(review): the evaluation printout is treated as debug-only
        # output here -- confirm against the original layout.
        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches
def kmeans_classifier(features):
    """Fit a fresh KMeansClassifier on ``features`` and return its prediction.

    Args:
        features: Comparison vectors handed to ``fit_predict``.

    Returns:
        Whatever ``KMeansClassifier.fit_predict`` returns for ``features``.
    """
    return rl.KMeansClassifier().fit_predict(features)