def test_ecm_init_jaro_1value(self):
    """Fit a 'jaro'-initialised ECM on data whose first feature is constant.

    The first feature is simulated with m = u = 1.0. The test expects that
    looking up value 0 for that feature in ``m_probs`` raises ``KeyError``,
    and that the estimated m, u and p parameters are close to the
    simulated ones.
    """
    simulated_m = np.array([1.0, 0.85, .85, .81, .85, .81])
    simulated_u = np.array([1.0, .10, .50, .23, .30, 0.13])

    # Simulated training vectors; the returned true links are not needed.
    X_train, _ = binary_vectors(
        1000,
        500,
        m=simulated_m,
        u=simulated_u,
        random_state=535,
        return_links=True,
    )

    classifier = rl.ECMClassifier(init='jaro')
    classifier.fit(X_train)
    classifier.predict(X_train)

    # Value 0 is expected to be absent for the constant first feature.
    with pytest.raises(KeyError):
        classifier.m_probs['c_1'][0]

    assert math.isclose(classifier.m_probs['c_1'][1], 1.0, abs_tol=0.01)
    assert math.isclose(classifier.m_probs['c_2'][1], 0.85, abs_tol=0.08)
    assert math.isclose(classifier.u_probs['c_1'][1], 1.0, abs_tol=0.01)
    assert math.isclose(classifier.u_probs['c_2'][1], 0.1, abs_tol=0.05)
    assert math.isclose(classifier.p, 0.5, abs_tol=0.05)
def em_classifier(features):
    """Classify comparison vectors with an ECM model binarised at 0.85.

    Returns a 2-tuple: the result of ``fit_predict(features)`` and a
    DataFrame built from ``prob(features)`` whose single column is
    named ``score``.
    """
    model = rl.ECMClassifier(binarize=0.85)
    predicted_matches = model.fit_predict(features)

    score_frame = pd.DataFrame(model.prob(features))
    score_frame.columns = ['score']

    return predicted_matches, score_frame
def test_binarize_input(self):
    """Check that ``ECMClassifier(binarize=True)`` handles non-binary input.

    Train and test vectors come from ``binary_vectors`` and are multiplied
    element-wise by ``np.random.rand`` draws, so the classifier receives
    non-binary values. The test only checks that ``fit`` and ``predict``
    run without raising.
    """
    simulated_m = np.array([1, .81, .85, .81, .85, .81])
    simulated_u = np.array([1, .23, .50, .23, .30, 0.13])

    def noisy_vectors():
        # Simulated comparison vectors scaled by draws from the global
        # NumPy random generator; the returned true links are discarded.
        vectors, _ = binary_vectors(
            1000,
            500,
            m=simulated_m,
            u=simulated_u,
            random_state=535,
            return_links=True,
        )
        return vectors * np.random.rand(*vectors.shape)

    # Train set first, test set second: the random draws keep their order.
    X_train = noisy_vectors()
    X_test = noisy_vectors()

    classifier = rl.ECMClassifier(binarize=True)
    classifier.fit(X_train)
    classifier.predict(X_test)
def test_ecm_atol_none(self):
    """Fit an ECM with ``atol=None`` and check the u estimate of feature c_1.

    The first feature is simulated with u = 0, so the estimated
    probability of value 1 among non-matches should be close to 0 and
    that of value 0 close to 1.
    """
    simulation = dict(
        m=np.array([0.95, .81, .85, .81, .85, .81]),
        u=np.array([0, .23, .50, .23, .30, 0.13]),
        random_state=535,
        return_links=True,
    )

    # Large training set and a smaller test set from the same parameters.
    X_train, _ = binary_vectors(10000, 500, **simulation)
    X_test, _ = binary_vectors(1000, 500, **simulation)

    classifier = rl.ECMClassifier(atol=None)
    classifier.fit(X_train)
    classifier.predict(X_test)

    assert math.isclose(classifier.u_probs['c_1'][1], 0.0, abs_tol=1e-3)
    assert math.isclose(classifier.u_probs['c_1'][0], 1.0, abs_tol=1e-3)
def test_em(self):
    """Run learn/predict/prob on rounded fixture data and check ``p`` is set."""
    classifier = recordlinkage.ECMClassifier()

    # Rounding is applied to the fixture data before every call.
    classifier.learn(self.y_train.round())
    for method in (classifier.predict, classifier.prob):
        method(self.y.round())

    assert classifier.p is not None
def test_cora(self):
    """Train an ECM classifier on the Cora train split and log its quality.

    The classifier is fitted on the training features, then used to
    predict on the train, validation and test features; quality results
    are logged for each. Finally ``1 - prob`` is computed for every test
    pair and passed to ``InformationRetrievalMetrics`` for logging.
    """
    logger = get_logger('RL.Test.ECMClassifier.CORA')

    # Read train data in dataset A & B
    cora = Cora()

    def extract_features(log_format, links, data_a, data_b):
        # Build a fresh comparison object, compute the features for the
        # given candidate links and log their summary statistics.
        comparer = cora.get_comparision_object()
        computed = comparer.compute(links, data_a, data_b)
        logger.info(log_format, str(computed.describe()))
        return computed

    # Train the ECM classifier
    train_features = extract_features(
        "Train Features %s", cora.candidate_links, cora.trainDataA,
        cora.trainDataB)
    classifier = recordlinkage.ECMClassifier()
    classifier.fit(train_features)
    log_quality_results(logger, classifier.predict(train_features),
                        cora.true_links, len(cora.candidate_links))

    # Validate the classifier
    val_features = extract_features(
        "Validation Features %s", cora.val_links, cora.valDataA,
        cora.valDataB)
    log_quality_results(logger, classifier.predict(val_features),
                        cora.true_val_links, len(cora.val_links))

    # Test the classifier
    test_features = extract_features(
        "Test Features %s", cora.test_links, cora.testDataA, cora.testDataB)
    log_quality_results(logger, classifier.predict(test_features),
                        cora.true_test_links, len(cora.test_links))

    # Log IR stats: MRR, MAP, MP@K
    probabilities = classifier.prob(test_features).tolist()
    scored_pairs = [
        (cora.test_links[i][0], cora.test_links[i][1], 1 - p)
        for i, p in enumerate(probabilities)
    ]
    InformationRetrievalMetrics(scored_pairs,
                                cora.true_test_links).log_metrics(logger)
def test_ecm_init(self):
    """Fit a randomly initialised ECM and check the m estimate of c_2.

    The fitted m/u probabilities and their log forms are printed before
    the single assertion.
    """
    simulated_m = np.array([0.23, .81, .85, .81, .85, .81])
    simulated_u = np.array([0.34, .23, .50, .23, .30, 0.13])

    # Simulated training vectors; the returned true links are not needed.
    X_train, _ = binary_vectors(
        1000,
        500,
        m=simulated_m,
        u=simulated_u,
        random_state=535,
        return_links=True,
    )

    classifier = rl.ECMClassifier(init='random')
    classifier.fit(X_train)
    classifier.predict(X_train)

    for attribute in ('m_probs', 'log_m_probs', 'u_probs', 'log_u_probs'):
        print(getattr(classifier, attribute))

    assert math.isclose(classifier.m_probs['c_2'][1], 0.85, abs_tol=0.08)
# Simulation settings: number of candidate links, number of true links
# and the per-feature m/u probabilities used to generate the vectors.
n_pairs = 50000
n_matches = 7000
m_simulate = np.array([.94, .81, .85, .90, .99, .70, .56, .92])
u_simulate = np.array([.19, .23, .50, .11, .20, .14, .50, .09])

# Create the dataset and return the true links (fixed seed).
X_data, links_true = binary_vectors(
    n_pairs,
    n_matches,
    m=m_simulate,
    u=u_simulate,
    random_state=535,
    return_links=True,
)

# Initialise and train the Expectation-Conditional Maximisation classifier.
cl = rl.ECMClassifier()
cl.fit(X_data)

# Print the trained parameters (m, u and p) and the derived weights.
for label, value in (
    ("p probability P(Match):", cl.p),
    ("m probabilities P(x_i=1|Match):", cl.m_probs),
    ("u probabilities P(x_i=1|Non-Match):", cl.u_probs),
    ("log m probabilities P(x_i=1|Match):", cl.log_m_probs),
    ("log u probabilities P(x_i=1|Non-Match):", cl.log_u_probs),
    ("log weights of features:", cl.log_weights),
    ("weights of features:", cl.weights),
):
    print(label, value)

# Evaluate the model on the data it was trained on.
links_pred = cl.predict(X_data)
print("Predicted number of links:", len(links_pred))
def test_ecm(self):
    """Train an ECM classifier on the Census data, log quality and export results.

    Steps:
      1. Fit on the training features, then predict and log quality on the
         train, validation and test splits.
      2. Compute ``1 - prob`` for every test pair and log IR metrics.
      3. Translate record labels into positions in the ``entitiesA`` /
         ``entitiesB`` name lists and hand the translated results to the
         export helpers.

    Position lookups use dictionaries that map each entity name to the
    first position at which it occurs (the same position ``list.index``
    would return), instead of scanning the name lists for every pair.
    A name that is missing from the lists therefore raises ``KeyError``
    rather than ``ValueError``.
    """
    logger = get_logger('RL.Test.ECMClassifier.Census')
    census = Census()

    # Train the ECM classifier
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.candidate_links, census.trainDataA,
                                  census.trainDataB)
    logger.info("Train Features %s", str(features.describe()))
    ecm = recordlinkage.ECMClassifier()
    ecm.fit(features)
    result = ecm.predict(features)
    log_quality_results(logger, result, census.true_links,
                        len(census.candidate_links))

    # Validate the classifier
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.val_links, census.valDataA,
                                  census.valDataB)
    logger.info("Validation Features %s", str(features.describe()))
    result = ecm.predict(features)
    log_quality_results(logger, result, census.true_val_links,
                        len(census.val_links))

    # Test the classifier
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.test_links, census.testDataA,
                                  census.testDataB)
    logger.info("Test Features %s", str(features.describe()))
    result = ecm.predict(features)
    log_quality_results(logger, result, census.true_test_links,
                        len(census.test_links))

    logger.info("ECM weights: %s", str(ecm.weights))

    # Log IR stats: MRR, MAP, MP@K
    prob_series = ecm.prob(features)
    result_prob = [
        (census.test_links[i][0], census.test_links[i][1], 1 - p)
        for i, p in enumerate(prob_series.tolist())
    ]
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             census.true_test_links)
    ir_metrics.log_metrics(logger)

    # Export false positives and result probabilities.
    # Feature values (as strings) for every scored pair that was predicted
    # as a match.
    result_feature_mapping = [
        (e1, e2, [str(v) for v in features.loc[(e1, e2)].values], d)
        for (e1, e2, d) in result_prob if (e1, e2) in result
    ]

    id_column = census.field_map[CensusFields.ID_INDIVIDUAL]
    dni_column = census.field_map[CensusFields.DNI]

    def entity_name(row):
        # "<individual id>_<DNI>" for one record row.
        return "_".join([str(row[id_column]), str(row[dni_column])])

    def first_positions(names):
        # Map each name to the first position where it occurs, matching
        # what list.index(name) returns.
        positions = {}
        for position, name in enumerate(names):
            positions.setdefault(name, position)
        return positions

    start_time = timeit.default_timer()
    entitiesA = [
        entity_name(census.testDataA.iloc[i])
        for i in range(int(census.testDataA.shape[0]))
    ]
    entitiesB = [
        entity_name(census.testDataB.iloc[i])
        for i in range(int(census.testDataB.shape[0]))
    ]
    positions_a = first_positions(entitiesA)
    positions_b = first_positions(entitiesB)
    logger.info("Entities built in %s",
                str(timeit.default_timer() - start_time))

    def to_positions(a, b):
        # Translate a pair of record labels into positions in
        # entitiesA / entitiesB via the records' entity names.
        return (positions_a[entity_name(census.testDataA.loc[int(a)])],
                positions_b[entity_name(census.testDataB.loc[int(b)])])

    start_time = timeit.default_timer()
    result_prob = [to_positions(a, b) + (p,) for (a, b, p) in result_prob]
    logger.info("Result prob in %s", str(timeit.default_timer() - start_time))

    start_time = timeit.default_timer()
    true_links = [to_positions(a, b) for (a, b) in census.true_test_links]
    logger.info("true_links in %s", str(timeit.default_timer() - start_time))

    start_time = timeit.default_timer()
    export_result_prob(Census, 'ECM', 'census', 'ecm', entitiesA, result_prob,
                       true_links, entitiesB)
    logger.info("Result prob EXPORTED in %s",
                str(timeit.default_timer() - start_time))

    start_time = timeit.default_timer()
    result = [to_positions(a, b) for (a, b) in result]
    export_false_negatives(Census, 'ECM', 'census', 'ecm', entitiesA,
                           result_prob, true_links, result, entitiesB)
    export_false_positives(Census, 'ECM', 'census', 'ecm', entitiesA,
                           result_prob, true_links, result, entitiesB)
    logger.info("FP & FN EXPORTED in %s",
                str(timeit.default_timer() - start_time))

    result_feature_mapping = [
        to_positions(a, b) + (w, p)
        for (a, b, w, p) in result_feature_mapping
    ]
    export_human_readable_results(Census, 'ECM', 'census', 'ecm', entitiesA,
                                  result_feature_mapping, entitiesB)
    logger.info("Exported Human Readable Results")
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing / preprocessing / comparison setup over all classifiers.

    Parameters:
        win_len: 0 selects a block index on 'year'; a positive value selects
            a sorted-neighbourhood index on 'year' with that window.
        preproc: integer 0-5 choosing the field suffix (preprocessed
            variant of the title/authors/venue fields) to compare.
        comparison_variant: 0 for exact comparison, 1-7 for a string
            comparison method.
        run_only: optional classifier key ('logreg', 'bayes', 'svm',
            'kmeans' or 'ecm'); when given, only that classifier is run.

    Returns:
        A list with one tuple per classifier run: (index description,
        preprocessing description, comparison description, classifier key,
        predicted matches, compare time, train time, classify time), with
        the three times multiplied by 1000.

    Raises:
        ValueError: for an invalid window length, preprocessing variant or
            comparison variant.

    Reads the module-level names ``dataDBLP_train``, ``dataScholar_train``,
    ``dataDBLP_test``, ``dataScholar_test``, ``links_train`` and ``debug``.
    """

    def lookup(table, key):
        # Linear scan using ``==`` so key matching behaves like an
        # if/elif chain of equality tests; None when nothing matches.
        for candidate, entry in table:
            if key == candidate:
                return entry
        return None

    # window length
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    elif win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    else:
        raise ValueError(f"Invalid window length {win_len}")

    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # preprocessing: (message, field suffix, description) per variant
    preproc_variants = (
        (0, ("No preprocesing", "", "none")),
        (1, ("Cleaned fields", "_clean", "clean")),
        (2, ("Soundex encoding", "_soundex", "soundex")),
        (3, ("Nysiis encoding", "_nysiis", "nysiis")),
        (4, ("Metaphone encoding", "_metaphone", "metaphone")),
        (5, ("Match-rating encoding", "_match_rating", "match_rating")),
    )
    chosen_preproc = lookup(preproc_variants, preproc)
    if chosen_preproc is None:
        raise ValueError(f"Unknown preprocessing variant {preproc}")
    preproc_message, field_suffix, preproc_description = chosen_preproc
    print(preproc_message)
    print(f"Preprocessing used: {preproc_description}")

    # comparator: the same three fields are compared in every variant
    string_methods = (
        (1, "levenshtein"),
        (2, "damerau_levenshtein"),
        (3, "jaro"),
        (4, "jarowinkler"),
        (5, "qgram"),
        (6, "cosine"),
        (7, "smith_waterman"),
    )
    compared_fields = [
        'title' + field_suffix,
        'authors' + field_suffix,
        'venue' + field_suffix,
    ]
    comp = recordlinkage.Compare()
    if comparison_variant == 0:
        comp_description = "exact"
        for field in compared_fields:
            comp.add(compare.Exact(field, field))
    else:
        comp_description = lookup(string_methods, comparison_variant)
        if comp_description is None:
            raise ValueError(
                f"Unknown comparison variant {comparison_variant}")
        for field in compared_fields:
            comp.add(compare.String(field, field, method=comp_description))
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train,
                                dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - start))

    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - start
    print("Compare on test data took %.2fs" % (time_compare))

    # classifier key -> (message, factory, needs labelled training data)
    classifier_specs = {
        'logreg': ("Logistic Regression classifier",
                   lambda: recordlinkage.LogisticRegressionClassifier(),
                   True),
        'bayes': ("Naive Bayes classifier",
                  lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75),
                  True),
        'svm': ("Support Vector Machine classifier",
                lambda: recordlinkage.SVMClassifier(), True),
        'kmeans': ("KMeans classifier",
                   lambda: recordlinkage.KMeansClassifier(), False),
        'ecm': ("ECM classifier",
                lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
    }

    matches = []
    for classifier_description in ['logreg', 'bayes', 'svm', 'kmeans', 'ecm']:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue

        spec = classifier_specs.get(classifier_description)
        if spec is None:
            raise ValueError(
                f"Unknown classifier variant {classifier_description}")
        classifier_message, make_classifier, supervised = spec
        print(classifier_message)
        classifier = make_classifier()

        if supervised:
            # Train on the labelled training pairs, then classify the
            # test pairs; both phases are timed separately.
            start = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - start
            start = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - start
        else:
            # Unsupervised classifiers are fitted directly on the test
            # comparison vectors, so no separate training time exists.
            start = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - start
            time_train = 0

        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        # NOTE(review): the original indentation was lost; the evaluation
        # call is assumed to belong to the debug branch - confirm.
        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))

    return matches
def test_ecm_predict(self):
    """Fit an ECM on the rounded training data and predict on the test data."""
    rounded_train = self.X_train.round()

    classifier = rl.ECMClassifier()
    classifier.fit(rounded_train)
    classifier.predict(self.X_test)
# Pairs with exactly three agreeing links where Company, Corporate Family
# and Phone Number all agree.
bcd_mask = (
    (features['totallinks'] == 3)
    & (features['Company'] == 1)
    & (features['Corporate Family'] == 1)
    & (features['Phone Number'] == 1)
)
featuresbcd3 = features[bcd_mask]

# Pairs where all four links agree.
featuresabcd = features[features['totallinks'] == 4]

# Write each subset to its own Excel workbook.
featuresabc3.to_excel(
    r'/Users/Adam/Desktop/featuresabc.xlsx',
    sheet_name='featuresabc',
    index=False,
)
featuresabd3.to_excel(
    r'/Users/Adam/Desktop/featuresabd.xlsx',
    sheet_name='featuresabd',
    index=False,
)
featuresbcd3.to_excel(
    r'/Users/Adam/Desktop/featuresbcd.xlsx',
    sheet_name='featuresbcd',
    index=False,
)
featuresabcd.to_excel(
    r'/Users/Adam/Desktop/featuresabcd.xlsx',
    sheet_name='featuresabcd',
    index=False,
)

# %%
# Unsupervised ECM classification of all feature vectors; the predicted
# pairs are flattened into columns and exported.
ecm = recordlinkage.ECMClassifier()
matchdf = ecm.fit_predict(features)
matchdffinal = matchdf.to_frame(index=False)
matchdffinal.to_excel(
    r'/Users/Adam/Desktop/HEALTHCAREMATCHES.xlsx',
    sheet_name='HEALTHCAREMATCHES',
    index=False,
)
def test_ecm_probs(self):
    """Fit an ECM on the rounded training data and check ``p`` lies in [0, 1]."""
    classifier = rl.ECMClassifier()
    classifier.fit(self.X_train.round())

    not_above_one = classifier.p <= 1.0
    not_below_zero = classifier.p >= 0.0
    assert not_above_one & not_below_zero
# See the outcome
df_comparison_results.head()
df_comparison_results[df_comparison_results.sum(axis=1) > 3].head()

# Use an unsupervised technique on all features except the one on which
# blocking is done.
list_features = ['SUBURB', 'STATE', 'SURNAME', 'DATE_OF_BIRTH', 'ADDRESS_1']

# Select the columns and cast them to int in a single expression. This
# yields a fresh frame, so nothing is assigned back into a column subset
# of the original frame (which triggers pandas' chained-assignment
# warning).
df_comparison_results = df_comparison_results[list_features].astype(int)
df_comparison_results.head()

# Build model object
classifier = recordlinkage.ECMClassifier()

# Train
classifier.fit(df_comparison_results)

# Predict
pred = classifier.predict(df_comparison_results)

# Convert to a DataFrame for readability
df = pd.DataFrame([pred]).transpose()
df.head()

# Free the names used by this section before the next one starts.
del (train, list_text_data, list_int_data, indexer, candidate_links,
     compare_rl, df_comparison_results, list_features, classifier, pred, df)

#%% Entity resolution / deduplication from two tables
# https://recordlinkage.readthedocs.io/en/latest/notebooks/link_two_dataframes.html
def linkDB(df1, df2, type, classifier):
    """Link two restaurant tables and return the merged, de-duplicated table.

    Parameters:
        df1: left table; columns are expected to carry the ``0_`` prefix
            (``0_restaurant``, ``0_neighborhood``, ``0_addressGoogle``).
        df2: right table with the matching ``1_``-prefixed columns.
        type: indexing strategy: "sortedneighbourhood" (on the restaurant
            name), "full" or "block" (on the Google address). The name
            shadows the builtin but is kept for existing callers.
        classifier: "ecm" or "kmeans".

    Returns:
        A DataFrame with a fresh 0..n-1 index containing the rows of
        ``df1`` and ``df2`` that were not part of any predicted match,
        followed by one combined row (df1 values + df2 values) per
        predicted match.

    Raises:
        ValueError: if ``classifier`` is not "ecm" or "kmeans".

    Side effects: reads the annotation file 'result.json' and prints the
    confusion matrix and accuracy of the classifier on the first 100
    candidate pairs.
    """
    if classifier not in ("ecm", "kmeans"):
        # Fail fast with a clear message; an unknown classifier could
        # never produce a result further down.
        raise ValueError(f"Unknown classifier {classifier!r}; "
                         "expected 'ecm' or 'kmeans'")

    # 1 - INDEXING
    indexer = recordlinkage.Index()
    if type == "sortedneighbourhood":
        indexer.sortedneighbourhood(left_on="0_restaurant",
                                    right_on="1_restaurant")
    elif type == "full":
        indexer.full()
    elif type == "block":
        indexer.block(left_on="0_addressGoogle", right_on="1_addressGoogle")
    candidate_links = indexer.index(df1, df2)
    test_pairs = candidate_links[0:100]

    # The annotation file is read back here. An annotation file for these
    # first 100 candidate pairs can be produced with
    # recordlinkage.write_annotation_file after stripping the 0_/1_
    # prefixes from the columns, see
    # https://recordlinkage.readthedocs.io/en/latest/annotation.html
    annotations = recordlinkage.read_annotation_file('result.json')

    # 2 - COMPARISON
    comp = recordlinkage.Compare()
    comp.string('0_restaurant', '1_restaurant', threshold=0.95,
                method='jarowinkler', label='ristorante')
    comp.string('0_neighborhood', '1_neighborhood', method='jarowinkler',
                threshold=0.85, label='quartiere')
    comp.exact('0_addressGoogle', '1_addressGoogle', label='indirizzoGoogle')
    features = comp.compute(candidate_links, df1, df2)
    test_features = comp.compute(test_pairs, df1, df2)

    # 3 - CLASSIFICATION
    # https://recordlinkage.readthedocs.io/en/latest/ref-classifiers.html#unsupervised
    if classifier == "ecm":
        model = recordlinkage.ECMClassifier(init='jaro', binarize=None,
                                            max_iter=100, atol=0.0001,
                                            use_col_names=True)
        model.fit_predict(features, match_index=None)  # train the classifier
    else:
        model = recordlinkage.KMeansClassifier()
        model.fit_predict(features)
    predicted = model.predict(features)

    # One combined record per predicted match, plus the matched labels of
    # each side (unique, in first-seen order) to drop from the inputs.
    matches = [tuple(df1.loc[i]) + tuple(df2.loc[j]) for i, j in predicted]
    drop1 = list(dict.fromkeys(i for i, _ in predicted))
    drop2 = list(dict.fromkeys(j for _, j in predicted))

    # Passing the columns to the constructor also works when no pair was
    # matched; assigning ``.columns`` on an empty frame would raise.
    head = list(df1.columns) + list(df2.columns)
    matches_result = pd.DataFrame(matches, columns=head)

    df1t = df1.drop(drop1, axis=0)
    df2t = df2.drop(drop2, axis=0)
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed;
    # ignore_index renumbers the rows 0..n-1.
    result = pd.concat([df1t, df2t, matches_result], ignore_index=True)

    # 4 - EVALUATION
    if classifier == "ecm":
        test_matches = model.predict(test_features)
    else:
        # NOTE(review): k-means is refitted on the test features here,
        # unlike ECM which reuses the fitted model - kept as in the
        # original, confirm this is intended.
        test_matches = model.fit_predict(test_features)
    cm = recordlinkage.confusion_matrix(annotations.links, test_matches,
                                        total=100)
    acc = recordlinkage.accuracy(annotations.links, test_matches, total=100)
    print(cm, acc)

    return result
import recordlinkage as rl
from recordlinkage.datasets import load_krebsregister

# Load the comparison vectors and the known true links; missing values
# are requested as 0.
krebs_X, krebs_true_links = load_krebsregister(missing_values=0)
print(krebs_true_links)

# Train the classifier on features binarised at 0.8 and predict links.
ecm = rl.ECMClassifier(binarize=0.8)
result_ecm = ecm.fit_predict(krebs_X)
len(result_ecm)

# Confusion matrix over all candidate pairs.
print(rl.confusion_matrix(krebs_true_links, result_ecm, total=len(krebs_X)))

# The F-score for this classification is
print(rl.fscore(krebs_true_links, result_ecm))

# Log weights of the trained model.
print(ecm.log_weights)
def link_reduce(from_rest: str, dfs: dict, window: int, th: float, classifier: str, thFusion: float) -> "pd.DataFrame":
    """Link every table in ``dfs`` against the others and merge the survivors.

    For each ordered pair of tables (left ``rest``, right ``rr``) the
    function indexes candidate pairs with sorted-neighbourhood indexing,
    compares them, classifies them, keeps only the matches accepted by
    ``combine`` and drops the matched rows from the right table. All
    remaining rows are concatenated at the end.

    Parameters:
        from_rest: key of the table that is processed first.
        dfs: mapping from a source name to its DataFrame. The frames are
            accessed with the columns 'restaurant', 'addressGoogle' and
            'neighborhood'.
        window: window passed to the sorted-neighbourhood indexer.
        th: threshold for the Jaro-Winkler string comparison; also
            forwarded to ``combine``.
        classifier: "ecm" or "kmeans". Any other value leaves ``matches``
            as None, and the loop over it then raises ``TypeError``.
        thFusion: forwarded to ``combine``.

    Returns:
        The concatenation of all remaining rows, without rows whose
        'addressGoogle' is missing and without exact duplicate rows.

    NOTE(review): ``dfs.copy()`` is a shallow dict copy and the
    ``from_rest`` frame is stored without copying, so the in-place
    ``drop`` below can modify frames owned by the caller - confirm that
    callers expect this.
    NOTE(review): the original indentation was lost; the nesting below is
    reconstructed from the data flow - confirm against the real file.
    """
    dfs_copy = {from_rest: dfs[from_rest]}
    dfs_reduce = dfs.copy()
    # Make copy of dfs with from_rest moved on top
    for rest, df in dfs.items():
        if rest == from_rest:
            continue
        else:
            dfs_copy[rest] = df.copy()
    for rest, df in dfs_copy.items():
        for rr, ddf in dfs_reduce.items():
            if rr == rest:
                # Never link a table against itself.
                continue
            else:
                columns_to_check = ['restaurant']
                print(f"{rest} -> {rr}")
                # A column is only used when it is not entirely null in
                # both the left and the right table.
                if df['addressGoogle'].isnull().sum() != len(df['addressGoogle']) and ddf['addressGoogle'].isnull().sum() != len(ddf['addressGoogle']):
                    columns_to_check.append('addressGoogle')
                if df['neighborhood'].isnull().sum() != len(df['neighborhood']) and ddf['neighborhood'].isnull().sum() != len(ddf['neighborhood']):
                    columns_to_check.append('neighborhood')
                #print(f"\tcheck: {columns_to_check}")
                indexer = recordlinkage.Index()
                # 1 - INDEXING
                # One sorted-neighbourhood rule is registered per usable column.
                for col in columns_to_check:
                    indexer.sortedneighbourhood(
                        left_on=col, right_on=col, window=window)
                candidate_links = indexer.index(df, ddf)
                # 2 - COMPARISON
                # Exact comparison for the address, Jaro-Winkler with
                # threshold ``th`` for every other column.
                compare_cl = recordlinkage.Compare(n_jobs=-1)
                for col in columns_to_check:
                    if col == 'addressGoogle':
                        compare_cl.exact(col, col)
                    else:
                        compare_cl.string(col, col, label=col, threshold=th, method='jarowinkler')
                features = compare_cl.compute(candidate_links, df, ddf)
                # 3 - CLASSIFICATION
                # Each classifier is fitted with fit_predict (result
                # discarded) and then asked to predict on the same features.
                matches = None
                if classifier == "ecm":
                    ecm = recordlinkage.ECMClassifier(
                        init='jaro', binarize=None, max_iter=100, atol=0.0001, use_col_names=True)
                    ecm.fit_predict(features)
                    matches = ecm.predict(features)
                elif classifier == "kmeans":
                    kmeans = recordlinkage.KMeansClassifier()
                    kmeans.fit_predict(features)
                    matches = kmeans.predict(features)
                # 4 - COMBINE INFORMATION
                # The loop iterates over the object bound to ``matches``
                # when it starts; pairs rejected by ``combine`` are removed
                # by rebinding ``matches`` to the result of ``drop``.
                # NOTE(review): ``combine`` is defined elsewhere and may
                # update the left row in place - confirm.
                for left, right in matches:
                    if not combine(df.loc[left], ddf.loc[right], thFusion, th):
                        matches = matches.drop((left, right))
                print(f"\tmatches: {len(matches)}")
                dfs_copy[rest] = df.copy()
                # 5 - DROP RIGHT ON MATCHES INDEX
                # Level 1 of each match is used as the right-hand row
                # label; those rows are removed from the right table in place.
                index_to_drop = set(matches.get_level_values(1))
                print(f"\t{rr} before drop: {len(ddf.index)}")
                ddf.drop(index_to_drop, inplace=True)
                # Both bookkeeping dicts receive their own copy of the
                # reduced right table (existing keys only, so the dicts
                # keep their size while being iterated).
                dfs_copy[rr] = ddf.copy()
                dfs_reduce[rr] = ddf.copy()
                print(f"\t{rr} after drop: {len(dfs_reduce[rr].index)}\n")
        # The left table is done: it is no longer used as a right table.
        del dfs_reduce[rest]
    final_df = pd.concat(list(dfs_copy.values()))
    final_df.dropna(subset=['addressGoogle'], inplace=True)
    final_df.drop_duplicates(inplace=True)
    return final_df
del data["Total_Score"] del data["rec_id"] del data["rec_id.1"] ###calculate known matches, then delete for classification del data['rec_num'] data.to_csv('feature_vectors_clean.csv', sep=",", encoding='utf-8') return data prepData() ####Evaluate vector scoring methodology########################################################################################### known_matches = knownMatches() missed_matches = missedMatches() false_positives = findFalsePos() pairs = ScoreRecords() print "number of comparison pairs in index:", len(pairs) print "number of matching pairs:", known_matches print "number of missed matches:", missed_matches print "number of false positives:", false_positives #supervised####################################################################################################### #unsupervised methods############################################################################################# ###k-means#################################################### data = prepData() kmeans = rl.KMeansClassifier() result_kmeans = kmeans.learn(data) print 'number of predicted pairs using K-means clustering:', len(result_kmeans) ###ECM Maximization########################################### ecm = rl.ECMClassifier() result_ecm = ecm.learn((data > 0.8).astype(int)) print 'the number of predicted pairs using ECM Maximization:', len(result_ecm)