def test_logistic_regression_manual(self):
    """Check LogisticRegressionClassifier with manually assigned parameters.

    The classifier first predicts with hand-set coefficients and intercept,
    is then trained, and finally the learned parameters are checked for the
    expected shape and type.
    """
    # Fixed seed so the hand-made parameters are reproducible.
    numpy.random.seed(535)
    n_features = self.y_train.shape[1]
    manual_coefficients = numpy.random.randn(n_features)
    manual_intercept = numpy.random.rand()

    classifier = recordlinkage.LogisticRegressionClassifier()

    # A freshly created classifier exposes no parameters yet.
    assert classifier.coefficients is None
    assert classifier.intercept is None

    # Predict using only the manually assigned parameters.
    classifier.coefficients = manual_coefficients
    classifier.intercept = manual_intercept
    classifier.predict(self.y)

    # Training after the manual assignment must still work.
    classifier.learn(self.y_train, self.matches_index)
    classifier.predict(self.y)

    learned = numpy.array(classifier.coefficients)
    assert learned.shape == (n_features, )
    assert isinstance(classifier.intercept, float)
def logreg_classifier(features, links_true, train_size=0.2, cv=None):
    """Classify record pairs with a logistic regression classifier.

    Parameters
    ----------
    features
        Comparison vectors; sliced positionally and passed to the
        classifier's ``fit``/``predict``/``prob``.
    links_true
        Known true links. Its ``index`` is intersected with
        ``features.index`` when ``cv`` is None; otherwise it is passed as
        the target to ``cross_val_predict``.
    train_size : float
        Fraction of the leading rows of ``features`` used for training when
        ``cv`` is None.
    cv
        Cross-validation specification forwarded to ``cross_val_predict``.
        When None a single train/predict pass is done instead.

    Returns
    -------
    tuple
        ``(matches, df_logreg_prob)``.
    """
    logreg = rl.LogisticRegressionClassifier()

    if cv is None:
        # Explicit set intersection; the ``&`` operator on indexes is
        # deprecated as a set operation in pandas.
        golden_match_index = features.index.intersection(links_true.index)
        train_index = int(len(features) * train_size)

        # Train on the leading slice only.
        logreg.fit(features[0:train_index], golden_match_index)

        # Predict the match status for all record pairs.
        matches = logreg.predict(features)
        df_logreg_prob = pd.DataFrame(logreg.prob(features))
        df_logreg_prob.columns = ['score']
    else:
        # ``cv`` must be passed by keyword: assuming this is scikit-learn's
        # cross_val_predict, the fourth positional parameter is ``groups``.
        df_results = cross_val_predict(
            logreg, features, links_true, cv=cv, method='predict')
        # NOTE(review): this relies on the result exposing ``.index`` --
        # confirm against the cross_val_predict in use.
        matches = df_results.index
        df_logreg_prob = cross_val_predict(
            logreg, features, links_true, cv=cv, method='predict_proba')

    return matches, df_logreg_prob
def test_logistic_regression_manual(self):
    """Check LogisticRegressionClassifier with manually assigned parameters.

    Predicts with hand-set coefficients and intercept, then fits the
    classifier and verifies the shape and type of the learned parameters.
    """
    # Fixed seed so the hand-made parameters are reproducible.
    np.random.seed(535)
    manual_coefficients = np.random.randn(self.X_train.shape[1])
    manual_intercept = np.random.rand()

    classifier = rl.LogisticRegressionClassifier()

    # Before fitting, the parameters are not exposed as attributes.
    for attribute in ('coefficients', 'intercept'):
        assert not hasattr(classifier, attribute)

    # Predict using only the manually assigned parameters.
    classifier.coefficients = manual_coefficients
    classifier.intercept = manual_intercept
    classifier.predict(self.X_test)

    # Fitting after the manual assignment must still work.
    classifier.fit(self.X_train, self.y_train)
    classifier.predict(self.X_test)

    learned = classifier.coefficients
    assert learned.shape == (self.X_train.shape[1], )
    assert isinstance(classifier.intercept, float)
def test_logistic_regression_basic(self):
    """Smoke-test fit, predict and prob of LogisticRegressionClassifier."""
    classifier = rl.LogisticRegressionClassifier()

    # Fit on the training data, then exercise both prediction entry points.
    classifier.fit(self.X_train, self.y_train)
    classifier.predict(self.X_test)
    classifier.prob(self.X_train)
def ProcessData(patientDataList, fetchedHospitalData):
    """Link patient records against a fetched hospital data file.

    Candidate pairs are produced by blocking on ``fatherName``, compared
    with exact matching on a fixed set of fields, and classified by a
    logistic regression classifier trained on the module-level
    ``golden_pairs`` / ``golden_matches_index``.

    Parameters
    ----------
    patientDataList
        Left-hand data set passed to the indexer and comparer.
    fetchedHospitalData : str
        File name of the CSV to read, appended to the fetched-data
        directory.

    Returns
    -------
    The links predicted by the classifier, or None when the comparison
    yields no feature rows.
    """
    # Read from the directory
    filelist = pd.read_csv(
        '/home/bizzzzzzzzzzzzu/Music/MedicalPortal/MedicPortal DataProcessing/FetchedData/'
        + fetchedHospitalData)

    # Indexation step
    indexer = p.Index()
    indexer.add(Block(left_on='fatherName', right_on='fatherName'))
    candidate_links = indexer.index(patientDataList, filelist)

    # Comparison step: exact match on each field, labelled by field name.
    compare_cl = p.Compare()
    compared_fields = (
        'name',
        'fatherName',
        'grandFatherName',
        'gender',
        'dateOfBirth',
        'dayOfBirth',
        'monthOfBirth',
        'yearOfBirth',
        'age',
    )
    for field in compared_fields:
        compare_cl.exact(field, field, label=field)

    features = compare_cl.compute(candidate_links, patientDataList, filelist)
    if features.empty:
        return None

    # Classification step: supervised logistic regression trained on the
    # golden pairs, then applied to the freshly computed features.
    classifier = p.LogisticRegressionClassifier()
    classifier.fit(golden_pairs, golden_matches_index)
    links = classifier.predict(features)
    return links
def test_logistic_regression_basic(self):
    """Smoke-test LogisticRegressionClassifier.

    Trains the classifier, predicts on a dataset and requests the
    probabilities for the same dataset.
    """
    classifier = recordlinkage.LogisticRegressionClassifier()

    # Train first, then exercise both prediction entry points.
    classifier.learn(self.y_train, self.matches_index)
    classifier.predict(self.y)
    classifier.prob(self.y)
def test_cora(self):
    """Train a logistic regression classifier on Cora and log its quality.

    The classifier is fitted on the training split, then evaluated on the
    training, validation and test splits. Finally the information-retrieval
    metrics are logged for the test split.
    """
    logger = get_logger('RL.Test.LogisticRegression.CORA')

    # Read train data in dataset A & B.
    cora = Cora()

    # Extract training features.
    features = cora.get_comparision_object().compute(
        cora.candidate_links, cora.trainDataA, cora.trainDataB)
    logger.info("Train Features %s", str(features.describe()))

    # Train the logistic regression classifier and score it on its own
    # training data.
    logrg = recordlinkage.LogisticRegressionClassifier()
    logrg.fit(features, cora.true_links)
    predicted = logrg.predict(features)
    log_quality_results(logger, predicted, cora.true_links,
                        len(cora.candidate_links))

    # Validate the classifier.
    features = cora.get_comparision_object().compute(
        cora.val_links, cora.valDataA, cora.valDataB)
    logger.info("Validation Features %s", str(features.describe()))
    predicted = logrg.predict(features)
    log_quality_results(logger, predicted, cora.true_val_links,
                        len(cora.val_links))

    # Test the classifier.
    features = cora.get_comparision_object().compute(
        cora.test_links, cora.testDataA, cora.testDataB)
    logger.info("Test Features %s", str(features.describe()))
    predicted = logrg.predict(features)
    log_quality_results(logger, predicted, cora.true_test_links,
                        len(cora.test_links))

    # Log IR stats (MRR, MAP, MP@K) on the test split. The value stored per
    # pair is one minus the probability reported by the classifier.
    result_prob = []
    for position, p in enumerate(logrg.prob(features).tolist()):
        pair = cora.test_links[position]
        result_prob.append((pair[0], pair[1], 1 - p))

    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             cora.true_test_links)
    ir_metrics.log_metrics(logger)
def init_model(classifier: str, num_features: int, **kwargs):
    """Instantiate the model that corresponds to a classifier key.

    Parameters
    ----------
    classifier : str
        One of the classifier keys defined in ``keys``.
    num_features : int
        Number of input features; forwarded to the perceptron and ensemble
        classifiers only.
    **kwargs
        Extra keyword arguments for the model constructor. For naive Bayes,
        logistic regression and linear SVM they override the defaults found
        in ``constants``.

    Returns
    -------
    The initialized model.

    Raises
    ------
    ValueError
        If ``classifier`` is not a supported key.
    """
    # Keys are compared by value: identity (``is``) on strings only works
    # when both sides happen to be the very same object.
    if classifier == keys.NAIVE_BAYES:
        # add `binarize` threshold if not already specified
        kwargs = {**constants.NAIVE_BAYES_PARAMS, **kwargs}
        model = rl.NaiveBayesClassifier(**kwargs)

    elif classifier == keys.LOGISTIC_REGRESSION:
        kwargs = {**constants.LOGISTIC_REGRESSION_PARAMS, **kwargs}
        model = rl.LogisticRegressionClassifier(**kwargs)

    elif classifier == keys.LINEAR_SVM:
        kwargs = {**constants.LINEAR_SVM_PARAMS, **kwargs}
        model = rl.SVMClassifier(**kwargs)

    elif classifier == keys.SVM:
        model = classifiers.SVCClassifier(**kwargs)

    elif classifier == keys.RANDOM_FOREST:
        model = classifiers.RandomForest(**kwargs)

    elif classifier == keys.SINGLE_LAYER_PERCEPTRON:
        model = classifiers.SingleLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.MULTI_LAYER_PERCEPTRON:
        model = classifiers.MultiLayerPerceptron(num_features, **kwargs)

    elif classifier == keys.VOTING_CLASSIFIER:
        model = classifiers.VotingClassifier(num_features, **kwargs)

    elif classifier == keys.GATED_CLASSIFIER:
        model = classifiers.GatedEnsembleClassifier(num_features, **kwargs)

    elif classifier == keys.STACKED_CLASSIFIER:
        model = classifiers.StackedEnsembleClassifier(num_features, **kwargs)

    else:
        err_msg = (
            f'Classifier not supported: {classifier}. '
            f'It should be one of {set(constants.CLASSIFIERS)}'
        )
        LOGGER.critical(err_msg)
        raise ValueError(err_msg)

    LOGGER.info('Model initialized: %s', model)

    return model
def _test_logistic_transh(self, dataset, params):
    """Train TransH embeddings and classify entity pairs with logistic regression.

    Note: Zero aligned pairs are returned, require fixation.

    ``dataset`` is called without arguments to build the model, and
    ``params`` supplies the TransH hyper-parameters plus ``epochs``.
    """
    model = dataset()
    logger = get_logger('RL.Test.LogisticTransH.' + str(model))
    (entity, relation, triples,
     entity_pairs, true_pairs) = model.get_er_model()

    # Hyper-parameters are forwarded to TransH under their own names.
    hyper_parameters = {
        name: params[name]
        for name in ('dimension', 'learning_rate', 'margin',
                     'regularizer_scale', 'batchSize')
    }
    transh = TransH(entity, relation, triples, entity_pairs,
                    **hyper_parameters)
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    # One array per entity; both sides of the comparison use the same
    # embeddings.
    raw_embeddings = transh.get_ent_embeddings()
    embedding_rows = [
        np.array(raw_embeddings[row])
        for row in range(raw_embeddings.shape[0])
    ]
    trainDataA = pd.DataFrame(data=embedding_rows)
    trainDataB = pd.DataFrame(data=embedding_rows)

    # Compare every embedding dimension numerically.
    compare_cl = recordlinkage.Compare()
    for dim in range(0, params['dimension']):
        compare_cl.numeric(dim, dim, label=str(dim), method='gauss')

    candidate_links = pd.MultiIndex.from_tuples(entity_pairs)
    features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
    logger.info("Features %s", str(features.describe()))

    # Fit and evaluate on the same feature set.
    logrg = recordlinkage.LogisticRegressionClassifier()
    logrg.fit(features, true_pairs)
    predicted = logrg.predict(features)
    log_quality_results(logger, predicted, true_pairs, len(entity_pairs))

    # The value stored per pair is one minus the reported probability.
    result_prob = []
    for position, p in enumerate(logrg.prob(features).tolist()):
        pair = entity_pairs[position]
        result_prob.append((pair[0], pair[1], 1 - p))

    ir_metrics = InformationRetrievalMetrics(result_prob, true_pairs)
    ir_metrics.log_metrics(logger, params)
def evalution(X_data, links_true):
    """Fit a logistic regression classifier and print its quality metrics.

    The classifier is trained on ``X_data`` with ``links_true`` as the known
    links and then used to predict on the same ``X_data``. The confusion
    matrix, F-score, recall and precision are printed.
    """
    # Classify with a logistic regression classifier.
    classifier = recordlinkage.LogisticRegressionClassifier()
    classifier.fit(X_data, links_true)

    # Predict with the fitted model.
    links_pred = classifier.predict(X_data)
    print("links_pred:{}".format(links_pred.shape))

    # Print the confusion matrix.
    conf_matrix = recordlinkage.confusion_matrix(
        links_true, links_pred, total=len(X_data))
    print("Confusion matrix:\n", conf_matrix)

    # F-score is derived from the confusion matrix; recall and precision
    # from the true and predicted links.
    print('fscore', recordlinkage.fscore(conf_matrix))
    print('recall', recordlinkage.recall(links_true, links_pred))
    print('precision', recordlinkage.precision(links_true, links_pred))
def test_census_new(self):
    """Train RLTransE on the Census value graph and classify pairs with logistic regression.

    Steps, as visible in this function:

    1. Build a ``Graph_VEG`` for Census and train ``RLTransE`` with the
       default parameters.
    2. Build one feature row per candidate pair from the value and relation
       embeddings, for both the train and the test links.
    3. Fit ``LogisticRegressionClassifier`` on the train features and log
       quality results for train and test.
    4. Log information-retrieval metrics for the test links and close the
       TF session.
    """
    c = Census()
    graph = Graph_VEG(Census)
    logger = get_logger("RL.Test.LogisticRLTransE.Census")
    logger.info("values for name : %s",
                str(graph.relation_value_map[graph.relation[1]][:10]))
    logger.info("relation: %s", str(graph.relation))
    logger.info("train_triples: %s", str(graph.train_triples[:10]))
    logger.info("set train_triples size %d", len(set(graph.train_triples)))

    params = self.get_default_params()
    transe = RLTransE(graph,
                      dimension=params['dimension'],
                      learning_rate=params['learning_rate'],
                      margin=params['margin'],
                      regularizer_scale=params['regularizer_scale'],
                      batchSize=params['batchSize'],
                      neg_rate=params['neg_rate'],
                      neg_rel_rate=params['neg_rel_rate'])
    loss, val_loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f val_loss: %f", loss,
                val_loss)

    value_embeddings = transe.get_val_embeddings()
    relation_embeddings = transe.get_rel_embeddings()

    # Map of fields in the census DataFrame to VEG relations.
    field_relation_map = {
        c.field_map[CensusFields.FIRST_NAME]: "name",
        c.field_map[CensusFields.SURNAME_1]: "surname",
        c.field_map[CensusFields.SURNAME_2]: "surname2",
        c.field_map[CensusFields.YOB]: "yob",
        c.field_map[CensusFields.CIVIL_STATUS]: "civil",
        c.field_map[CensusFields.OCCUPATION]: "occupation",
        c.field_map[CensusFields.RELATION]: "relation"
    }

    missing_values = []
    train_features = []  # Size samples*(dimension*rel_count)
    test_features = []
    # The same feature construction runs for the train and the test links;
    # ``features`` is the list that receives the rows of the current split.
    for (candidate_links, dataA, dataB, features) in \
        [(c.candidate_links, c.trainDataA, c.trainDataB, train_features),
         (c.test_links, c.testDataA, c.testDataB, test_features)]:
        for (a, b) in candidate_links:
            row_a = dataA.loc[a]
            row_b = dataB.loc[b]
            distance = []
            for f in field_relation_map:
                val_a = row_a[f]
                val_b = row_b[f]
                # NOTE(review): nothing is appended when the two values are
                # equal, so rows can differ in length and a column does not
                # necessarily map to one field -- confirm this is intended.
                if val_a != val_b:
                    rel = field_relation_map[f]
                    # A value unknown to the graph contributes a block of
                    # 1.0 of size ``dimension`` and is recorded as missing.
                    try:
                        val_index_a = graph.relation_value_map[rel].index(
                            val_a)
                    except ValueError:
                        missing_values.append(val_a)
                        distance.extend([1.0] * params['dimension'])
                        continue
                    try:
                        val_index_b = graph.relation_value_map[rel].index(
                            val_b)
                    except ValueError:
                        missing_values.append(val_b)
                        distance.extend([1.0] * params['dimension'])
                        continue
                    rel_index = graph.relation.index(field_relation_map[f])
                    # value_a + relation - value_b, element by element.
                    distance.extend(value_embeddings[rel][val_index_a] + \
                        relation_embeddings[rel_index] - value_embeddings[rel][val_index_b])
            # The row is named after its (a, b) pair.
            features.append(pd.Series(distance).rename((a, b)))
            #logger.info("a: %d, b: %d distance: %f true_pairs: %s", a, b, distance, (a,b) in c.true_test_links)
    logger.info("No. of missing values: %d", len(missing_values))
    logger.info("Unique No. of missing values: %d",
                len(set(missing_values)))

    # Rows shorter than the widest one are padded with 1.
    train_features = pd.DataFrame(data=train_features).fillna(1)
    test_features = pd.DataFrame(data=test_features).fillna(1)
    logger.info("Shape of Train features: %s", str(train_features.shape))
    logger.info("Shape of Test features: %s", str(test_features.shape))

    #Train Logistic Regression Model
    logrg = recordlinkage.LogisticRegressionClassifier()
    logrg.fit(train_features, c.true_links)
    result = logrg.predict(train_features)
    # Rebuild the predicted index as a MultiIndex of tuples.
    result = pd.MultiIndex.from_tuples(result.to_series())
    log_quality_results(logger, result, c.true_links,
                        len(c.candidate_links), params)

    #Test Classifier
    result = logrg.predict(test_features)
    result = pd.MultiIndex.from_tuples(result.to_series())
    log_quality_results(logger, result, c.true_test_links,
                        len(c.test_links), params)

    # The string below is a no-op statement holding disabled export code.
    """
    Todo: Export Embeddings and probabilities.
    try:
        entities = ["value\trelation"]
        for r in graph.relation_value_map:
            for v in graph.relation_value_map[r]:
                entities.append("\t".join([v,r]))

        embeddings = []
        for rel in value_embeddings:
            val_count = len(graph.relation_value_map[rel])
            embeddings.extend(value_embeddings[rel][:val_count])

        #Write Embeddings to file
        export_embeddings('veg', str(c), 'LogisticRLTransE', entities, embeddings)
    except Exception as e:
        logger.error("Failed to export embeddings")
        logger.error(e)
    export_result_prob(Census, 'veg', str(c), 'RLTransE', graph.values, result_prob, c.true_test_links)
    """

    # The value stored per pair is one minus the reported probability.
    prob_series = logrg.prob(test_features)
    prob = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(c.test_links[i][0], c.test_links[i][1], prob[i])
                   for i in range(0, len(prob))]

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             c.true_test_links)
    # The returned value is not used further in this function.
    precison_at_1 = ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
def create_and_train_logistic_regression():
    """Create a logistic regression classifier and train it.

    Returns whatever ``train_supervised_classifier`` returns for the new
    classifier.
    """
    classifier = rl.LogisticRegressionClassifier()
    return train_supervised_classifier(classifier)
# The first 316 rows of ``data`` are used as the known matches, identified by
# their (sku_1, sku_2) pair.
matches = data[0:316]
matches = matches[['sku_1', 'sku_2']]
matches = pandas.MultiIndex.from_frame(matches)

# Comparison vectors, indexed by the (sku_1, sku_2) pair.
data = pandas.read_csv(
    '/home/jake/Documents/matching/functions/comparison_index.csv',
    index_col=['sku_1', 'sku_2'])

# Training set: 5000 rows taken from a shuffled copy of the comparison data.
golden_pairs = data.sample(frac=1)
golden_pairs = golden_pairs[0:5000]
# Explicit set intersection; the ``&`` operator on indexes is deprecated as a
# set operation in pandas.
golden_matches_index = golden_pairs.index.intersection(matches)
print(golden_matches_index)

# Full comparison data, re-read for prediction.
data_2 = pandas.read_csv(
    '/home/jake/Documents/matching/functions/comparison_index.csv',
    index_col=['sku_1', 'sku_2'])

logreg = recordlinkage.LogisticRegressionClassifier()
logreg.fit(golden_pairs, golden_matches_index)
print("Intercept: ", logreg.intercept)
print("Coefficients: ", logreg.coefficients)

result_logreg = logreg.predict(data_2)
print(len(result_logreg))
print(result_logreg)

print(recordlinkage.confusion_matrix(matches, result_logreg, len(data_2)))
print(recordlinkage.fscore(matches, result_logreg))

# Hand-picked coefficients; not used by the statements above.
coefficients = [2, -0.08400654, -0.41432631, -0.12138752, -0.31617086,
                -0.42389099, -0.33185166, 0.02173983, 0]
# Adjust the 'Phone' scores and attach the true-match values before splitting.
match_can_df = lnk.adjust_scores(match_can_df, 'Phone', 0.6)
lnk.get_true_match_vals(match_can_df, slp, tm)

# Split stratified on the ``Match`` column, with a fixed seed.
test_size = 0.3
random_state = 456
train, test = train_test_split(match_can_df,
                               stratify=match_can_df.Match,
                               test_size=test_size,
                               random_state=random_state)

# Keep the rows labelled as matches, then drop the label from the features.
train_matches_index = train[train.Match == 1]
test_matches_index = test[test.Match == 1]
train.drop(columns='Match', inplace=True)
test.drop(columns='Match', inplace=True)

# Train on the training split; the predictions returned by fit_predict are
# discarded.
lr_all = recordlinkage.LogisticRegressionClassifier()
lr_all.fit_predict(train, train_matches_index.index)

# NOTE(review): ``lr`` is not defined in the visible code while the classifier
# trained above is ``lr_all`` -- confirm which model is meant here.
test = pm.get_predictions(test, lr)
lnk.get_true_match_vals(test, slp, tm)
pm.get_cf_mat(test)

# Add the 'country' column from ``slp`` and draw the per-country ROC curves.
test = pm.add_col_from_df(test, slp, 'country')
pm.get_country_roc_curves(test)
comparer.add(String('address_2', 'address_2', threshold=0.85, label='address_2'))
# Only ``dfA`` is passed, so pairs are compared within that single frame.
features = comparer.compute(candidate_links, dfA)
print('feature shape', features.shape)

# use the Logistic Regression Classifier
# this classifier is equivalent to the deterministic record linkage approach
# The parameters are fixed by hand; the classifier is never fitted here.
intercept = -9.5
coefficients = [2.0, 3.0, 7.0, 6.0, 2.5, 5.0, 5.5]
print('Deterministic classifier')
print('intercept', intercept)
print('coefficients', coefficients)

logreg = rl.LogisticRegressionClassifier(
    coefficients=coefficients, intercept=intercept)
links = logreg.predict(features)
print(len(links), 'links/matches')

# return the confusion matrix
conf_logreg = rl.confusion_matrix(true_links, links, len(candidate_links))
print('confusion matrix')
print(conf_logreg)

# compute the F-score for this classification
fscore = rl.fscore(conf_logreg)
print('fscore', fscore)
recall = rl.recall(true_links, links)
print('recall', recall)
precision = rl.precision(true_links, links)
def test_probs(self):
    """An unknown ``return_type`` passed to ``prob`` must raise ValueError."""
    classifier = recordlinkage.LogisticRegressionClassifier()

    with pytest.raises(ValueError):
        classifier.prob(self.y, return_type='unknown_return_type')
def get_matches(locu_train_path, foursquare_train_path, matches_train_path,
                locu_test_path, foursquare_test_path):
    """Match Locu and Foursquare records and write the test matches to CSV.

    Logistic regression and SVM classifiers are trained on the training
    data. The SVM predictions for the test data are written to
    ``matches_test.csv`` as an index-only table of
    (locu_id, foursquare_id) pairs.

    Parameters
    ----------
    locu_train_path, foursquare_train_path : JSON files with training records.
    matches_train_path : CSV file with the known training matches.
    locu_test_path, foursquare_test_path : JSON files with test records.

    Returns
    -------
    None
    """
    # Load the raw data.
    four_train = pd.read_json(foursquare_train_path)
    locu_train = pd.read_json(locu_train_path)
    four_test = pd.read_json(foursquare_test_path)
    locu_test = pd.read_json(locu_test_path)
    matches_train = pd.read_csv(matches_train_path)

    # Preprocess both sides and the known matches.
    locu_train, four_train = preprocess(locu_train, four_train)
    locu_test, four_test = preprocess(locu_test, four_test)
    matches_train = preprocess_matches(matches_train)

    # Candidate pairs and their string-comparison features.
    train_candidates = index_pairs(locu_train, four_train)
    test_candidates = index_pairs(locu_test, four_test)
    train_features = compare_strings(locu_train, four_train, train_candidates)
    test_features = compare_strings(locu_test, four_test, test_candidates)

    train_pairs, train_matches_index, all_matches_index = traintestsplit(
        train_features, matches_train)

    # Train both classifiers on the same training pairs.
    logreg = recordlinkage.LogisticRegressionClassifier()
    logreg.learn(train_pairs, train_matches_index)
    svm = recordlinkage.SVMClassifier()
    svm.learn(train_pairs, train_matches_index)

    # Predict on the training data with both classifiers.
    svm_results_index = predict(train_features, svm)
    logreg_results_index = predict(train_features, logreg)

    # Training confusion matrices; computed but not reported here.
    n_train_pairs = len(train_features)
    svm_confn_matrix = recordlinkage.confusion_matrix(
        all_matches_index, svm_results_index, n_train_pairs)
    logreg_confn_matrix = recordlinkage.confusion_matrix(
        all_matches_index, logreg_results_index, len(train_features))

    # Predict on the test data with the SVM.
    test_results_index = predict(test_features, svm)

    # Keep only the index of the matched pairs and write it to CSV.
    test_features.index = test_features.index.rename(
        ['locu_id', 'foursquare_id'])
    test_match_pairs = test_features.loc[test_results_index]
    matches_test = test_match_pairs.drop(test_match_pairs.columns[::],
                                         axis=1)
    matches_test.to_csv('matches_test.csv')

    # Records on each side that take part in a predicted match.
    matched_ids = list(matches_test.index)
    test_locu_index = [pair[0] for pair in matched_ids]
    test_four_index = [pair[1] for pair in matched_ids]
    test_locu_matches = locu_test.loc[test_locu_index]
    test_four_matches = four_test.loc[test_four_index]

    # Full side-by-side view of the matched records, with columns
    # interleaved so that corresponding fields sit next to each other.
    with_foursquare = matches_test.reset_index().join(
        test_four_matches, on=['foursquare_id'])
    with_both = with_foursquare.join(test_locu_matches,
                                     on=['locu_id'],
                                     lsuffix='_foursquare',
                                     rsuffix='_locu')
    test_match_pairs = with_both.set_index(matches_test.index.names)

    column_names = np.array(test_match_pairs.columns.tolist())
    order = [0, 7, 1, 8, 2, 9, 3, 10, 4, 11, 5, 12, 6, 13]
    column_names = list(column_names[order])
    test_matches_reordered = test_match_pairs[column_names]

    return
def _test_logistic_transh_erer(self, dataset, params):
    """Train TransH per data set and link entities with logistic regression.

    Embeddings are learned separately for data set A and data set B. A
    logistic regression classifier is trained on the prior pairs plus one
    sampled negative pair per prior pair. It is then evaluated on all
    entity pairs, and embeddings and result probabilities are exported.

    Parameters
    ----------
    dataset
        Callable without arguments returning the model that provides
        ``get_erer_model()``.
    params : dict
        TransH hyper-parameters (``dimension``, ``learning_rate``,
        ``margin``, ``regularizer_scale``, ``batchSize``) and ``epochs``.
    """
    model = dataset()
    logger = get_logger('RL.Test.erer.LogisticTransH.ERER.' + str(model))
    (entA, entB, relA, relB, triA, triB,
     entity_pairs, prior_pairs, true_pairs) = model.get_erer_model()
    self.assertTrue(all([(tp in entity_pairs) for tp in true_pairs]))

    # Generate embeddings for datasetA
    transh = TransH(entA,
                    relA,
                    triA,
                    prior_pairs,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddingsA = transh.get_ent_embeddings()
    transh.close_tf_session()
    del transh

    # Generate embeddings for datasetB
    # NOTE(review): this model receives ``entity_pairs`` where the first one
    # received ``prior_pairs`` -- confirm the difference is intended.
    transh = TransH(entB,
                    relB,
                    triB,
                    entity_pairs,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddingsB = transh.get_ent_embeddings()
    transh.close_tf_session()

    # One array per entity.
    ent_embeddingsA = [
        np.array(ent_embeddingsA[i])
        for i in range(ent_embeddingsA.shape[0])
    ]
    ent_embeddingsB = [
        np.array(ent_embeddingsB[i])
        for i in range(ent_embeddingsB.shape[0])
    ]
    trainDataA = pd.DataFrame(data=ent_embeddingsA)
    trainDataB = pd.DataFrame(data=ent_embeddingsB)

    # Define comparison class
    compare_cl = recordlinkage.Compare()
    for i in range(0, params['dimension']):
        compare_cl.numeric(i, i, label=str(i))

    # Sample negative pairs: every prior pair (e1, e2) is followed by one
    # pair (e1, neg_e2) that is not a prior pair.
    train_pairs = []
    tuple_pp = set(map(tuple, prior_pairs))
    logger.info("Number of prior_pairs: %d", len(prior_pairs))
    for e1, e2 in prior_pairs:
        train_pairs.append((e1, e2))
        while True:
            # ``range`` instead of ``xrange``: the latter does not exist on
            # Python 3, and ``range`` works on Python 2 as well.
            neg_e2 = random.choice(range(0, len(entB)))
            if neg_e2 == e2 or (e1, neg_e2) in tuple_pp:
                continue
            else:
                train_pairs.append((e1, neg_e2))
                break
    logger.info("Number of Train Pairs: %d", len(train_pairs))

    candidate_links = pd.MultiIndex.from_tuples(train_pairs)
    features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
    logger.info("Train Features %s", str(features.describe()))

    # Train Logistic Regression Model; the prior pairs are the positives.
    logrg = recordlinkage.LogisticRegressionClassifier()
    candidate_links = pd.MultiIndex.from_tuples(prior_pairs)
    logrg.fit(features, candidate_links)

    # Test Classifier on all entity pairs.
    compare_cl = recordlinkage.Compare()
    for i in range(0, params['dimension']):
        compare_cl.numeric(i, i, label=str(i))
    candidate_links = pd.MultiIndex.from_tuples(entity_pairs)
    features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
    logger.info("Test Features %s", str(features.describe()))
    result = logrg.predict(features)
    log_quality_results(logger, result, true_pairs, len(entity_pairs))

    # The value stored per pair is one minus the reported probability.
    prob_series = logrg.prob(features)
    prob = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(entity_pairs[i][0], entity_pairs[i][1], prob[i])
                   for i in range(0, len(prob))]
    ir_metrics = InformationRetrievalMetrics(result_prob, true_pairs)
    # NOTE(review): ``params`` is passed twice -- confirm against the
    # signature of ``log_metrics``.
    ir_metrics.log_metrics(logger, params, params)

    # Export results
    export_embeddings('erer', str(model), 'LogTransH', entA,
                      ent_embeddingsA)
    export_embeddings('erer', str(model), 'LogTransH', entB,
                      ent_embeddingsB)
    export_result_prob(dataset, 'erer', str(model), 'LogTransH', entA,
                       result_prob, true_pairs, entB)
def run_experiment(win_len, preproc, comparison_variant, run_only=None):
    """Run one indexing / preprocessing / comparison configuration.

    Parameters
    ----------
    win_len : int
        0 selects blocking on ``year``; a positive value selects a sorted
        neighbourhood index on ``year`` with that window.
    preproc : int
        Preprocessing variant 0-5; selects the suffix of the compared
        columns.
    comparison_variant : int
        0 for exact comparison, 1-7 for the string similarity methods.
    run_only : str, optional
        When given, only the classifier with this description is run
        ('logreg', 'bayes', 'svm', 'kmeans' or 'ecm').

    Returns
    -------
    list of tuple
        One entry per classifier run: ``(index_description,
        preproc_description, comp_description, classifier_description,
        match, compare_ms, train_ms, classify_ms)``.

    Raises
    ------
    ValueError
        For a negative window length or an unknown preprocessing or
        comparison variant.
    """
    # window length
    if win_len == 0:
        index_description = "block"
        indexer = recordlinkage.BlockIndex('year')
    elif win_len > 0:
        index_description = f"nb{win_len}"
        indexer = recordlinkage.SortedNeighbourhoodIndex('year',
                                                         window=win_len)
    else:
        raise ValueError(f"Invalid window length {win_len}")

    pairs_train = indexer.index(dataDBLP_train, dataScholar_train)
    pairs_test = indexer.index(dataDBLP_test, dataScholar_test)
    if debug:
        print(f"Number of candidates (index={index_description}):")
        print(f"{len(pairs_train)} (train), {len(pairs_test)} (test)")

    # preprocessing: variant -> (message, column suffix, description)
    preprocessing_variants = {
        0: ("No preprocesing", "", "none"),
        1: ("Cleaned fields", "_clean", "clean"),
        2: ("Soundex encoding", "_soundex", "soundex"),
        3: ("Nysiis encoding", "_nysiis", "nysiis"),
        4: ("Metaphone encoding", "_metaphone", "metaphone"),
        5: ("Match-rating encoding", "_match_rating", "match_rating"),
    }
    try:
        message, field_suffix, preproc_description = \
            preprocessing_variants[preproc]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown preprocessing variant {preproc}") from None
    print(message)
    print(f"Preprocessing used: {preproc_description}")

    # comparator: variant -> description; for the string variants the
    # description is also the ``method`` name.
    comparison_variants = {
        0: "exact",
        1: "levenshtein",
        2: "damerau_levenshtein",
        3: "jaro",
        4: "jarowinkler",
        5: "qgram",
        6: "cosine",
        7: "smith_waterman",
    }
    comp = recordlinkage.Compare()
    try:
        comp_description = comparison_variants[comparison_variant]
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown comparison variant {comparison_variant}") from None
    for field in ('title', 'authors', 'venue'):
        column = field + field_suffix
        if comp_description == "exact":
            comp.add(compare.Exact(column, column))
        else:
            comp.add(
                compare.String(column, column, method=comp_description))
    print(f"String comparison: {comp_description}")

    print("Start compare for training data set")
    start = time.time()
    result_train = comp.compute(pairs_train, dataDBLP_train,
                                dataScholar_train)
    print("Compare on training data took %.2fs" % (time.time() - start))

    print("Start compare for test data set")
    start = time.time()
    result_test = comp.compute(pairs_test, dataDBLP_test, dataScholar_test)
    # save time compare for evaluation
    time_compare = time.time() - start
    print("Compare on test data took %.2fs" % (time_compare))

    # (description, message, factory, supervised); factories are deferred so
    # that only the requested classifiers are constructed.
    classifier_specs = (
        ('logreg', "Logistic Regression classifier",
         lambda: recordlinkage.LogisticRegressionClassifier(), True),
        ('bayes', "Naive Bayes classifier",
         lambda: recordlinkage.NaiveBayesClassifier(binarize=0.75), True),
        ('svm', "Support Vector Machine classifier",
         lambda: recordlinkage.SVMClassifier(), True),
        ('kmeans', "KMeans classifier",
         lambda: recordlinkage.KMeansClassifier(), False),
        ('ecm', "ECM classifier",
         lambda: recordlinkage.ECMClassifier(binarize=0.75), False),
    )

    matches = []
    for (classifier_description, classifier_message, make_classifier,
         supervised) in classifier_specs:
        # skip others if only one classifier is requested
        if run_only is not None and run_only != classifier_description:
            continue

        print(classifier_message)
        classifier = make_classifier()

        if supervised:
            start = time.time()
            classifier.fit(result_train, links_train)
            time_train = time.time() - start
            start = time.time()
            match = classifier.predict(result_test)
            time_classify = time.time() - start
        else:
            # Unsupervised classifiers are fitted directly on the test
            # comparison vectors, so there is no separate training time.
            start = time.time()
            match = classifier.fit_predict(result_test)
            time_classify = time.time() - start
            time_train = 0

        # Timings are reported in milliseconds.
        matches.append(
            (index_description, preproc_description, comp_description,
             classifier_description, match, 1000 * time_compare,
             1000 * time_train, 1000 * time_classify))

        # NOTE(review): the evaluation printout is placed under ``debug``
        # here -- confirm this matches the original nesting.
        if debug:
            print("%d matches" % len(match))
            print_experiment_evaluation(
                match, "-".join((index_description, preproc_description,
                                 comp_description)))
    return matches
def test_logistic(self):
    """Train logistic regression on Census and export its results.

    The classifier is fitted on the training split and evaluated on the
    training, validation and test splits. For the test split the
    information-retrieval metrics are logged, and result probabilities,
    false negatives, false positives and a human-readable feature mapping
    are exported.
    """
    logger = get_logger('RL.Test.LogisticRegression.Census')
    census = Census()

    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.candidate_links,
                                  census.trainDataA, census.trainDataB)
    logger.info("Train Features %s", str(features.describe()))

    # Train the logistic regression classifier
    logrg = recordlinkage.LogisticRegressionClassifier()
    logrg.fit(features, census.true_links)
    result = logrg.predict(features)
    log_quality_results(logger, result, census.true_links,
                        len(census.candidate_links))

    #Validate the classifier
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.val_links, census.valDataA,
                                  census.valDataB)
    logger.info("Validation Features %s", str(features.describe()))
    result = logrg.predict(features)
    log_quality_results(logger, result, census.true_val_links,
                        len(census.val_links))

    #Test the classifier
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.test_links, census.testDataA,
                                  census.testDataB)
    logger.info("Test Features %s", str(features.describe()))
    result = logrg.predict(features)
    log_quality_results(logger, result, census.true_test_links,
                        len(census.test_links))
    logger.info("logrg coefficients: %s", str(logrg.coefficients))

    #Log IR Stats: MRR, MAP, MP@K
    # The value stored per pair is one minus the reported probability.
    prob_series = logrg.prob(features)
    prob = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(census.test_links[i][0], census.test_links[i][1],
                    prob[i]) for i in range(0, len(prob))]
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             census.true_test_links)
    ir_metrics.log_metrics(logger)

    #Export false positives and result probabilities
    # Built while ``result_prob`` and ``result`` still hold the original
    # test-link identifiers; both names are rebound further down.
    result_feature_mapping = [
        (e1, e2, [str(v) for v in features.loc[(e1, e2)].values], d)
        for (e1, e2, d) in result_prob if (e1, e2) in result
    ]

    # Entity name is "<ID_INDIVIDUAL>_<DNI>"; the first helper addresses the
    # row by position (iloc), the second by label (loc).
    get_entity_name = lambda c, d, i: "_".join([
        str(d.iloc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.iloc[i][c.field_map[CensusFields.DNI]])
    ])
    get_entity_name_loc = lambda c, d, i: "_".join([
        str(d.loc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.loc[i][c.field_map[CensusFields.DNI]])
    ])
    entitiesA = [
        get_entity_name(census, census.testDataA, i)
        for i in range(int(census.testDataA.shape[0]))
    ]
    entitiesB = [
        get_entity_name(census, census.testDataB, i)
        for i in range(int(census.testDataB.shape[0]))
    ]

    # From here on pairs are expressed as positions in entitiesA/entitiesB
    # instead of frame labels.
    result_prob = [(entitiesA.index(
        get_entity_name_loc(census, census.testDataA, int(a))),
                    entitiesB.index(
                        get_entity_name_loc(census, census.testDataB,
                                            int(b))), p)
                   for (a, b, p) in result_prob]
    true_links = [(entitiesA.index(
        get_entity_name_loc(census, census.testDataA, int(a))),
                   entitiesB.index(
                       get_entity_name_loc(census, census.testDataB,
                                           int(b))))
                  for (a, b) in census.true_test_links]
    export_result_prob(Census, 'LogisticRegression', 'census', 'logistic',
                       entitiesA, result_prob, true_links, entitiesB)

    result = [(entitiesA.index(
        get_entity_name_loc(census, census.testDataA, int(a))),
               entitiesB.index(
                   get_entity_name_loc(census, census.testDataB, int(b))))
              for (a, b) in result]
    export_false_negatives(Census, 'LogisticRegression', 'census',
                           'logistic', entitiesA, result_prob, true_links,
                           result, entitiesB)
    export_false_positives(Census, 'LogisticRegression', 'census',
                           'logistic', entitiesA, result_prob, true_links,
                           result, entitiesB)

    weights = logrg.coefficients
    # NOTE(review): this rebinding of ``result`` is not read again in this
    # function -- confirm whether it was meant to feed the export below.
    result = [
        (e1, e2,
         [str("%.2f" % (float(d * w) / sum(weights))) for w in weights], d)
        for (e1, e2, d) in result_prob if (e1, e2) in result
    ]

    # Translate the pair identifiers of the feature mapping to entity
    # positions, keeping the feature strings and the stored value.
    result_feature_mapping = [
        (entitiesA.index(
            get_entity_name_loc(census, census.testDataA, int(a))),
         entitiesB.index(
             get_entity_name_loc(census, census.testDataB, int(b))), w, p)
        for (a, b, w, p) in result_feature_mapping
    ]
    export_human_readable_results(Census, 'LogisticRegression', 'census',
                                  'logistic', entitiesA,
                                  result_feature_mapping, entitiesB)