def _test_transh(self, dataset, params):
    """Train a TransH embedding model on an ER graph and evaluate alignment.

    Args:
        dataset: dataset class understood by Graph_ER (instantiable for naming).
        params: hyperparameter dict ('dimension', 'learning_rate', 'margin',
            'regularizer_scale', 'batchSize', 'neg_rate', 'neg_rel_rate',
            'epochs').

    Returns:
        (max_fscore, p_at_1): best F-score over the threshold sweep and
        precision@1 from the IR metrics.
    """
    graph = Graph_ER(dataset)
    model = dataset()
    logger = get_logger('RL.Test.er.TransH.' + str(model))

    transh = TransH(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = transh.get_ent_embeddings()
    # Score each candidate pair by cosine distance between its embeddings.
    result_prob = []
    for i in range(0, len(graph.entity_pairs)):
        distance = abs(
            spatial.distance.cosine(ent_embeddings[graph.entity_pairs[i][0]],
                                    ent_embeddings[graph.entity_pairs[i][1]]))
        result_prob.append(
            (graph.entity_pairs[i][0], graph.entity_pairs[i][1], distance))

    #Write Embeddings to file
    export_embeddings('er', str(model), 'TransH', graph.entity,
                      ent_embeddings)
    export_result_prob(dataset, 'er', str(model), 'TransH', graph.entity,
                       result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)

    try:
        logger.info("MAX FSCORE: %f AT : %f", max_fscore, optimal_threshold)
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        params['threshold'] = optimal_threshold
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
        export_false_negatives(dataset, 'er', str(model), 'TransH',
                               graph.entity, result_prob, graph.true_pairs,
                               result, graph.entity)
        export_false_positives(dataset, 'er', str(model), 'TransH',
                               graph.entity, result_prob, graph.true_pairs,
                               result, graph.entity)
    except Exception as e:
        # FIX: narrowed the original bare `except:` (which also swallowed
        # SystemExit/KeyboardInterrupt) and log the actual failure.
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    p_at_1 = ir_metrics.log_metrics(logger, params)

    transh.close_tf_session()
    return (max_fscore, p_at_1)
def _test_rl_transe(self, dataset, params):
    """Train TransE on an ER graph and score candidate pairs by entity
    distance plus distances between tails of relations shared by both.

    Args:
        dataset: dataset class understood by Graph_ER.
        params: hyperparameter dict (see _test_transh).

    Returns:
        (max_fscore, precison_at_1)
    """
    #Load Graph Data
    graph = Graph_ER(dataset)
    model = dataset()
    logger = get_logger('RL.Test.er.RLTransE.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = transe.get_ent_embeddings()
    result_prob = []
    for (a, b) in graph.entity_pairs:
        # Base distance between the two entities ...
        a_triples = [(h, t, r) for (h, t, r) in graph.triples if h == a]
        b_triples = [(h, t, r) for (h, t, r) in graph.triples if h == b]
        distance = abs(
            spatial.distance.cosine(ent_embeddings[a], ent_embeddings[b]))
        # ... plus the distance between tail values of any relation that
        # appears on both entities (first matching tail on b's side).
        for (ah, at, ar) in a_triples:
            bt = [t for (h, t, r) in b_triples if r == ar]
            if len(bt):
                distance = distance + abs(
                    spatial.distance.cosine(ent_embeddings[at],
                                            ent_embeddings[bt[0]]))
        result_prob.append((a, b, distance))

    #Write Embeddings to file
    export_embeddings('er', str(model), 'RLTransE', graph.entity,
                      ent_embeddings)
    export_result_prob(dataset, 'er', str(model), 'RLTransE', graph.entity,
                       result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs, max_threshold=3.0)

    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        # FIX: narrowed the original bare `except:` and log the failure.
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    precison_at_1 = ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return (max_fscore, precison_at_1)
def _test_etranse(self, dataset, params):
    """Train ETransE on an ERER graph and evaluate cross-graph alignment.

    Returns:
        (max_fscore, prec_at_1)
    """
    model = dataset()
    graph = Graph_ERER(dataset)
    logger = get_logger("RL.Test.erer.ETransE." + str(model))

    etranse = ETransE(graph,
                      dimension=params['dimension'],
                      batchSize=params['batchSize'],
                      learning_rate=params['learning_rate'],
                      margin=params['margin'],
                      neg_rate=params['neg_rate'],
                      neg_rel_rate=params['neg_rel_rate'],
                      regularizer_scale=params['regularizer_scale'],
                      alpha=params['alpha'],
                      beta=params['beta'])
    etranse.train(max_epochs=params['max_epochs'])
    ent_embeddings_a = etranse.get_ent_embeddings_A()
    ent_embeddings_b = etranse.get_ent_embeddings_B()

    # Cosine distance between graph-A and graph-B embeddings per candidate.
    result_prob = []
    for ent_a, ent_b in graph.entity_pairs:
        gap = abs(
            spatial.distance.cosine(ent_embeddings_a[int(ent_a)],
                                    ent_embeddings_b[int(ent_b)]))
        result_prob.append((ent_a, ent_b, gap))

    #Write Embeddings to file
    export_embeddings('erer', str(model), 'ETransE', graph.entityA,
                      ent_embeddings_a)
    export_embeddings('erer', str(model), 'ETransE', graph.entityB,
                      ent_embeddings_b)
    export_result_prob(dataset, 'erer', str(model), 'ETransE', graph.entityA,
                       result_prob, graph.true_pairs, graph.entityB)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)
    try:
        params['threshold'] = optimal_threshold
        accepted = [(e1, e2) for (e1, e2, d) in result_prob
                    if d <= optimal_threshold]
        result = pd.MultiIndex.from_tuples(accepted)
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Reults")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    prec_at_1 = ir_metrics.log_metrics(logger, params)

    etranse.close_tf_session()
    return (max_fscore, prec_at_1)
def _test_erer(self, dataset, er_algo, params):
    """Train the given embedding algorithm on the ER view of an ERER graph
    and evaluate entity alignment.

    Args:
        dataset: dataset class understood by Graph_ERER.
        er_algo: embedding model class with a TransE-style constructor.
        params: hyperparameter dict.

    Returns:
        Best F-score over the distance-threshold sweep.
    """
    model = dataset()
    graph = Graph_ERER(dataset)
    graph_er = graph.get_er_model()

    er_model = er_algo(graph_er,
                       dimension=params['dimension'],
                       learning_rate=params['learning_rate'],
                       margin=params['margin'],
                       regularizer_scale=params['regularizer_scale'],
                       batchSize=params['batchSize'],
                       neg_rate=params['neg_rate'],
                       neg_rel_rate=params['neg_rel_rate'])
    loss = er_model.train(max_epochs=params['epochs'])
    logger = get_logger('RL.Test.erer.ERER.' + str(model) + "." +
                        str(er_model))
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = er_model.get_ent_embeddings()
    # Score each candidate pair by cosine distance between its embeddings.
    result_prob = []
    for i in range(0, len(graph_er.entity_pairs)):
        distance = abs(
            spatial.distance.cosine(
                ent_embeddings[graph_er.entity_pairs[i][0]],
                ent_embeddings[graph_er.entity_pairs[i][1]]))
        result_prob.append((graph_er.entity_pairs[i][0],
                            graph_er.entity_pairs[i][1], distance))

    #Write Embeddings to file
    export_embeddings("erer", str(model), str(er_model), graph_er.entity,
                      ent_embeddings)
    export_result_prob(dataset, 'erer', str(model), str(er_model),
                       graph_er.entity, result_prob, graph_er.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph_er.true_pairs)

    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph_er.true_pairs,
                            len(graph_er.entity_pairs), params)
    except Exception as e:
        # FIX: narrowed the original bare `except:` and log the failure.
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             graph_er.true_pairs)
    ir_metrics.log_metrics(logger)

    er_model.close_tf_session()
    return max_fscore
def _test_seea(self, dataset, params):
    """Run SEEA iterations on an EAR graph and evaluate the aligned pairs.

    Args:
        dataset: dataset class understood by Graph_EAR.
        params: hyperparameter dict (includes 'beta', 'max_iter',
            'max_epochs').

    Returns:
        (fscore, prec_at_1); fscore is None when no pairs were aligned.
    """
    model = dataset()
    graph = Graph_EAR(dataset)
    logger = get_logger('RL.Test.ear.SEEA.' + str(model))

    seea = SEEA(graph,
                dimension=params['dimension'],
                learning_rate=params['learning_rate'],
                batchSize=params['batchSize'],
                margin=params['margin'],
                regularizer_scale=params['regularizer_scale'],
                neg_rate=params['neg_rate'],
                neg_rel_rate=params['neg_rel_rate'])

    #Begin SEEA iterations, passing true pairs only to debug the alignments.
    results = seea.seea_iterate(beta=params['beta'],
                                max_iter=params['max_iter'],
                                max_epochs=params['max_epochs'])
    # FIX: fscore/result_pairs were only assigned inside the try; on failure
    # the later export calls and the final return raised NameError.
    fscore = None
    result_pairs = None
    try:
        result_pairs = pd.MultiIndex.from_tuples(results)
        fscore = log_quality_results(logger, result_pairs, graph.true_pairs,
                                     len(graph.entity_pairs), params)
    except Exception as e:
        logger.error(e)
        logger.info("No Aligned pairs found.")

    ent_embeddings = seea.get_ent_embeddings()
    export_embeddings('ear', str(model), 'SEEA', graph.entity, ent_embeddings)
    # Score every candidate pair by cosine distance for the exports/metrics.
    result_prob = []
    for (e1, e2) in graph.entity_pairs:
        distance = abs(
            spatial.distance.cosine(ent_embeddings[e1], ent_embeddings[e2]))
        result_prob.append((e1, e2, distance))
    export_result_prob(dataset, 'ear', str(model), 'SEEA', graph.entity,
                       result_prob, graph.true_pairs)
    try:
        export_false_negatives(dataset, 'ear', str(model), 'SEEA',
                               graph.entity, result_prob, graph.true_pairs,
                               result_pairs, graph.entity)
        export_false_positives(dataset, 'ear', str(model), 'SEEA',
                               graph.entity, result_prob, graph.true_pairs,
                               result_pairs, graph.entity)
    except Exception as e:
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    prec_at_1 = ir_metrics.log_metrics(logger, params)

    seea.close_tf_session()
    return (fscore, prec_at_1)
def test_cora(self):
    """End-to-end ECM-classifier evaluation on the CORA dataset: train,
    validate, test, then log IR metrics from the match probabilities."""
    logger = get_logger('RL.Test.ECMClassifier.CORA')
    #Read Train data in dataset A & B
    cora = Cora()

    # Training split: compute comparison features and fit (unsupervised EM).
    comparator = cora.get_comparision_object()
    features = comparator.compute(cora.candidate_links, cora.trainDataA,
                                  cora.trainDataB)
    logger.info("Train Features %s", str(features.describe()))
    classifier = recordlinkage.ECMClassifier()
    classifier.fit(features)
    result = classifier.predict(features)
    log_quality_results(logger, result, cora.true_links,
                        len(cora.candidate_links))

    # Validation split.
    comparator = cora.get_comparision_object()
    features = comparator.compute(cora.val_links, cora.valDataA,
                                  cora.valDataB)
    logger.info("Validation Features %s", str(features.describe()))
    result = classifier.predict(features)
    log_quality_results(logger, result, cora.true_val_links,
                        len(cora.val_links))

    # Test split.
    comparator = cora.get_comparision_object()
    features = comparator.compute(cora.test_links, cora.testDataA,
                                  cora.testDataB)
    logger.info("Test Features %s", str(features.describe()))
    result = classifier.predict(features)
    log_quality_results(logger, result, cora.true_test_links,
                        len(cora.test_links))

    # IR stats (MRR, MAP, MP@K): distance = 1 - match probability.
    prob_series = classifier.prob(features)
    distances = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(cora.test_links[i][0], cora.test_links[i][1],
                    distances[i]) for i in range(0, len(distances))]
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             cora.true_test_links)
    ir_metrics.log_metrics(logger)
def _test_logistic_transh(self, dataset, params):
    """Note: Zero aligned pairs are returned, require fixation."""
    # Train TransH embeddings, then use per-dimension embedding values as
    # numeric features for a logistic-regression record-linkage classifier.
    model = dataset()
    logger = get_logger('RL.Test.LogisticTransH.' + str(model))
    entity, relation, triples, entity_pairs, true_pairs = model.get_er_model()
    # NOTE(review): this TransH constructor takes (entity, relation, triples,
    # entity_pairs, ...) while other tests pass a Graph_ER object — confirm
    # both signatures are still supported.
    transh = TransH(entity,
                    relation,
                    triples,
                    entity_pairs,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = transh.get_ent_embeddings()
    # Re-wrap each embedding row as its own np.array so the DataFrame below
    # gets one row per entity.
    ent_embeddings = [
        np.array(ent_embeddings[i]) for i in range(ent_embeddings.shape[0])
    ]
    # Both "datasets" are the same embedding table: candidate pairs are
    # compared dimension-by-dimension with a gaussian similarity.
    trainDataA = pd.DataFrame(data=ent_embeddings)
    trainDataB = pd.DataFrame(data=ent_embeddings)
    compare_cl = recordlinkage.Compare()
    for i in range(0, params['dimension']):
        compare_cl.numeric(i, i, label=str(i), method='gauss')
    candidate_links = pd.MultiIndex.from_tuples(entity_pairs)
    features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
    logger.info("Features %s", str(features.describe()))

    logrg = recordlinkage.LogisticRegressionClassifier()
    logrg.fit(features, true_pairs)
    result = logrg.predict(features)
    log_quality_results(logger, result, true_pairs, len(entity_pairs))

    # IR metrics: distance = 1 - match probability.
    prob_series = logrg.prob(features)
    prob = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(entity_pairs[i][0], entity_pairs[i][1], prob[i])
                   for i in range(0, len(prob))]
    ir_metrics = InformationRetrievalMetrics(result_prob, true_pairs)
    ir_metrics.log_metrics(logger, params)
def test_febrl(self):
    """Logistic-regression record linkage on FEBRL: train, validate, test,
    then log IR metrics from the match probabilities."""
    logger = get_logger('RL.Test.LogisticRegression.FEBRL')
    febrl = FEBRL()

    # Training split: compute features and fit on the labelled true links.
    comparator = febrl.get_comparision_object()
    features = comparator.compute(febrl.candidate_links, febrl.trainDataA,
                                  febrl.trainDataB)
    logger.info("Train Features %s", str(features.describe()))
    classifier = recordlinkage.LogisticRegressionClassifier()
    classifier.fit(features, febrl.true_links)
    result = classifier.predict(features)
    log_quality_results(logger, result, febrl.true_links,
                        len(febrl.candidate_links))

    # Validation split.
    comparator = febrl.get_comparision_object()
    features = comparator.compute(febrl.val_links, febrl.valDataA,
                                  febrl.valDataB)
    logger.info("Validation Features %s", str(features.describe()))
    result = classifier.predict(features)
    log_quality_results(logger, result, febrl.true_val_links,
                        len(febrl.val_links))

    # Test split.
    comparator = febrl.get_comparision_object()
    features = comparator.compute(febrl.test_links, febrl.testDataA,
                                  febrl.testDataB)
    logger.info("Test Features %s", str(features.describe()))
    result = classifier.predict(features)
    log_quality_results(logger, result, febrl.true_test_links,
                        len(febrl.test_links))

    # IR stats (MRR, MAP, MP@K): distance = 1 - match probability.
    prob_series = classifier.prob(features)
    distances = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(febrl.test_links[i][0], febrl.test_links[i][1],
                    distances[i]) for i in range(0, len(distances))]
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             febrl.true_test_links)
    ir_metrics.log_metrics(logger)
def test_mean_precision_at_k(self):
    """MP@K sanity checks on two small hand-built rankings."""
    # Case 1: each query entity's true pair has the lowest distance.
    scores = [(0, 1, 0.1), (0, 2, 0.3), (1, 2, 0.5), (1, 4, 0.2),
              (2, 4, 0.9), (2, 3, 1)]
    metrics = InformationRetrievalMetrics(scores, [(0, 1), (2, 4)])
    self.assertEqual(metrics.get_mean_precisison_at_k(k=1), 1)
    self.assertEqual(metrics.get_mean_precisison_at_k(k=2), 0.5)

    # Case 2: the only true pair is ranked third for its entity.
    scores = [(0, 1, 0.9), (1, 2, 0.4), (2, 3, 0.5), (0, 2, 0.2),
              (0, 3, 0.5)]
    metrics = InformationRetrievalMetrics(scores, [(0, 1)])
    self.assertEqual(metrics.get_mean_precisison_at_k(k=1), 0)
    self.assertEqual(metrics.get_mean_precisison_at_k(k=2), 0)
    self.assertEqual(round(metrics.get_mean_precisison_at_k(k=3), 2), 0.33)
def test_febrl(self, params=None):
    """Postcode-neighbourhood TransE matching on FEBRL.

    Each candidate pair is scored by optimally aligning the two persons'
    postcode neighbourhoods (Hungarian assignment over pairwise embedding
    distances) plus the cost of the pair's own cell.

    Returns:
        Best F-score over the distance-threshold sweep.
    """
    if not params:
        params = self.get_default_params()
    #Load Graph Data
    graph = Graph_ER(FEBRL)
    model = FEBRL()
    logger = get_logger('RL.Test.TransE.Household.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddings = transe.get_ent_embeddings()

    #Experimenting household matching
    postcode_rel_id = graph.relation.index("postcode")
    result_prob = []
    # FIX: the original outer loop index `i` was clobbered by the inner
    # cost-matrix loops, so every result was appended for the wrong entity
    # pair. Iterating the pairs directly (and appending person_A/person_B)
    # removes the shadowing entirely.
    for (person_A, person_B) in graph.entity_pairs:
        # Neighbourhood = all entities linked to the person's postcode node.
        postcode_A = [
            t for (h, t, r) in graph.triples
            if h == person_A and r == postcode_rel_id
        ][0]
        neighbours_A = [h for (h, t, r) in graph.triples if t == postcode_A]
        postcode_B = [
            t for (h, t, r) in graph.triples
            if h == person_B and r == postcode_rel_id
        ][0]
        neighbours_B = [h for (h, t, r) in graph.triples if t == postcode_B]

        cost_matrix = np.zeros(shape=(len(neighbours_A), len(neighbours_B)))
        for i in range(len(neighbours_A)):
            for j in range(len(neighbours_B)):
                if neighbours_A[i] == neighbours_B[j]:
                    # Same node on both sides: large cost so the assignment
                    # avoids trivially matching an entity to itself.
                    cost_matrix[i][j] = 100
                else:
                    cost_matrix[i][j] = abs(
                        spatial.distance.cosine(
                            ent_embeddings[neighbours_A[i]],
                            ent_embeddings[neighbours_B[j]]))

        # Minimal-cost alignment of the two neighbourhoods.
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        person_A_index = neighbours_A.index(person_A)
        person_B_index = neighbours_B.index(person_B)
        distance = cost_matrix[row_ind, col_ind].sum() + \
            cost_matrix[person_A_index][person_B_index]
        result_prob.append((person_A, person_B, distance))

    #Write Embeddings to file
    export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                      ent_embeddings)
    export_result_prob(FEBRL, 'er', str(model), 'TransE.Household',
                       graph.entity, result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)

    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        # FIX: narrowed the original bare `except:` and log the failure.
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return max_fscore
def test_census(self, params=None):
    """Household-aware TransE matching on the Census dataset.

    Each candidate pair is scored by optimally aligning the two persons'
    households (Hungarian assignment over pairwise embedding distances),
    adding the pair's own distance as a penalty when the optimal assignment
    did not select the pair.

    Returns:
        Best F-score over the distance-threshold sweep.
    """
    if not params:
        params = self.get_default_params()
    #Load Graph Data
    graph = Graph_ER(Census)
    model = Census()
    logger = get_logger('RL.Test.TransE.Household.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddings = transe.get_ent_embeddings()

    #Experimenting household matching
    result_prob = []
    for ep_index in range(0, len(graph.entity_pairs)):
        # Relation ids > 6 link a person to a household node — assumption
        # carried over from the original code; confirm against Graph_ER.
        household_A = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][0] and r > 6
        ][0]
        family_members_A = [
            h for (h, t, r) in graph.triples if t == household_A
        ]
        household_B = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][1] and r > 6
        ][0]
        family_members_B = [
            h for (h, t, r) in graph.triples if t == household_B
        ]

        cost_matrix = np.zeros(shape=(len(family_members_A),
                                      len(family_members_B)))
        for i in range(len(family_members_A)):
            for j in range(len(family_members_B)):
                cost_matrix[i][j] = abs(
                    spatial.distance.cosine(
                        ent_embeddings[family_members_A[i]],
                        ent_embeddings[family_members_B[j]]))

        # Minimal-cost alignment of the two households.
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        eA_index = family_members_A.index(graph.entity_pairs[ep_index][0])
        eB_index = family_members_B.index(graph.entity_pairs[ep_index][1])
        rowA = np.where(row_ind == eA_index)[0]
        if len(rowA) and col_ind[rowA[0]] == eB_index:
            # The candidate pair itself is part of the minimal assignment.
            distance = cost_matrix[row_ind, col_ind].sum()
        else:
            # Penalize pairs the optimal assignment did not select.
            distance = cost_matrix[row_ind, col_ind].sum() + abs(
                spatial.distance.cosine(
                    ent_embeddings[graph.entity_pairs[ep_index][0]],
                    ent_embeddings[graph.entity_pairs[ep_index][1]]))
        # FIX: the original appended graph.entity_pairs[i], where `i` was
        # the leftover inner cost-matrix loop index — every score was paired
        # with the wrong entities. Use the outer index `ep_index`.
        result_prob.append((graph.entity_pairs[ep_index][0],
                            graph.entity_pairs[ep_index][1], distance))
        if ep_index % 1000 == 0:
            logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                        distance,
                        graph.entity_pairs[ep_index] in graph.true_pairs)

    #Normalize distance
    max_distance = 10
    result_prob = [(r[0], r[1], (r[2] / max_distance)) for r in result_prob]
    for r in result_prob[:100]:
        logger.info("distance: %f true_pairs: %s", r[2],
                    (r[0], r[1]) in graph.true_pairs)

    export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                      ent_embeddings)
    export_result_prob(Census, 'er', str(model), 'TransE.Household',
                       graph.entity, result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)

    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        # FIX: narrowed the original bare `except:` and log the failure.
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return max_fscore
def test_ecm(self):
    """ECM (unsupervised EM) record linkage on the Census dataset: train,
    validate, test, log IR metrics, then export probabilities, FP/FN and
    human-readable results keyed by entity display names."""
    logger = get_logger('RL.Test.ECMClassifier.Census')
    census = Census()

    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.candidate_links, census.trainDataA,
                                  census.trainDataB)
    logger.info("Train Features %s", str(features.describe()))

    # Train ECM Classifier
    logrg = recordlinkage.ECMClassifier()
    logrg.fit(features)
    result = logrg.predict(features)
    log_quality_results(logger, result, census.true_links,
                        len(census.candidate_links))

    #Validate the classifier
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.val_links, census.valDataA,
                                  census.valDataB)
    logger.info("Validation Features %s", str(features.describe()))
    result = logrg.predict(features)
    log_quality_results(logger, result, census.true_val_links,
                        len(census.val_links))

    #Test the classifier
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.test_links, census.testDataA,
                                  census.testDataB)
    logger.info("Test Features %s", str(features.describe()))
    result = logrg.predict(features)
    log_quality_results(logger, result, census.true_test_links,
                        len(census.test_links))
    logger.info("ECM weights: %s", str(logrg.weights))

    #Log IR Stats: MRR, MAP, MP@K
    # Distance = 1 - match probability.
    prob_series = logrg.prob(features)
    prob = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(census.test_links[i][0], census.test_links[i][1],
                    prob[i]) for i in range(0, len(prob))]
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             census.true_test_links)
    ir_metrics.log_metrics(logger)

    #Export False Positives and result porobabilities
    # Feature values of each predicted match, kept for the human-readable
    # export at the end.
    result_feature_mapping = [
        (e1, e2, [str(v) for v in features.loc[(e1, e2)].values], d)
        for (e1, e2, d) in result_prob if (e1, e2) in result
    ]

    # Entity display name "<individual id>_<DNI>"; iloc (positional) and
    # loc (label) variants are needed because row labels differ from
    # positions.
    get_entity_name = lambda c, d, i: "_".join([
        str(d.iloc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.iloc[i][c.field_map[CensusFields.DNI]])
    ])
    get_entity_name_loc = lambda c, d, i: "_".join([
        str(d.loc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.loc[i][c.field_map[CensusFields.DNI]])
    ])

    start_time = timeit.default_timer()
    entitiesA = [
        get_entity_name(census, census.testDataA, i)
        for i in range(int(census.testDataA.shape[0]))
    ]
    entitiesB = [
        get_entity_name(census, census.testDataB, i)
        for i in range(int(census.testDataB.shape[0]))
    ]
    logger.info("Entities built in %s",
                str(timeit.default_timer() - start_time))

    # Re-key results/links from dataframe labels to positions in
    # entitiesA/entitiesB for the export helpers.
    start_time = timeit.default_timer()
    result_prob = [(entitiesA.index(
        get_entity_name_loc(census, census.testDataA, int(a))),
                    entitiesB.index(
                        get_entity_name_loc(census, census.testDataB,
                                            int(b))), p)
                   for (a, b, p) in result_prob]
    logger.info("Result prob in %s",
                str(timeit.default_timer() - start_time))
    start_time = timeit.default_timer()
    true_links = [(entitiesA.index(
        get_entity_name_loc(census, census.testDataA, int(a))),
                   entitiesB.index(
                       get_entity_name_loc(census, census.testDataB,
                                           int(b))))
                  for (a, b) in census.true_test_links]
    logger.info("true_links in %s", str(timeit.default_timer() - start_time))

    start_time = timeit.default_timer()
    export_result_prob(Census, 'ECM', 'census', 'ecm', entitiesA,
                       result_prob, true_links, entitiesB)
    logger.info("Result prob EXPORTED in %s",
                str(timeit.default_timer() - start_time))
    start_time = timeit.default_timer()
    result = [(entitiesA.index(
        get_entity_name_loc(census, census.testDataA, int(a))),
               entitiesB.index(
                   get_entity_name_loc(census, census.testDataB, int(b))))
              for (a, b) in result]
    export_false_negatives(Census, 'ECM', 'census', 'ecm', entitiesA,
                           result_prob, true_links, result, entitiesB)
    export_false_positives(Census, 'ECM', 'census', 'ecm', entitiesA,
                           result_prob, true_links, result, entitiesB)
    logger.info("FP & FN EXPORTED in %s",
                str(timeit.default_timer() - start_time))

    result_feature_mapping = [
        (entitiesA.index(
            get_entity_name_loc(census, census.testDataA, int(a))),
         entitiesB.index(
             get_entity_name_loc(census, census.testDataB, int(b))), w, p)
        for (a, b, w, p) in result_feature_mapping
    ]
    export_human_readable_results(Census, 'ECM', 'census', 'ecm', entitiesA,
                                  result_feature_mapping, entitiesB)
    logger.info("Exported Human Readable Results")
def test_census_new(self):
    """Train RLTransE value embeddings on the Census VEG graph and use the
    per-field embedding residuals (v_a + r - v_b) as features for a
    logistic-regression record-linkage classifier."""
    c = Census()
    graph = Graph_VEG(Census)
    logger = get_logger("RL.Test.LogisticRLTransE.Census")
    logger.info("values for name : %s",
                str(graph.relation_value_map[graph.relation[1]][:10]))
    logger.info("relation: %s", str(graph.relation))
    logger.info("train_triples: %s", str(graph.train_triples[:10]))
    logger.info("set train_triples size %d", len(set(graph.train_triples)))
    params = self.get_default_params()

    transe = RLTransE(graph,
                      dimension=params['dimension'],
                      learning_rate=params['learning_rate'],
                      margin=params['margin'],
                      regularizer_scale=params['regularizer_scale'],
                      batchSize=params['batchSize'],
                      neg_rate=params['neg_rate'],
                      neg_rel_rate=params['neg_rel_rate'])
    loss, val_loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f val_loss: %f", loss,
                val_loss)

    value_embeddings = transe.get_val_embeddings()
    relation_embeddings = transe.get_rel_embeddings()

    #Map of feilds in census dataFrame to VEG relations.
    field_relation_map = {
        c.field_map[CensusFields.FIRST_NAME]: "name",
        c.field_map[CensusFields.SURNAME_1]: "surname",
        c.field_map[CensusFields.SURNAME_2]: "surname2",
        c.field_map[CensusFields.YOB]: "yob",
        c.field_map[CensusFields.CIVIL_STATUS]: "civil",
        c.field_map[CensusFields.OCCUPATION]: "occupation",
        c.field_map[CensusFields.RELATION]: "relation"
    }

    missing_values = []
    train_features = []  #Size samples*(dimension*rel_count)
    test_features = []
    # Build feature rows for both the training and the test candidate links.
    for (candidate_links, dataA, dataB, features) in \
        [(c.candidate_links, c.trainDataA, c.trainDataB, train_features),
         (c.test_links, c.testDataA, c.testDataB, test_features)]:
        for (a, b) in candidate_links:
            row_a = dataA.loc[a]
            row_b = dataB.loc[b]

            distance = []
            for f in field_relation_map:
                val_a = row_a[f]
                val_b = row_b[f]
                # NOTE(review): when the two values are equal nothing is
                # appended, so feature positions are not aligned across rows
                # and the later fillna(1) marks the trailing (equal) fields
                # as maximally distant — confirm this is intended.
                if val_a != val_b:
                    rel = field_relation_map[f]
                    try:
                        val_index_a = graph.relation_value_map[rel].index(
                            val_a)
                    except ValueError:
                        # Unseen value: penalize with a unit-distance vector.
                        missing_values.append(val_a)
                        distance.extend([1.0] * params['dimension'])
                        continue
                    try:
                        val_index_b = graph.relation_value_map[rel].index(
                            val_b)
                    except ValueError:
                        missing_values.append(val_b)
                        distance.extend([1.0] * params['dimension'])
                        continue
                    rel_index = graph.relation.index(field_relation_map[f])
                    # Translation residual v_a + r - v_b, one scalar per
                    # embedding dimension.
                    distance.extend(value_embeddings[rel][val_index_a] +
                                    relation_embeddings[rel_index] -
                                    value_embeddings[rel][val_index_b])
            features.append(pd.Series(distance).rename((a, b)))
    logger.info("No. of missing values: %d", len(missing_values))
    logger.info("Unique No. of missing values: %d",
                len(set(missing_values)))

    train_features = pd.DataFrame(data=train_features).fillna(1)
    test_features = pd.DataFrame(data=test_features).fillna(1)
    logger.info("Shape of Train features: %s", str(train_features.shape))
    logger.info("Shape of Test features: %s", str(test_features.shape))

    #Train Logistic Regression Model
    logrg = recordlinkage.LogisticRegressionClassifier()
    logrg.fit(train_features, c.true_links)
    result = logrg.predict(train_features)
    result = pd.MultiIndex.from_tuples(result.to_series())
    log_quality_results(logger, result, c.true_links,
                        len(c.candidate_links), params)

    #Test Classifier
    result = logrg.predict(test_features)
    result = pd.MultiIndex.from_tuples(result.to_series())
    log_quality_results(logger, result, c.true_test_links,
                        len(c.test_links), params)
    """ Todo: Export Embeddings and probabilities.
    try:
        entities = ["value\trelation"]
        for r in graph.relation_value_map:
            for v in graph.relation_value_map[r]:
                entities.append("\t".join([v,r]))
        embeddings = []
        for rel in value_embeddings:
            val_count = len(graph.relation_value_map[rel])
            embeddings.extend(value_embeddings[rel][:val_count])
        #Write Embeddings to file
        export_embeddings('veg', str(c), 'LogisticRLTransE', entities,
                            embeddings)
    except Exception as e:
        logger.error("Failed to export embeddings")
        logger.error(e)
    export_result_prob(Census, 'veg', str(c), 'RLTransE', graph.values,
                        result_prob, c.true_test_links)
    """
    # IR metrics: distance = 1 - match probability on the test split.
    prob_series = logrg.prob(test_features)
    prob = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(c.test_links[i][0], c.test_links[i][1], prob[i])
                   for i in range(0, len(prob))]
    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, c.true_test_links)
    precison_at_1 = ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
def _test_rl_transe(self, model, field_relation_map, params):
    """Train RLTransE on a value-embedding graph (VEG) and evaluate record
    linkage on the test split.

    Each candidate pair is scored by summing, per mapped field, the cosine
    distance between (value_a + relation) and value_b embeddings; values
    unseen during training contribute a fixed distance of 1.

    Args:
        model: dataset class for Graph_VEG (e.g. Census).
        field_relation_map: dict mapping dataframe columns to VEG relations.
        params: hyperparameter dict.

    Returns:
        (max_fscore, precison_at_1)
    """
    dataset = model()
    graph = Graph_VEG(model)
    logger = get_logger("RL.Test.RLTransE." + str(dataset))
    logger.info("values for name : %s",
                str(graph.relation_value_map[graph.relation[1]][:10]))
    logger.info("relation: %s", str(graph.relation))
    logger.info("train_triples: %s", str(graph.train_triples[:10]))
    logger.info("set train_triples size %d", len(set(graph.train_triples)))

    transe = RLTransE(graph,
                      dimension=params['dimension'],
                      learning_rate=params['learning_rate'],
                      margin=params['margin'],
                      regularizer_scale=params['regularizer_scale'],
                      batchSize=params['batchSize'],
                      neg_rate=params['neg_rate'],
                      neg_rel_rate=params['neg_rel_rate'])
    loss, val_loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f val_loss: %f", loss,
                val_loss)

    value_embeddings = transe.get_val_embeddings()
    relation_embeddings = transe.get_rel_embeddings()

    result_prob = []
    distance_distribution = []
    missing_values = []
    for (a, b) in dataset.test_links:
        row_a = dataset.testDataA.loc[a]
        row_b = dataset.testDataB.loc[b]

        distance = 0
        dd = []  # per-field distances, kept for human-readable export
        for f in field_relation_map:
            val_a = row_a[f]
            val_b = row_b[f]
            if val_a == val_b:
                dd.append(0)
                continue
            rel = field_relation_map[f]
            try:
                val_index_a = graph.relation_value_map[rel].index(val_a)
            except ValueError:
                missing_values.append(val_a)
                distance = distance + 1
                dd.append(1)
                continue
            try:
                val_index_b = graph.relation_value_map[rel].index(val_b)
            except ValueError:
                missing_values.append(val_b)
                distance = distance + 1
                dd.append(1)
                continue
            rel_index = graph.relation.index(rel)
            cur_distance = abs(
                spatial.distance.cosine(
                    value_embeddings[rel][val_index_a] +
                    relation_embeddings[rel_index],
                    value_embeddings[rel][val_index_b]))
            distance = distance + cur_distance
            dd.append(cur_distance)

        result_prob.append((a, b, distance))
        distance_distribution.append((a, b, dd, distance))
    logger.info("No. of missing values: %d", len(missing_values))
    logger.info("Unique No. of missing values: %d",
                len(set(missing_values)))

    try:
        entities = ["value\trelation"]
        for r in graph.relation_value_map:
            for v in graph.relation_value_map[r]:
                entities.append("\t".join([v, r]))
        embeddings = []
        for rel in value_embeddings:
            val_count = len(graph.relation_value_map[rel])
            embeddings.extend(value_embeddings[rel][:val_count])
        #Write Embeddings to file
        export_embeddings('veg', str(dataset), 'RLTransE_val', entities,
                          embeddings)
        export_embeddings('veg', str(dataset), 'RLTransE_rel',
                          graph.relation, relation_embeddings)
    except Exception as e:
        logger.error("Failed to export embeddings")
        logger.error(e)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=3.0, step=0.02)
    # FIX: `result` was only assigned inside the try; when it failed, the
    # export code below raised NameError. Default to an empty selection.
    result = []
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        # FIX: narrowed the original bare `except:` and log the failure.
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             dataset.true_test_links)
    precison_at_1 = ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()

    #Export False Positives and result porobabilities
    # Entity display name "<individual id>_<DNI>"; iloc (positional) and
    # loc (label) variants.
    get_entity_name = lambda d, i: "_".join([
        str(d.iloc[i][dataset.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.iloc[i][dataset.field_map[CensusFields.DNI]])
    ])
    get_entity_name_loc = lambda d, i: "_".join([
        str(d.loc[i][dataset.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.loc[i][dataset.field_map[CensusFields.DNI]])
    ])
    entitiesA = [
        get_entity_name(dataset.testDataA, i)
        for i in range(int(dataset.testDataA.shape[0]))
    ]
    entitiesB = [
        get_entity_name(dataset.testDataB, i)
        for i in range(int(dataset.testDataB.shape[0]))
    ]
    # Re-key results/links from dataframe labels to entity-list positions.
    result_prob = [
        (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
         entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))), p)
        for (a, b, p) in result_prob
    ]
    true_links = [
        (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
         entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))))
        for (a, b) in dataset.true_test_links
    ]
    export_result_prob(Census, 'veg', 'census', 'rltranse', entitiesA,
                       result_prob, true_links, entitiesB)
    # FIX: the original comprehension re-keyed with the loop-leaked `a`/`b`
    # from the scoring loop instead of the comprehension's own `e1`/`e2`,
    # exporting the same (last) pair's names for every row.
    distance_distribution = [
        (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(e1))),
         entitiesB.index(get_entity_name_loc(dataset.testDataB, int(e2))),
         [str("%.2f" % (float(w))) for w in dd], 1 - d)
        for (e1, e2, dd, d) in distance_distribution if (e1, e2) in result
    ]
    export_human_readable_results(Census, 'veg', 'census', 'rltranse',
                                  entitiesA, distance_distribution,
                                  entitiesB)
    result = [
        (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
         entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))))
        for (a, b) in result
    ]
    export_false_negatives(Census, 'veg', 'census', 'rltranse', entitiesA,
                           result_prob, true_links, result, entitiesB)
    export_false_positives(Census, 'veg', 'census', 'rltranse', entitiesA,
                           result_prob, true_links, result, entitiesB)
    return (max_fscore, precison_at_1)
def test_logistic(self):
    """Train, validate and test a logistic-regression linker on Census data.

    Pipeline: compute comparison features for train/validation/test link
    sets, fit recordlinkage's LogisticRegressionClassifier on the training
    features, log quality and IR metrics, then export result probabilities,
    false negatives/positives and a human-readable feature breakdown with
    entity names of the form "<id_individual>_<dni>".
    """
    logger = get_logger('RL.Test.LogisticRegression.Census')
    census = Census()
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.candidate_links, census.trainDataA,
                                  census.trainDataB)
    logger.info("Train Features %s", str(features.describe()))
    # Train ECM Classifier
    logrg = recordlinkage.LogisticRegressionClassifier()
    logrg.fit(features, census.true_links)
    result = logrg.predict(features)
    log_quality_results(logger, result, census.true_links,
                        len(census.candidate_links))
    #Validate the classifier
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.val_links, census.valDataA,
                                  census.valDataB)
    logger.info("Validation Features %s", str(features.describe()))
    result = logrg.predict(features)
    log_quality_results(logger, result, census.true_val_links,
                        len(census.val_links))
    #Test the classifier
    compare_cl = census.get_comparision_object()
    features = compare_cl.compute(census.test_links, census.testDataA,
                                  census.testDataB)
    logger.info("Test Features %s", str(features.describe()))
    result = logrg.predict(features)
    log_quality_results(logger, result, census.true_test_links,
                        len(census.test_links))
    logger.info("logrg coefficients: %s", str(logrg.coefficients))
    #Log IR Stats: MRR, MAP, MP@K
    # Convert match probability to a distance-like score (lower = better).
    prob_series = logrg.prob(features)
    prob = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(census.test_links[i][0], census.test_links[i][1], prob[i])
                   for i in range(0, len(prob))]
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             census.true_test_links)
    ir_metrics.log_metrics(logger)
    #Export False Positives and result porobabilities
    # Keep the raw feature vector of every predicted-positive pair so it can
    # be exported in human-readable form below.
    result_feature_mapping = [
        (e1, e2, [str(v) for v in features.loc[(e1, e2)].values], d)
        for (e1, e2, d) in result_prob if (e1, e2) in result
    ]
    # NOTE(review): assigning lambdas to names (E731); consider small `def`s.
    # Positional (iloc) naming, used to build the full entity-name lists.
    get_entity_name = lambda c, d, i: "_".join([
        str(d.iloc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.iloc[i][c.field_map[CensusFields.DNI]])
    ])
    # Label-based (loc) naming, used to translate link ids to positions.
    get_entity_name_loc = lambda c, d, i: "_".join([
        str(d.loc[i][c.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.loc[i][c.field_map[CensusFields.DNI]])
    ])
    entitiesA = [
        get_entity_name(census, census.testDataA, i)
        for i in range(int(census.testDataA.shape[0]))
    ]
    entitiesB = [
        get_entity_name(census, census.testDataB, i)
        for i in range(int(census.testDataB.shape[0]))
    ]
    # Remap dataframe labels to positional indices within entitiesA/B
    # (list.index on every pair is O(n) each — slow for large test sets).
    result_prob = [(entitiesA.index(
        get_entity_name_loc(census, census.testDataA, int(a))),
                    entitiesB.index(
                        get_entity_name_loc(census, census.testDataB, int(b))),
                    p) for (a, b, p) in result_prob]
    true_links = [(entitiesA.index(
        get_entity_name_loc(census, census.testDataA, int(a))),
                   entitiesB.index(
                       get_entity_name_loc(census, census.testDataB, int(b))))
                  for (a, b) in census.true_test_links]
    export_result_prob(Census, 'LogisticRegression', 'census', 'logistic',
                       entitiesA, result_prob, true_links, entitiesB)
    result = [(entitiesA.index(
        get_entity_name_loc(census, census.testDataA, int(a))),
               entitiesB.index(
                   get_entity_name_loc(census, census.testDataB, int(b))))
              for (a, b) in result]
    export_false_negatives(Census, 'LogisticRegression', 'census', 'logistic',
                           entitiesA, result_prob, true_links, result,
                           entitiesB)
    export_false_positives(Census, 'LogisticRegression', 'census', 'logistic',
                           entitiesA, result_prob, true_links, result,
                           entitiesB)
    weights = logrg.coefficients
    # NOTE(review): this `result` (per-feature weighted contributions) is
    # never read after assignment — looks like dead code or a lost export;
    # confirm intent before removing.
    result = [
        (e1, e2,
         [str("%.2f" % (float(d * w) / sum(weights)))
          for w in weights], d)
        for (e1, e2, d) in result_prob if (e1, e2) in result
    ]
    # Translate the human-readable mapping to positional indices as well.
    result_feature_mapping = [
        (entitiesA.index(
            get_entity_name_loc(census, census.testDataA, int(a))),
         entitiesB.index(
             get_entity_name_loc(census, census.testDataB, int(b))), w, p)
        for (a, b, w, p) in result_feature_mapping
    ]
    export_human_readable_results(Census, 'LogisticRegression', 'census',
                                  'logistic', entitiesA,
                                  result_feature_mapping, entitiesB)
def test_veer(self):
    """Train and evaluate the VEER model on the Census dataset.

    Trains VEER on six harmonised Sant Feliu columns, sweeps a distance
    threshold to find the best F-score, logs IR metrics (MAP, MRR, Hits@K)
    and exports embeddings, result probabilities, false negatives/positives
    and a per-column cosine-distance breakdown for predicted matches.
    """
    logger = get_logger('RL.Test.VEER.Census')
    dataset = Census()
    #Columns of interest for Sant Feliu town
    columns = [
        'Noms_harmo', 'cognom_1', 'cohort', 'estat_civil', 'parentesc_har',
        'ocupacio_hisco'
    ]
    params = {
        'learning_rate': 0.1,
        'margin': 0.1,
        'dimension': 32,
        'epochs': 50,
        'regularizer_scale': 0.1,
        'batchSize': 512
    }
    veer = VEER(Census,
                columns,
                dimension=params['dimension'],
                learning_rate=params['learning_rate'],
                margin=params['margin'],
                regularizer_scale=params['regularizer_scale'],
                batchSize=params['batchSize'])
    #Train Model
    loss, val_loss = veer.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                val_loss)
    #Test Model
    result_prob, accuracy = veer.test()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(veer.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))
    logger.info("Sample embeddings: %s", str(veer.get_val_embeddings()[0]))
    #Compute Performance measures
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        # Pairs at or below the optimal distance threshold count as matches.
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        # NOTE(review): if this branch fires, `result` stays undefined and
        # the comprehensions below will raise NameError — confirm intended.
        logger.info("Zero Reults")
        logger.error(e)
    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             dataset.true_test_links)
    precison_at_1 = ir_metrics.log_metrics(logger, params)
    #Export embeddings
    embeddings = veer.get_val_embeddings()
    export_embeddings('veg', 'census', 'veer', veer.values, embeddings)
    #Write Result Prob to file
    # For every predicted match, record the per-column cosine distance
    # between the two records' value embeddings.
    result_feature_mapping = [(e1, e2, [
        str(
            abs(
                spatial.distance.cosine(
                    embeddings[veer.values.index(
                        veer._clean(dataset.testDataA.loc[e1][c]))],
                    embeddings[veer.values.index(
                        veer._clean(dataset.testDataB.loc[e2][c]))])))
        for c in columns
    ], d) for (e1, e2, d) in result_prob if (e1, e2) in result]
    entitiesA = dataset.get_entity_names(dataset.testDataA)
    entitiesB = dataset.get_entity_names(dataset.testDataB)
    # Map dataframe row labels (index names) to positional indices.
    index_dictA = {
        str(dataset.testDataA.iloc[i]._name): i
        for i in range(dataset.testDataA.shape[0])
    }
    index_dictB = {
        str(dataset.testDataB.iloc[i]._name): i
        for i in range(dataset.testDataB.shape[0])
    }
    result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p)
                   for (a, b, p) in result_prob]
    # NOTE(review): result_prob is now positional while true_test_links and
    # `result` below still carry original labels; the export helpers are
    # given mixed index spaces — verify against the exporter's expectations.
    export_result_prob(dataset, 'veg', str(dataset), 'VEER', entitiesA,
                       result_prob, dataset.true_test_links, entitiesB)
    export_false_negatives(Census, 'veg', str(dataset), 'VEER', entitiesA,
                           result_prob, dataset.true_test_links, result,
                           entitiesB)
    export_false_positives(Census, 'veg', str(dataset), 'VEER', entitiesA,
                           result_prob, dataset.true_test_links, result,
                           entitiesB)
    result_feature_mapping = [(index_dictA[str(a)], index_dictB[str(b)], w, p)
                              for (a, b, w, p) in result_feature_mapping]
    export_human_readable_results(Census, 'veg', str(dataset), 'VEER',
                                  entitiesA, result_feature_mapping,
                                  entitiesB)
    veer.close_tf_session()
def test_cora(self, params=None):
    """End-to-end TransE evaluation on Cora with author-set ("household") matching.

    Trains TransE on the Cora ER graph, then scores every candidate entity
    pair by combining (a) the minimum-cost bipartite alignment of the two
    papers' author embeddings (Hungarian algorithm) and (b) the cosine
    distance between the paper embeddings themselves.  Logs quality and IR
    metrics and exports embeddings plus scored pairs.

    Args:
        params: hyper-parameter dict; defaults to ``self.get_default_params()``.

    Returns:
        The maximum F-score found by the threshold sweep.
    """
    if not params:
        params = self.get_default_params()
    #Load Graph Data
    graph = Graph_ER(Cora)
    model = Cora()
    logger = get_logger('RL.Test.TransE.Household.' + str(model))
    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddings = transe.get_ent_embeddings()
    #Experimenting household matching
    auth_rel_index = graph.relation.index('author')
    result_prob = []
    for ep_index in range(0, len(graph.entity_pairs)):
        # Authors attached (via the 'author' relation) to each side.
        authors_A = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][0] and r == auth_rel_index
        ]
        authors_B = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][1] and r == auth_rel_index
        ]
        # Pairwise cosine distances between author embeddings.
        cost_matrix = np.zeros(shape=(len(authors_A), len(authors_B)))
        for i in range(len(authors_A)):
            for j in range(len(authors_B)):
                cost_matrix[i][j] = abs(
                    spatial.distance.cosine(ent_embeddings[authors_A[i]],
                                            ent_embeddings[authors_B[j]]))
        # Optimal author-to-author assignment cost (Hungarian algorithm).
        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        distance = cost_matrix[row_ind, col_ind].sum() + abs(
            spatial.distance.cosine(
                ent_embeddings[graph.entity_pairs[ep_index][0]],
                ent_embeddings[graph.entity_pairs[ep_index][1]]))
        result_prob.append((graph.entity_pairs[ep_index][0],
                            graph.entity_pairs[ep_index][1], distance))
        if distance <= 0.05:
            logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                        distance,
                        graph.entity_pairs[ep_index] in graph.true_pairs)
    #Write Embeddings to file
    export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                      ent_embeddings)
    export_result_prob(Cora, 'er', str(model), 'TransE.Household',
                       graph.entity, result_prob, graph.true_pairs)
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        # Fixed: was a bare `except:` that swallowed everything (including
        # SystemExit/KeyboardInterrupt) silently; now narrowed and the
        # error is logged, matching the test_veer/_test_veer handlers.
        logger.info("Zero Reults")
        logger.error(e)
    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    ir_metrics.log_metrics(logger, params)
    transe.close_tf_session()
    return max_fscore
def test_mean_average_precision(self):
    """MAP reported by InformationRetrievalMetrics matches hand-worked values."""

    def mean_avg_precision(scored_pairs, relevant_pairs):
        # Build the metrics object and extract only the MAP score.
        metrics = InformationRetrievalMetrics(scored_pairs, relevant_pairs)
        return metrics.get_mean_average_precision()

    # Two queries, one relevant hit each; second hit ranked last for query 2.
    scored = [(0, 1, 0.1), (0, 2, 0.3), (1, 2, 0.5), (2, 3, 0.2), (2, 4, 0.9)]
    relevant = [(0, 1), (2, 4)]
    self.assertEqual(mean_avg_precision(scored, relevant), 0.75)
    # Dropping the poorly-ranked pair and its truth entry gives perfect MAP.
    self.assertEqual(mean_avg_precision(scored[:4], relevant[:1]), 1)

    # Two queries with six candidates each; relevant items sit at ranks
    # 1, 4, 5, 6 (query 0) and 4, 5, 6 (query 1).
    scored = [
        (0, 1, 0.1),
        (0, 2, 0.2),
        (0, 3, 0.3),
        (0, 4, 0.4),
        (0, 5, 0.5),
        (0, 6, 0.6),
        (1, 0, 0.1),
        (1, 2, 0.2),
        (1, 3, 0.3),
        (1, 4, 0.4),
        (1, 5, 0.5),
        (1, 6, 0.6),
    ]
    relevant = [(0, 1), (0, 4), (0, 5), (0, 6), (1, 4), (1, 5), (1, 6)]
    self.assertEqual(round(mean_avg_precision(scored, relevant), 2), 0.54)

    # Three queries with uneven candidate counts.
    scored = [(0, 2, 0.1), (0, 1, 0.2), (2, 3, 0.1), (2, 4, 0.5),
              (3, 1, 0.2), (3, 2, 0.4), (3, 4, 0.8)]
    relevant = [(0, 1), (2, 3), (3, 4)]
    self.assertEqual(round(mean_avg_precision(scored, relevant), 2), 0.61)
def _test_veer(self, model, columns, params):
    """Generic VEER train/evaluate helper shared by dataset-specific tests.

    Args:
        model: dataset class (e.g. Census); instantiated to load the data.
        columns: record columns VEER should embed and compare.
        params: hyper-parameter dict (dimension, learning_rate, margin,
            regularizer_scale, batchSize, epochs).

    Returns:
        Tuple of (max F-score over the threshold sweep, precision@1).
    """
    #Load Graph Data
    dataset = model()
    logger = get_logger('RL.Test.VEER.' + str(dataset))
    veer = VEER(model,
                columns,
                dimension=params['dimension'],
                learning_rate=params['learning_rate'],
                margin=params['margin'],
                regularizer_scale=params['regularizer_scale'],
                batchSize=params['batchSize'])
    #Train Model
    loss, val_loss = veer.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                val_loss)
    #Test Model
    result_prob, accuracy = veer.test()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(veer.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))
    logger.info("Sample embeddings: %s", str(veer.get_val_embeddings()[0]))
    #Compute Performance measures
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        # Pairs at or below the optimal distance threshold count as matches.
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        # NOTE(review): if this fires, `result` stays undefined and the
        # remapping below raises NameError — confirm intended behavior.
        logger.info("Zero Reults")
        logger.error(e)
    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             dataset.true_test_links)
    precison_at_1 = ir_metrics.log_metrics(logger, params)
    #Write Result Prob to file
    entitiesA = dataset.get_entity_names(dataset.testDataA)
    entitiesB = dataset.get_entity_names(dataset.testDataB)
    # Map dataframe row labels (index names) to positional indices.
    index_dictA = {
        str(dataset.testDataA.iloc[i]._name): i
        for i in range(dataset.testDataA.shape[0])
    }
    index_dictB = {
        str(dataset.testDataB.iloc[i]._name): i
        for i in range(dataset.testDataB.shape[0])
    }
    # Remap everything to positional indices before exporting.
    result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p)
                   for (a, b, p) in result_prob]
    true_links = [(index_dictA[str(a)], index_dictB[str(b)])
                  for (a, b) in dataset.true_test_links]
    export_result_prob(dataset, 'veg', str(dataset), 'VEER', entitiesA,
                       result_prob, true_links, entitiesB)
    result = [(index_dictA[str(a)], index_dictB[str(b)])
              for (a, b) in result]
    export_false_negatives(model, 'veg', str(dataset), 'VEER', entitiesA,
                           result_prob, true_links, result, entitiesB)
    export_false_positives(model, 'veg', str(dataset), 'VEER', entitiesA,
                           result_prob, true_links, result, entitiesB)
    veer.close_tf_session()
    return (max_fscore, precison_at_1)
def _test_logistic_transh_erer(self, dataset, params):
    """Two-graph (ERER) TransH embeddings + logistic-regression linker.

    Trains TransH separately on the two sides of the ERER model, treats
    the resulting entity embeddings as feature columns, fits a logistic
    regression on prior pairs augmented with one sampled negative per
    positive, then evaluates on all candidate entity pairs and exports
    embeddings and scored pairs.
    """
    model = dataset()
    logger = get_logger('RL.Test.erer.LogisticTransH.ERER.' + str(model))
    entA, entB, relA, relB, triA, triB, entity_pairs, prior_pairs, true_pairs = model.get_erer_model(
    )
    self.assertTrue(all([(tp in entity_pairs) for tp in true_pairs]))
    #Generate embeddings for datasetA
    transh = TransH(entA,
                    relA,
                    triA,
                    prior_pairs,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddingsA = transh.get_ent_embeddings()
    transh.close_tf_session()
    # Release the first model's session/graph before building the second.
    del transh
    #Generate embeddings for datasetB
    transh = TransH(entB,
                    relB,
                    triB,
                    entity_pairs,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)
    ent_embeddingsB = transh.get_ent_embeddings()
    transh.close_tf_session()
    # One row per entity: embeddings become dataframe feature columns.
    ent_embeddingsA = [
        np.array(ent_embeddingsA[i]) for i in range(ent_embeddingsA.shape[0])
    ]
    ent_embeddingsB = [
        np.array(ent_embeddingsB[i]) for i in range(ent_embeddingsB.shape[0])
    ]
    trainDataA = pd.DataFrame(data=ent_embeddingsA)
    trainDataB = pd.DataFrame(data=ent_embeddingsB)
    #Define comparision Class
    compare_cl = recordlinkage.Compare()
    for i in range(0, params['dimension']):
        compare_cl.numeric(i, i, label=str(i))  #method='exp')
    #sample negative pairs
    # One random non-prior negative is sampled per positive pair.
    train_pairs = []
    tuple_pp = set(map(tuple, prior_pairs))
    logger.info("Number of prior_pairs: %d", len(prior_pairs))
    for e1, e2 in prior_pairs:
        train_pairs.append((e1, e2))
        while True:
            # NOTE(review): `xrange` is Python 2 only — this raises
            # NameError on Python 3; confirm target interpreter.
            neg_e2 = random.choice(xrange(0, len(entB)))
            if neg_e2 == e2 or (e1, neg_e2) in tuple_pp:
                continue
            else:
                train_pairs.append((e1, neg_e2))
                break
    logger.info("Number of Train Pairs: %d", len(train_pairs))
    candidate_links = pd.MultiIndex.from_tuples(train_pairs)
    features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
    logger.info("Train Features %s", str(features.describe()))
    #Train Logistic Regression Model
    logrg = recordlinkage.LogisticRegressionClassifier()
    # The prior (positive) pairs serve as the match labels for fitting.
    candidate_links = pd.MultiIndex.from_tuples(prior_pairs)
    logrg.fit(features, candidate_links)
    #Test Classifier
    compare_cl = recordlinkage.Compare()
    for i in range(0, params['dimension']):
        compare_cl.numeric(i, i, label=str(i))
    candidate_links = pd.MultiIndex.from_tuples(entity_pairs)
    features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
    logger.info("Test Features %s", str(features.describe()))
    result = logrg.predict(features)
    log_quality_results(logger, result, true_pairs, len(entity_pairs))
    # Convert match probability to a distance-like score (lower = better).
    prob_series = logrg.prob(features)
    prob = [(1 - p) for p in prob_series.tolist()]
    result_prob = [(entity_pairs[i][0], entity_pairs[i][1], prob[i])
                   for i in range(0, len(prob))]
    ir_metrics = InformationRetrievalMetrics(result_prob, true_pairs)
    # NOTE(review): `params` is passed twice; sibling calls use
    # log_metrics(logger, params) — likely a copy-paste slip, verify.
    ir_metrics.log_metrics(logger, params, params)
    #Export results
    export_embeddings('erer', str(model), 'LogTransH', entA, ent_embeddingsA)
    export_embeddings('erer', str(model), 'LogTransH', entB, ent_embeddingsB)
    export_result_prob(dataset, 'erer', str(model), 'LogTransH', entA,
                       result_prob, true_pairs, entB)