#Reconstructed imports: the scientific-stack imports are certain from usage;
#the project-internal names used below (Graph_ER, Graph_ERER, Graph_VEG,
#TransE, TransH, ETransE, RLTransE, VEER, WERL, get_logger,
#get_optimal_threshold, log_quality_results, InformationRetrievalMetrics,
#the export_* helpers, Cora, FEBRL, Census and CensusFields) are assumed to
#be imported from the surrounding repository, whose module paths are not
#shown here.
import numpy as np
import pandas as pd
from scipy import spatial
from scipy.optimize import linear_sum_assignment


def _test_transh(self, dataset, params):
    #Load Graph Data
    graph = Graph_ER(dataset)
    model = dataset()
    logger = get_logger('RL.Test.er.TransH.' + str(model))

    transh = TransH(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transh.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    #Score every candidate pair by the cosine distance of its embeddings
    ent_embeddings = transh.get_ent_embeddings()
    result_prob = []
    for i in range(0, len(graph.entity_pairs)):
        distance = abs(
            spatial.distance.cosine(ent_embeddings[graph.entity_pairs[i][0]],
                                    ent_embeddings[graph.entity_pairs[i][1]]))
        result_prob.append(
            (graph.entity_pairs[i][0], graph.entity_pairs[i][1], distance))

    #Write Embeddings to file
    export_embeddings('er', str(model), 'TransH', graph.entity,
                      ent_embeddings)
    export_result_prob(dataset, 'er', str(model), 'TransH', graph.entity,
                       result_prob, graph.true_pairs)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)
    try:
        logger.info("MAX FSCORE: %f AT : %f", max_fscore, optimal_threshold)
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        params['threshold'] = optimal_threshold
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
        export_false_negatives(dataset, 'er', str(model), 'TransH',
                               graph.entity, result_prob, graph.true_pairs,
                               result, graph.entity)
        export_false_positives(dataset, 'er', str(model), 'TransH',
                               graph.entity, result_prob, graph.true_pairs,
                               result, graph.entity)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    p_at_1 = ir_metrics.log_metrics(logger, params)

    transh.close_tf_session()
    return (max_fscore, p_at_1)
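#Illustrative sketch (not part of the original test suite): the scoring
#pattern shared by these tests ranks candidate pairs by cosine distance and
#sweeps a decision threshold to maximise F-score. The helper name and the
#brute-force sweep below are assumptions about what get_optimal_threshold
#does; the real implementation may differ.
def _sketch_optimal_threshold(result_prob, true_pairs, max_threshold=1.0,
                              step=0.01):
    best_threshold, best_fscore = 0.0, 0.0
    for threshold in np.arange(step, max_threshold, step):
        #Predict a match wherever the distance is below the threshold
        predicted = set((a, b) for (a, b, d) in result_prob if d <= threshold)
        if not predicted:
            continue
        tp = len(predicted & set(true_pairs))
        precision = tp / len(predicted)
        recall = tp / len(true_pairs)
        if precision + recall == 0:
            continue
        fscore = 2 * precision * recall / (precision + recall)
        if fscore > best_fscore:
            best_threshold, best_fscore = threshold, fscore
    return best_threshold, best_fscore


#Example (hypothetical numbers): with toy embeddings for three entities,
#the near-duplicate pair (0, 1) scores a much smaller distance than (0, 2):
#
#    emb = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
#    probs = [(a, b, abs(spatial.distance.cosine(emb[a], emb[b])))
#             for (a, b) in [(0, 1), (0, 2)]]
#    _sketch_optimal_threshold(probs, [(0, 1)])  #-> small threshold, F1 = 1.0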
def _test_rl_transe(self, dataset, params):
    #Load Graph Data
    graph = Graph_ER(dataset)
    model = dataset()
    logger = get_logger('RL.Test.er.RLTransE.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = transe.get_ent_embeddings()

    #Score pairs: base cosine distance between the entities, plus the
    #distance between tail values connected by the same relation
    result_prob = []
    for (a, b) in graph.entity_pairs:
        a_triples = [(h, t, r) for (h, t, r) in graph.triples if h == a]
        b_triples = [(h, t, r) for (h, t, r) in graph.triples if h == b]

        distance = abs(
            spatial.distance.cosine(ent_embeddings[a], ent_embeddings[b]))
        for (ah, at, ar) in a_triples:
            bt = [t for (h, t, r) in b_triples if r == ar]
            if len(bt):
                distance = distance + abs(
                    spatial.distance.cosine(ent_embeddings[at],
                                            ent_embeddings[bt[0]]))
        result_prob.append((a, b, distance))

    #Write Embeddings to file
    export_embeddings('er', str(model), 'RLTransE', graph.entity,
                      ent_embeddings)
    export_result_prob(dataset, 'er', str(model), 'RLTransE', graph.entity,
                       result_prob, graph.true_pairs)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs, max_threshold=3.0)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    precision_at_1 = ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return (max_fscore, precision_at_1)
def _test_etranse(self, dataset, params):
    model = dataset()
    graph = Graph_ERER(dataset)
    logger = get_logger("RL.Test.erer.ETransE." + str(model))

    etranse = ETransE(graph,
                      dimension=params['dimension'],
                      batchSize=params['batchSize'],
                      learning_rate=params['learning_rate'],
                      margin=params['margin'],
                      neg_rate=params['neg_rate'],
                      neg_rel_rate=params['neg_rel_rate'],
                      regularizer_scale=params['regularizer_scale'],
                      alpha=params['alpha'],
                      beta=params['beta'])
    etranse.train(max_epochs=params['max_epochs'])

    #Each side of the ERER graph has its own embedding space
    ent_embeddings_a = etranse.get_ent_embeddings_A()
    ent_embeddings_b = etranse.get_ent_embeddings_B()

    result_prob = []
    for i in range(0, len(graph.entity_pairs)):
        distance = abs(
            spatial.distance.cosine(
                ent_embeddings_a[int(graph.entity_pairs[i][0])],
                ent_embeddings_b[int(graph.entity_pairs[i][1])]))
        result_prob.append(
            (graph.entity_pairs[i][0], graph.entity_pairs[i][1], distance))

    #Write Embeddings to file
    export_embeddings('erer', str(model), 'ETransE', graph.entityA,
                      ent_embeddings_a)
    export_embeddings('erer', str(model), 'ETransE', graph.entityB,
                      ent_embeddings_b)
    export_result_prob(dataset, 'erer', str(model), 'ETransE', graph.entityA,
                       result_prob, graph.true_pairs, graph.entityB)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    prec_at_1 = ir_metrics.log_metrics(logger, params)

    etranse.close_tf_session()
    return (max_fscore, prec_at_1)
def _test_erer(self, dataset, er_algo, params):
    model = dataset()
    graph = Graph_ERER(dataset)
    graph_er = graph.get_er_model()

    er_model = er_algo(graph_er,
                       dimension=params['dimension'],
                       learning_rate=params['learning_rate'],
                       margin=params['margin'],
                       regularizer_scale=params['regularizer_scale'],
                       batchSize=params['batchSize'],
                       neg_rate=params['neg_rate'],
                       neg_rel_rate=params['neg_rel_rate'])
    loss = er_model.train(max_epochs=params['epochs'])
    logger = get_logger('RL.Test.erer.ERER.' + str(model) + "." +
                        str(er_model))
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = er_model.get_ent_embeddings()
    result_prob = []
    for i in range(0, len(graph_er.entity_pairs)):
        distance = abs(
            spatial.distance.cosine(
                ent_embeddings[graph_er.entity_pairs[i][0]],
                ent_embeddings[graph_er.entity_pairs[i][1]]))
        result_prob.append((graph_er.entity_pairs[i][0],
                            graph_er.entity_pairs[i][1], distance))

    #Write Embeddings to file
    export_embeddings("erer", str(model), str(er_model), graph_er.entity,
                      ent_embeddings)
    export_result_prob(dataset, 'erer', str(model), str(er_model),
                       graph_er.entity, result_prob, graph_er.true_pairs)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph_er.true_pairs)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph_er.true_pairs,
                            len(graph_er.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             graph_er.true_pairs)
    ir_metrics.log_metrics(logger, params)

    er_model.close_tf_session()
    return max_fscore
def _test_veer(self, model, columns, params):
    #Load Graph Data
    dataset = model()
    logger = get_logger('RL.Test.VEER.' + str(dataset))

    veer = VEER(model,
                columns,
                dimension=params['dimension'],
                learning_rate=params['learning_rate'],
                margin=params['margin'],
                regularizer_scale=params['regularizer_scale'],
                batchSize=params['batchSize'])

    #Train Model
    loss, val_loss = veer.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                val_loss)

    #Test Model
    result_prob, accuracy = veer.test()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(veer.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))
    logger.info("Sample embeddings: %s", str(veer.get_val_embeddings()[0]))

    #Compute Performance measures
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             dataset.true_test_links)
    precision_at_1 = ir_metrics.log_metrics(logger, params)

    #Write Result Prob to file: remap dataframe row ids to positional indices
    entitiesA = dataset.get_entity_names(dataset.testDataA)
    entitiesB = dataset.get_entity_names(dataset.testDataB)
    index_dictA = {
        str(dataset.testDataA.iloc[i]._name): i
        for i in range(dataset.testDataA.shape[0])
    }
    index_dictB = {
        str(dataset.testDataB.iloc[i]._name): i
        for i in range(dataset.testDataB.shape[0])
    }
    result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p)
                   for (a, b, p) in result_prob]
    true_links = [(index_dictA[str(a)], index_dictB[str(b)])
                  for (a, b) in dataset.true_test_links]
    export_result_prob(dataset, 'veg', str(dataset), 'VEER', entitiesA,
                       result_prob, true_links, entitiesB)

    result = [(index_dictA[str(a)], index_dictB[str(b)]) for (a, b) in result]
    export_false_negatives(model, 'veg', str(dataset), 'VEER', entitiesA,
                           result_prob, true_links, result, entitiesB)
    export_false_positives(model, 'veg', str(dataset), 'VEER', entitiesA,
                           result_prob, true_links, result, entitiesB)

    veer.close_tf_session()
    return (max_fscore, precision_at_1)
def test_cora(self, params=None):
    if not params:
        params = self.get_default_params()

    #Load Graph Data
    graph = Graph_ER(Cora)
    model = Cora()
    logger = get_logger('RL.Test.TransE.Household.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = transe.get_ent_embeddings()

    #Experimenting with author-set matching: align the author lists of the
    #two publications with the Hungarian algorithm and add the alignment
    #cost to the distance between the publication embeddings.
    auth_rel_index = graph.relation.index('author')
    result_prob = []
    for ep_index in range(0, len(graph.entity_pairs)):
        authors_A = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][0] and r == auth_rel_index
        ]
        authors_B = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][1] and r == auth_rel_index
        ]

        cost_matrix = np.zeros(shape=(len(authors_A), len(authors_B)))
        for i in range(len(authors_A)):
            for j in range(len(authors_B)):
                cost_matrix[i][j] = abs(
                    spatial.distance.cosine(ent_embeddings[authors_A[i]],
                                            ent_embeddings[authors_B[j]]))

        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        distance = cost_matrix[row_ind, col_ind].sum() + abs(
            spatial.distance.cosine(
                ent_embeddings[graph.entity_pairs[ep_index][0]],
                ent_embeddings[graph.entity_pairs[ep_index][1]]))
        result_prob.append((graph.entity_pairs[ep_index][0],
                            graph.entity_pairs[ep_index][1], distance))
        if distance <= 0.05:
            logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                        distance,
                        graph.entity_pairs[ep_index] in graph.true_pairs)

    #Write Embeddings to file
    export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                      ent_embeddings)
    export_result_prob(Cora, 'er', str(model), 'TransE.Household',
                       graph.entity, result_prob, graph.true_pairs)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return max_fscore
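#Illustrative sketch (not part of the original test suite): test_cora and
#the household tests below rely on the Hungarian algorithm, via
#scipy.optimize.linear_sum_assignment, to find the minimum-cost one-to-one
#alignment between two groups of entities. A toy demonstration with
#made-up cost values:
def _sketch_group_alignment_cost():
    cost = np.array([[0.1, 0.9],
                     [0.8, 0.2]])
    #The Hungarian algorithm picks (0, 0) and (1, 1), the cheapest perfect
    #matching, so the total alignment cost is 0.1 + 0.2 = 0.3.
    row_ind, col_ind = linear_sum_assignment(cost)
    return cost[row_ind, col_ind].sum()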
def test_febrl(self, params=None):
    if not params:
        params = self.get_default_params()

    #Load Graph Data
    graph = Graph_ER(FEBRL)
    model = FEBRL()
    logger = get_logger('RL.Test.TransE.Household.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = transe.get_ent_embeddings()

    #Experimenting with household matching: align all records sharing the
    #candidate pair's postcodes and add the pair's own matching cost.
    postcode_rel_id = graph.relation.index("postcode")
    result_prob = []
    for ep_index in range(0, len(graph.entity_pairs)):
        person_A = graph.entity_pairs[ep_index][0]
        person_B = graph.entity_pairs[ep_index][1]

        postcode_A = [
            t for (h, t, r) in graph.triples
            if h == person_A and r == postcode_rel_id
        ][0]
        neighbours_A = [h for (h, t, r) in graph.triples if t == postcode_A]
        postcode_B = [
            t for (h, t, r) in graph.triples
            if h == person_B and r == postcode_rel_id
        ][0]
        neighbours_B = [h for (h, t, r) in graph.triples if t == postcode_B]

        cost_matrix = np.zeros(shape=(len(neighbours_A), len(neighbours_B)))
        for i in range(len(neighbours_A)):
            for j in range(len(neighbours_B)):
                if neighbours_A[i] == neighbours_B[j]:
                    #Penalise aligning a record with itself
                    cost_matrix[i][j] = 100
                else:
                    cost_matrix[i][j] = abs(
                        spatial.distance.cosine(
                            ent_embeddings[neighbours_A[i]],
                            ent_embeddings[neighbours_B[j]]))

        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        person_A_index = neighbours_A.index(person_A)
        person_B_index = neighbours_B.index(person_B)
        distance = cost_matrix[row_ind, col_ind].sum() + \
            cost_matrix[person_A_index][person_B_index]
        result_prob.append((person_A, person_B, distance))

    export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                      ent_embeddings)
    export_result_prob(FEBRL, 'er', str(model), 'TransE.Household',
                       graph.entity, result_prob, graph.true_pairs)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return max_fscore
def test_census(self, params=None):
    if not params:
        params = self.get_default_params()

    #Load Graph Data
    graph = Graph_ER(Census)
    model = Census()
    logger = get_logger('RL.Test.TransE.Household.' + str(model))

    transe = TransE(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])
    loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f", loss)

    ent_embeddings = transe.get_ent_embeddings()

    #Experimenting with household matching: align the two households with
    #the Hungarian algorithm; triples with relation id > 6 link a person
    #to its household entity (dataset-specific convention).
    result_prob = []
    for ep_index in range(0, len(graph.entity_pairs)):
        household_A = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][0] and r > 6
        ][0]
        family_members_A = [
            h for (h, t, r) in graph.triples if t == household_A
        ]
        household_B = [
            t for (h, t, r) in graph.triples
            if h == graph.entity_pairs[ep_index][1] and r > 6
        ][0]
        family_members_B = [
            h for (h, t, r) in graph.triples if t == household_B
        ]

        cost_matrix = np.zeros(shape=(len(family_members_A),
                                      len(family_members_B)))
        for i in range(len(family_members_A)):
            for j in range(len(family_members_B)):
                cost_matrix[i][j] = abs(
                    spatial.distance.cosine(
                        ent_embeddings[family_members_A[i]],
                        ent_embeddings[family_members_B[j]]))

        row_ind, col_ind = linear_sum_assignment(cost_matrix)
        eA_index = family_members_A.index(graph.entity_pairs[ep_index][0])
        eB_index = family_members_B.index(graph.entity_pairs[ep_index][1])
        rowA = np.where(row_ind == eA_index)[0]
        if len(rowA) and col_ind[rowA[0]] == eB_index:
            #Pair is part of the minimum-cost assignment
            distance = cost_matrix[row_ind, col_ind].sum()
        else:
            distance = cost_matrix[row_ind, col_ind].sum() + abs(
                spatial.distance.cosine(
                    ent_embeddings[graph.entity_pairs[ep_index][0]],
                    ent_embeddings[graph.entity_pairs[ep_index][1]]))
        result_prob.append((graph.entity_pairs[ep_index][0],
                            graph.entity_pairs[ep_index][1], distance))
        if ep_index % 1000 == 0:
            logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                        distance,
                        graph.entity_pairs[ep_index] in graph.true_pairs)

    #Normalize distance by a fixed constant
    max_distance = 10
    result_prob = [(r[0], r[1], (r[2] / max_distance)) for r in result_prob]
    for r in result_prob[:100]:
        logger.info("distance: %f true_pairs: %s", r[2],
                    (r[0], r[1]) in graph.true_pairs)

    export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                      ent_embeddings)
    export_result_prob(Census, 'er', str(model), 'TransE.Household',
                       graph.entity, result_prob, graph.true_pairs)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, graph.true_pairs)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, graph.true_pairs,
                            len(graph.entity_pairs), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
    ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()
    return max_fscore
def _test_rl_transe(self, model, field_relation_map, params):
    dataset = model()
    graph = Graph_VEG(model)
    logger = get_logger("RL.Test.RLTransE." + str(dataset))
    logger.info("values for name : %s",
                str(graph.relation_value_map[graph.relation[1]][:10]))
    logger.info("relation: %s", str(graph.relation))
    logger.info("train_triples: %s", str(graph.train_triples[:10]))
    logger.info("set train_triples size %d", len(set(graph.train_triples)))

    transe = RLTransE(graph,
                      dimension=params['dimension'],
                      learning_rate=params['learning_rate'],
                      margin=params['margin'],
                      regularizer_scale=params['regularizer_scale'],
                      batchSize=params['batchSize'],
                      neg_rate=params['neg_rate'],
                      neg_rel_rate=params['neg_rel_rate'])
    loss, val_loss = transe.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f val_loss: %f", loss,
                val_loss)

    value_embeddings = transe.get_val_embeddings()
    relation_embeddings = transe.get_rel_embeddings()

    #Score each test pair field by field: equal values cost 0, values
    #missing from the graph cost 1, otherwise use the TransE translation
    #distance cosine(v_a + r, v_b).
    result_prob = []
    distance_distribution = []
    missing_values = []
    for (a, b) in dataset.test_links:
        row_a = dataset.testDataA.loc[a]
        row_b = dataset.testDataB.loc[b]

        distance = 0
        dd = []
        for f in field_relation_map:
            val_a = row_a[f]
            val_b = row_b[f]
            if val_a == val_b:
                dd.append(0)
            else:
                rel = field_relation_map[f]
                try:
                    val_index_a = graph.relation_value_map[rel].index(val_a)
                except ValueError:
                    missing_values.append(val_a)
                    distance = distance + 1
                    dd.append(1)
                    continue
                try:
                    val_index_b = graph.relation_value_map[rel].index(val_b)
                except ValueError:
                    missing_values.append(val_b)
                    distance = distance + 1
                    dd.append(1)
                    continue

                rel_index = graph.relation.index(field_relation_map[f])
                cur_distance = abs(
                    spatial.distance.cosine(
                        value_embeddings[rel][val_index_a] +
                        relation_embeddings[rel_index],
                        value_embeddings[rel][val_index_b]))
                distance = distance + cur_distance
                dd.append(cur_distance)

        result_prob.append((a, b, distance))
        distance_distribution.append((a, b, dd, distance))

    logger.info("No. of missing values: %d", len(missing_values))
    logger.info("Unique No. of missing values: %d",
                len(set(missing_values)))

    try:
        entities = ["value\trelation"]
        for r in graph.relation_value_map:
            for v in graph.relation_value_map[r]:
                entities.append("\t".join([v, r]))
        embeddings = []
        for rel in value_embeddings:
            val_count = len(graph.relation_value_map[rel])
            embeddings.extend(value_embeddings[rel][:val_count])

        #Write Embeddings to file
        export_embeddings('veg', str(dataset), 'RLTransE_val', entities,
                          embeddings)
        export_embeddings('veg', str(dataset), 'RLTransE_rel',
                          graph.relation, relation_embeddings)
    except Exception as e:
        logger.error("Failed to export embeddings")
        logger.error(e)

    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=3.0, step=0.02)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             dataset.true_test_links)
    precision_at_1 = ir_metrics.log_metrics(logger, params)

    transe.close_tf_session()

    #Export False Positives and result probabilities
    get_entity_name = lambda d, i: "_".join([
        str(d.iloc[i][dataset.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.iloc[i][dataset.field_map[CensusFields.DNI]])
    ])
    get_entity_name_loc = lambda d, i: "_".join([
        str(d.loc[i][dataset.field_map[CensusFields.ID_INDIVIDUAL]]),
        str(d.loc[i][dataset.field_map[CensusFields.DNI]])
    ])
    entitiesA = [
        get_entity_name(dataset.testDataA, i)
        for i in range(int(dataset.testDataA.shape[0]))
    ]
    entitiesB = [
        get_entity_name(dataset.testDataB, i)
        for i in range(int(dataset.testDataB.shape[0]))
    ]
    result_prob = [
        (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
         entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))), p)
        for (a, b, p) in result_prob
    ]
    true_links = [
        (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
         entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))))
        for (a, b) in dataset.true_test_links
    ]
    export_result_prob(Census, 'veg', 'census', 'rltranse', entitiesA,
                       result_prob, true_links, entitiesB)

    distance_distribution = [
        (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(e1))),
         entitiesB.index(get_entity_name_loc(dataset.testDataB, int(e2))),
         [str("%.2f" % (float(w))) for w in dd], 1 - d)
        for (e1, e2, dd, d) in distance_distribution if (e1, e2) in result
    ]
    export_human_readable_results(Census, 'veg', 'census', 'rltranse',
                                  entitiesA, distance_distribution,
                                  entitiesB)

    result = [
        (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
         entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))))
        for (a, b) in result
    ]
    export_false_negatives(Census, 'veg', 'census', 'rltranse', entitiesA,
                           result_prob, true_links, result, entitiesB)
    export_false_positives(Census, 'veg', 'census', 'rltranse', entitiesA,
                           result_prob, true_links, result, entitiesB)

    return (max_fscore, precision_at_1)
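#Illustrative sketch (not part of the original test suite): the per-field
#distance above is the TransE translation check. For a triple
#(value_a, relation, value_b) the model is trained so that v_a + r lands
#close to v_b, hence cosine(v_a + r, v_b) is small for matching records.
#Toy vectors (made-up numbers) where the translation holds exactly:
def _sketch_rltranse_field_distance():
    v_a = np.array([0.2, 0.5])  #embedding of the field value in record A
    r = np.array([0.4, 0.4])    #embedding of the field's relation
    v_b = np.array([0.6, 0.9])  #embedding of the field value in record B
    #v_a + r equals v_b here, so the cosine distance is 0.0
    return abs(spatial.distance.cosine(v_a + r, v_b))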
def test_veer(self):
    logger = get_logger('RL.Test.VEER.Census')
    dataset = Census()

    #Columns of interest for the Sant Feliu town
    columns = [
        'Noms_harmo', 'cognom_1', 'cohort', 'estat_civil', 'parentesc_har',
        'ocupacio_hisco'
    ]
    params = {
        'learning_rate': 0.1,
        'margin': 0.1,
        'dimension': 32,
        'epochs': 50,
        'regularizer_scale': 0.1,
        'batchSize': 512
    }
    veer = VEER(Census,
                columns,
                dimension=params['dimension'],
                learning_rate=params['learning_rate'],
                margin=params['margin'],
                regularizer_scale=params['regularizer_scale'],
                batchSize=params['batchSize'])

    #Train Model
    loss, val_loss = veer.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                val_loss)

    #Test Model
    result_prob, accuracy = veer.test()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(veer.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))
    logger.info("Sample embeddings: %s", str(veer.get_val_embeddings()[0]))

    #Compute Performance measures
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    ir_metrics = InformationRetrievalMetrics(result_prob,
                                             dataset.true_test_links)
    precision_at_1 = ir_metrics.log_metrics(logger, params)

    #Export embeddings
    embeddings = veer.get_val_embeddings()
    export_embeddings('veg', 'census', 'veer', veer.values, embeddings)

    #Per-column distance breakdown for every predicted match
    result_feature_mapping = [
        (e1, e2, [
            str(
                abs(
                    spatial.distance.cosine(
                        embeddings[veer.values.index(
                            veer._clean(dataset.testDataA.loc[e1][c]))],
                        embeddings[veer.values.index(
                            veer._clean(dataset.testDataB.loc[e2][c]))])))
            for c in columns
        ], d) for (e1, e2, d) in result_prob if (e1, e2) in result
    ]

    #Write Result Prob to file: remap dataframe row ids to positional indices
    entitiesA = dataset.get_entity_names(dataset.testDataA)
    entitiesB = dataset.get_entity_names(dataset.testDataB)
    index_dictA = {
        str(dataset.testDataA.iloc[i]._name): i
        for i in range(dataset.testDataA.shape[0])
    }
    index_dictB = {
        str(dataset.testDataB.iloc[i]._name): i
        for i in range(dataset.testDataB.shape[0])
    }
    result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p)
                   for (a, b, p) in result_prob]
    true_links = [(index_dictA[str(a)], index_dictB[str(b)])
                  for (a, b) in dataset.true_test_links]
    result = [(index_dictA[str(a)], index_dictB[str(b)]) for (a, b) in result]
    export_result_prob(dataset, 'veg', str(dataset), 'VEER', entitiesA,
                       result_prob, true_links, entitiesB)
    export_false_negatives(Census, 'veg', str(dataset), 'VEER', entitiesA,
                           result_prob, true_links, result, entitiesB)
    export_false_positives(Census, 'veg', str(dataset), 'VEER', entitiesA,
                           result_prob, true_links, result, entitiesB)

    result_feature_mapping = [(index_dictA[str(a)], index_dictB[str(b)], w, p)
                              for (a, b, w, p) in result_feature_mapping]
    export_human_readable_results(Census, 'veg', str(dataset), 'VEER',
                                  entitiesA, result_feature_mapping,
                                  entitiesB)
    veer.close_tf_session()
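#Illustrative sketch (not part of the original test suite): test_veer
#attaches a per-column cosine-distance breakdown to each predicted match
#before the human-readable export. A minimal version of that idea, where
#'emb' and 'vocab' stand in for VEER's value embeddings and value list:
def _sketch_column_distances(row_a, row_b, columns, emb, vocab):
    """Return one cosine distance per column for a candidate record pair."""
    return [
        abs(spatial.distance.cosine(emb[vocab.index(row_a[c])],
                                    emb[vocab.index(row_b[c])]))
        for c in columns
    ]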
def _test_werl(self, model, columns, params):
    #Load Graph Data
    dataset = model()
    logger = get_logger('RL.Test.WERL.' + str(dataset))
    ea_params = self.get_optimal_ea_params(model, params['ea_method'])

    if params['ea_method'] in [TransE, TransH]:
        #ER methods: train TransE/TransH embedding vectors
        graph = Graph_ER(model)
        transe = params['ea_method'](
            graph,
            dimension=ea_params['dimension'],
            learning_rate=ea_params['learning_rate'],
            margin=ea_params['margin'],
            regularizer_scale=ea_params['regularizer_scale'],
            batchSize=ea_params['batchSize'],
            neg_rate=ea_params['neg_rate'],
            neg_rel_rate=ea_params['neg_rel_rate'])
        try:
            transe.restore_model(self._get_tf_model_filename(dataset, transe))
        except Exception as e:
            logger.error(e)
            loss = transe.train(max_epochs=ea_params['epochs'])
            logger.info("Training Complete with loss: %f", loss)
            transe.save_model(self._get_tf_model_filename(dataset, transe))

        ent_embeddings = transe.get_ent_embeddings()
        rel_embeddings = None
        entity = graph.entity
        transe.close_tf_session()
    elif params['ea_method'] in [RLTransE]:
        #VEG methods: train RLTransE embedding vectors
        graph = Graph_VEG(model)
        rltranse = params['ea_method'](
            graph,
            dimension=ea_params['dimension'],
            learning_rate=ea_params['learning_rate'],
            margin=ea_params['margin'],
            regularizer_scale=ea_params['regularizer_scale'],
            batchSize=ea_params['batchSize'],
            neg_rate=ea_params['neg_rate'],
            neg_rel_rate=ea_params['neg_rel_rate'])
        try:
            rltranse.restore_model(
                self._get_tf_model_filename(dataset, rltranse))
        except Exception as e:
            logger.error(e)
            loss, val_loss = rltranse.train(max_epochs=ea_params['epochs'])
            logger.info("Training Complete with loss: %f", loss)
            rltranse.save_model(
                self._get_tf_model_filename(dataset, rltranse))

        val_embeddings = rltranse.get_val_embeddings()
        rel_embeddings = rltranse.get_rel_embeddings()

        if model == Census:
            #Hack: the Census VEG graph has 8 relations but only 6 are
            #needed; drop the same_as and surname2 embeddings.
            rel_embeddings = np.append(rel_embeddings[1:3],
                                       rel_embeddings[4:],
                                       axis=0)

        ent_embeddings = []
        entity = []
        for rel in val_embeddings:
            val_count = len(graph.relation_value_map[rel])
            entity.extend(graph.relation_value_map[rel])
            ent_embeddings.extend(val_embeddings[rel][:val_count])
        assert len(ent_embeddings) == len(entity)
        rltranse.close_tf_session()
    elif params['ea_method'] in [VEER]:
        veer = VEER(model,
                    columns,
                    dimension=ea_params['dimension'],
                    learning_rate=ea_params['learning_rate'],
                    margin=ea_params['margin'],
                    regularizer_scale=ea_params['regularizer_scale'],
                    batchSize=ea_params['batchSize'])
        try:
            veer.restore_model(self._get_tf_model_filename(dataset, veer))
        except Exception as e:
            logger.error(e)
            #Train Model
            loss, val_loss = veer.train(max_epochs=ea_params['epochs'])
            logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                        val_loss)
            veer.save_model(self._get_tf_model_filename(dataset, veer))

        ent_embeddings = veer.get_val_embeddings()
        rel_embeddings = None
        entity = veer.get_values()
        veer.close_tf_session()
    else:
        raise Exception("Unknown Entity Alignment method")

    #Train WERL weights on top of the (frozen) alignment embeddings
    werl = WERL(model,
                columns,
                entity,
                ent_embeddings,
                rel_embeddings,
                learning_rate=params['learning_rate'],
                margin=params['margin'],
                regularizer_scale=params['regularizer_scale'],
                batchSize=params['batchSize'])
    loss, val_loss = werl.train(max_epochs=params['epochs'])
    logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                val_loss)

    #Test WERL Model
    result_prob, accuracy = werl.test()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(werl.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))

    #Compute Performance measures
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Test MERL Model
    logger = get_logger('RL.Test.MERL.' + str(dataset))
    result_prob, accuracy = werl.test_merl()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(werl.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))

    #Compute Performance measures
    optimal_threshold, max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    #ir_metrics = InformationRetrievalMetrics(result_prob,
    #                                         dataset.true_test_links)
    #precision_at_1 = ir_metrics.log_metrics(logger, params)
    precision_at_1 = None

    #Test Without Weights = Mean Embedding for Record Linkage
    logger = get_logger('RL.Test.NoWT.' + str(dataset))
    result_prob, accuracy = werl.test_without_weight()
    logger.info("Predict count: %d", len(result_prob))
    logger.info(
        "Sample Prob: %s",
        str([(c, (a, b) in dataset.true_test_links)
             for (a, b, c) in result_prob[:20]]))
    logger.info("Column Weights: %s", str(werl.get_col_weights()))
    logger.info("Accuracy: %s", str(accuracy))

    #Compute Performance measures
    optimal_threshold, nowt_max_fscore = get_optimal_threshold(
        result_prob, dataset.true_test_links, max_threshold=2.0)
    try:
        params['threshold'] = optimal_threshold
        result = pd.MultiIndex.from_tuples([(e1, e2)
                                            for (e1, e2, d) in result_prob
                                            if d <= optimal_threshold])
        log_quality_results(logger, result, dataset.true_test_links,
                            len(dataset.test_links), params)
    except Exception as e:
        logger.info("Zero Results")
        logger.error(e)

    #Log MAP, MRR and Hits@K
    #ir_metrics = InformationRetrievalMetrics(result_prob,
    #                                         dataset.true_test_links)
    #nowt_precision_at_1 = ir_metrics.log_metrics(logger, params)

    werl.close_tf_session()
    return (max_fscore, precision_at_1)
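#Illustrative sketch (not part of the original test suite): a hypothetical
#invocation of _test_werl, assuming it lives in a test class (here called
#TestWERL, an assumed name) and reusing the Census column names and the
#parameter keys seen above. The hyper-parameter values are placeholders,
#not tuned settings.
#
#    params = {
#        'ea_method': TransE,
#        'learning_rate': 0.1,
#        'margin': 1,
#        'regularizer_scale': 0.1,
#        'batchSize': 512,
#        'epochs': 100,
#    }
#    columns = ['Noms_harmo', 'cognom_1', 'cohort', 'estat_civil',
#               'parentesc_har', 'ocupacio_hisco']
#    max_fscore, precision_at_1 = TestWERL()._test_werl(Census, columns,
#                                                       params)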