def test_census(self):
        """Cluster Census candidate pairs with K-Means and log match quality.

        The classifier is fitted (unsupervised) on comparison features of the
        training candidate links. Its predictions are scored against the known
        true links, first on the training pairs and then on the test pairs.
        """
        log = get_logger('RL.Test.KmeansClustering.CENSUS')

        dataset = Census()

        # Comparison features for the training candidate pairs.
        train_features = dataset.get_comparision_object().compute(
            dataset.candidate_links, dataset.trainDataA, dataset.trainDataB)
        log.info("Features %s", str(train_features.describe()))

        # Fit the K-Means classifier on the training features only.
        kmeans = recordlinkage.KMeansClassifier(algorithm='full',
                                                max_iter=1000,
                                                random_state=42)
        kmeans.fit(train_features)

        train_matches = kmeans.predict(train_features)
        log_quality_results(log, train_matches, dataset.true_links,
                            len(dataset.candidate_links))

        # Re-use the fitted model on the test split.
        test_features = dataset.get_comparision_object().compute(
            dataset.test_links, dataset.testDataA, dataset.testDataB)
        log.info("Features %s", str(test_features.describe()))

        test_matches = kmeans.predict(test_features)
        log_quality_results(log, test_matches, dataset.true_test_links,
                            len(dataset.test_links))
    def test_febrl(self):
        """Cluster FEBRL candidate pairs with K-Means and log match quality.

        A default ``KMeansClassifier`` is fitted on the training comparison
        features, then evaluated on the training pairs and on the test pairs.
        """
        log = get_logger('RL.Test.KmeansClustering.FEBRL')

        dataset = FEBRL()

        # Training features.
        train_features = dataset.get_comparision_object().compute(
            dataset.candidate_links, dataset.trainDataA, dataset.trainDataB)
        log.info("Features %s", str(train_features.describe()))

        # Unsupervised fit: no true links are passed to the classifier.
        kmeans = recordlinkage.KMeansClassifier()
        kmeans.fit(train_features)

        train_matches = kmeans.predict(train_features)
        log_quality_results(log, train_matches, dataset.true_links,
                            len(dataset.candidate_links))

        # Test features, scored with the already fitted classifier.
        test_features = dataset.get_comparision_object().compute(
            dataset.test_links, dataset.testDataA, dataset.testDataB)
        log.info("Features %s", str(test_features.describe()))

        test_matches = kmeans.predict(test_features)
        log_quality_results(log, test_matches, dataset.true_test_links,
                            len(dataset.test_links))
    def test_cora(self):
        """Cluster Cora candidate pairs with K-Means and log match quality.

        Features are extracted for the training candidate links, a default
        ``KMeansClassifier`` is fitted on them, and the predicted links are
        compared with the true links for the training and the test split.
        """
        log = get_logger('RL.Test.KmeansClustering.CORA')

        # Loads the train/test data for both sides of the linkage.
        dataset = Cora()

        # Extract features for the training pairs.
        train_features = dataset.get_comparision_object().compute(
            dataset.candidate_links, dataset.trainDataA, dataset.trainDataB)
        log.info("Features %s", str(train_features.describe()))

        # Train the K-Means classifier.
        kmeans = recordlinkage.KMeansClassifier()
        kmeans.fit(train_features)

        train_matches = kmeans.predict(train_features)
        log_quality_results(log, train_matches, dataset.true_links,
                            len(dataset.candidate_links))

        # Evaluate on the test pairs.
        test_features = dataset.get_comparision_object().compute(
            dataset.test_links, dataset.testDataA, dataset.testDataB)
        log.info("Features %s", str(test_features.describe()))

        test_matches = kmeans.predict(test_features)
        log_quality_results(log, test_matches, dataset.true_test_links,
                            len(dataset.test_links))
    def _test_transh(self, dataset, params):
        """Train TransH on the ER graph of *dataset* and evaluate pair alignment.

        Each candidate pair in ``graph.entity_pairs`` is scored with the
        absolute cosine distance between the two entity embeddings. The
        embeddings and scores are exported, the threshold returned by
        ``get_optimal_threshold`` is used to select predicted links, and
        quality / ranking metrics are logged.

        Args:
            dataset: Dataset class. It is passed to ``Graph_ER`` and also
                called with no arguments to obtain the model used for naming.
            params: Dict read for the keys 'dimension', 'learning_rate',
                'margin', 'regularizer_scale', 'batchSize', 'neg_rate',
                'neg_rel_rate' and 'epochs'. The key 'threshold' is written
                with the selected threshold (the caller's dict is mutated).

        Returns:
            Tuple ``(max_fscore, p_at_1)`` where ``p_at_1`` is whatever
            ``InformationRetrievalMetrics.log_metrics`` returns.
        """
        graph = Graph_ER(dataset)
        model = dataset()
        logger = get_logger('RL.Test.er.TransH.' + str(model))

        transh = TransH(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transh.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transh.get_ent_embeddings()

        # One (entity_1, entity_2, distance) triple per candidate pair;
        # a smaller distance means the pair is more likely a match.
        result_prob = [
            (e1, e2,
             abs(spatial.distance.cosine(ent_embeddings[e1],
                                         ent_embeddings[e2])))
            for (e1, e2) in graph.entity_pairs
        ]

        # Write embeddings and pair scores to file.
        export_embeddings('er', str(model), 'TransH', graph.entity,
                          ent_embeddings)
        export_result_prob(dataset, 'er', str(model), 'TransH', graph.entity,
                           result_prob, graph.true_pairs)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)
        logger.info("MAX FSCORE: %f AT : %f", max_fscore, optimal_threshold)

        try:
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            params['threshold'] = optimal_threshold
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
            export_false_negatives(dataset, 'er', str(model), 'TransH',
                                   graph.entity, result_prob, graph.true_pairs,
                                   result, graph.entity)
            export_false_positives(dataset, 'er', str(model), 'TransH',
                                   graph.entity, result_prob, graph.true_pairs,
                                   result, graph.entity)
        except Exception as e:
            # Best effort: an empty selection makes MultiIndex.from_tuples
            # fail. Log the cause instead of hiding it, and do not swallow
            # KeyboardInterrupt/SystemExit as a bare ``except`` would.
            logger.info("Zero Reults")
            logger.error(e)

        # Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        p_at_1 = ir_metrics.log_metrics(logger, params)

        transh.close_tf_session()
        return (max_fscore, p_at_1)
Exemple #5
0
    def _test_rl_transe(self, dataset, params):
        """Train TransE and score pairs by entity plus attribute distances.

        For every candidate pair ``(a, b)`` the score is the absolute cosine
        distance between the embeddings of ``a`` and ``b``, plus, for every
        triple of ``a``, the distance between its tail and the tail of the
        first triple of ``b`` that has the same relation (if any).

        Args:
            dataset: Dataset class. It is passed to ``Graph_ER`` and also
                called with no arguments to obtain the model used for naming.
            params: Dict read for the keys 'dimension', 'learning_rate',
                'margin', 'regularizer_scale', 'batchSize', 'neg_rate',
                'neg_rel_rate' and 'epochs'. The key 'threshold' is written
                with the selected threshold (the caller's dict is mutated).

        Returns:
            Tuple ``(max_fscore, precision_at_1)`` where the second item is
            whatever ``InformationRetrievalMetrics.log_metrics`` returns.
        """
        # Load graph data
        graph = Graph_ER(dataset)
        model = dataset()
        logger = get_logger('RL.Test.er.RLTransE.' + str(model))

        transe = TransE(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transe.get_ent_embeddings()

        # Index (tail, relation) by head once, in original triple order, so
        # the pair loop does not rescan every triple for every pair.
        triples_by_head = {}
        for (h, t, r) in graph.triples:
            triples_by_head.setdefault(h, []).append((t, r))

        result_prob = []
        for (a, b) in graph.entity_pairs:
            b_triples = triples_by_head.get(b, [])

            distance = abs(
                spatial.distance.cosine(ent_embeddings[a], ent_embeddings[b]))
            for (a_tail, a_rel) in triples_by_head.get(a, []):
                # Only the first tail of b under the same relation is used.
                b_tail = next((t for (t, r) in b_triples if r == a_rel), None)
                if b_tail is not None:
                    distance = distance + abs(
                        spatial.distance.cosine(ent_embeddings[a_tail],
                                                ent_embeddings[b_tail]))
            result_prob.append((a, b, distance))

        # Write embeddings and pair scores to file.
        export_embeddings('er', str(model), 'RLTransE', graph.entity,
                          ent_embeddings)
        export_result_prob(dataset, 'er', str(model), 'RLTransE', graph.entity,
                           result_prob, graph.true_pairs)
        # The score is a sum of several distances, hence the larger search
        # range for the threshold.
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs, max_threshold=3.0)

        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception as e:
            # Best effort: log the cause instead of hiding it, and do not
            # swallow KeyboardInterrupt/SystemExit as a bare ``except`` would.
            logger.info("Zero Reults")
            logger.error(e)

        # Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        precison_at_1 = ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
        return (max_fscore, precison_at_1)
Exemple #6
0
    def _test_etranse(self, dataset, params):
        """Train ETransE on the two-graph (ERER) model and evaluate alignment.

        Candidate pairs are scored with the absolute cosine distance between
        the embedding of the first entity (graph A) and the embedding of the
        second entity (graph B). Embeddings and scores are exported, and
        quality and ranking metrics are logged.

        Args:
            dataset: Dataset class. It is called with no arguments for the
                model name and passed to ``Graph_ERER``.
            params: Dict read for 'dimension', 'batchSize', 'learning_rate',
                'margin', 'neg_rate', 'neg_rel_rate', 'regularizer_scale',
                'alpha', 'beta' and 'max_epochs'. 'threshold' is written.

        Returns:
            Tuple ``(max_fscore, prec_at_1)`` where the second item is the
            value returned by ``InformationRetrievalMetrics.log_metrics``.
        """
        model = dataset()
        graph = Graph_ERER(dataset)
        logger = get_logger("RL.Test.erer.ETransE." + str(model))

        hyper_keys = ('dimension', 'batchSize', 'learning_rate', 'margin',
                      'neg_rate', 'neg_rel_rate', 'regularizer_scale',
                      'alpha', 'beta')
        etranse = ETransE(graph, **{key: params[key] for key in hyper_keys})
        etranse.train(max_epochs=params['max_epochs'])
        emb_a = etranse.get_ent_embeddings_A()
        emb_b = etranse.get_ent_embeddings_B()

        result_prob = []
        for pair in graph.entity_pairs:
            # Ids are cast to int only for the embedding lookup; the stored
            # pair keeps the original id values.
            vec_a = emb_a[int(pair[0])]
            vec_b = emb_b[int(pair[1])]
            score = abs(spatial.distance.cosine(vec_a, vec_b))
            result_prob.append((pair[0], pair[1], score))

        # Write embeddings of both graphs and the pair scores to file.
        export_embeddings('erer', str(model), 'ETransE', graph.entityA, emb_a)
        export_embeddings('erer', str(model), 'ETransE', graph.entityB, emb_b)
        export_result_prob(dataset, 'erer', str(model), 'ETransE',
                           graph.entityA, result_prob, graph.true_pairs,
                           graph.entityB)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)

        try:
            params['threshold'] = optimal_threshold
            selected = [(e1, e2) for (e1, e2, d) in result_prob
                        if d <= optimal_threshold]
            result = pd.MultiIndex.from_tuples(selected)
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception as e:
            logger.info("Zero Reults")
            logger.error(e)

        # Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        prec_at_1 = ir_metrics.log_metrics(logger, params)

        etranse.close_tf_session()
        return (max_fscore, prec_at_1)
Exemple #7
0
    def _test_erer(self, dataset, er_algo, params):
        """Run an ER embedding algorithm on the ER view of an ERER graph.

        The ``Graph_ERER`` of *dataset* is converted with ``get_er_model()``
        and *er_algo* is trained on it. Candidate pairs are scored with the
        absolute cosine distance between their entity embeddings.

        Args:
            dataset: Dataset class. It is called with no arguments for the
                model name and passed to ``Graph_ERER``.
            er_algo: Callable that builds the embedding model. It is called
                as ``er_algo(graph_er, dimension=..., ...)``.
            params: Dict read for 'dimension', 'learning_rate', 'margin',
                'regularizer_scale', 'batchSize', 'neg_rate', 'neg_rel_rate'
                and 'epochs'. 'threshold' is written (caller's dict mutated).

        Returns:
            The maximum F-score returned by ``get_optimal_threshold``.
        """
        model = dataset()
        graph = Graph_ERER(dataset)
        graph_er = graph.get_er_model()

        er_model = er_algo(graph_er,
                           dimension=params['dimension'],
                           learning_rate=params['learning_rate'],
                           margin=params['margin'],
                           regularizer_scale=params['regularizer_scale'],
                           batchSize=params['batchSize'],
                           neg_rate=params['neg_rate'],
                           neg_rel_rate=params['neg_rel_rate'])
        loss = er_model.train(max_epochs=params['epochs'])

        # The logger name includes the algorithm, so it is created only
        # after the model instance exists.
        logger = get_logger('RL.Test.erer.ERER.' + str(model) + "." +
                            str(er_model))
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = er_model.get_ent_embeddings()

        # One (entity_1, entity_2, distance) triple per candidate pair.
        result_prob = [
            (pair[0], pair[1],
             abs(spatial.distance.cosine(ent_embeddings[pair[0]],
                                         ent_embeddings[pair[1]])))
            for pair in graph_er.entity_pairs
        ]

        # Write embeddings and pair scores to file.
        export_embeddings("erer", str(model), str(er_model), graph_er.entity,
                          ent_embeddings)
        export_result_prob(dataset, 'erer', str(model), str(er_model),
                           graph_er.entity, result_prob, graph_er.true_pairs)

        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph_er.true_pairs)

        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph_er.true_pairs,
                                len(graph_er.entity_pairs), params)
        except Exception as e:
            # Best effort: log the cause instead of hiding it, and do not
            # swallow KeyboardInterrupt/SystemExit as a bare ``except`` would.
            logger.info("Zero Reults")
            logger.error(e)

        # Log MAP, MRR and Hits@K
        # NOTE(review): unlike the sibling tests, params is not passed to
        # log_metrics here - confirm whether that is intentional.
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 graph_er.true_pairs)
        ir_metrics.log_metrics(logger)

        er_model.close_tf_session()
        return max_fscore
Exemple #8
0
    def _test_logistic_transh(self, dataset, params):
        """Classify entity pairs with logistic regression on TransH embeddings.

        TransH embeddings are trained, each embedding dimension is compared
        with a 'gauss' numeric comparison, and a supervised
        ``LogisticRegressionClassifier`` is fitted on the resulting features.

        Note: Zero aligned pairs are returned, require fixation.

        Args:
            dataset: Dataset class whose instance provides ``get_er_model()``.
            params: Dict read for 'dimension', 'learning_rate', 'margin',
                'regularizer_scale', 'batchSize' and 'epochs'.
        """
        model = dataset()
        logger = get_logger('RL.Test.LogisticTransH.' + str(model))
        (entity, relation, triples, entity_pairs,
         true_pairs) = model.get_er_model()

        # NOTE(review): the TF session is never closed here, unlike the other
        # embedding tests - confirm whether this TransH variant needs it.
        transh = TransH(entity,
                        relation,
                        triples,
                        entity_pairs,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'])
        loss = transh.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        raw_embeddings = transh.get_ent_embeddings()
        embedding_rows = [
            np.array(raw_embeddings[row])
            for row in range(raw_embeddings.shape[0])
        ]
        # Both sides of the linkage are the same entity table.
        trainDataA = pd.DataFrame(data=embedding_rows)
        trainDataB = pd.DataFrame(data=embedding_rows)

        # One numeric comparison per embedding dimension.
        compare_cl = recordlinkage.Compare()
        for dim in range(0, params['dimension']):
            compare_cl.numeric(dim, dim, label=str(dim), method='gauss')

        candidate_links = pd.MultiIndex.from_tuples(entity_pairs)
        features = compare_cl.compute(candidate_links, trainDataA, trainDataB)
        logger.info("Features %s", str(features.describe()))

        classifier = recordlinkage.LogisticRegressionClassifier()
        classifier.fit(features, true_pairs)

        predicted = classifier.predict(features)
        log_quality_results(logger, predicted, true_pairs, len(entity_pairs))

        # The ranking metrics take a distance-like score, so the match
        # probability is inverted.
        inverted = [(1 - p) for p in classifier.prob(features).tolist()]
        result_prob = [(entity_pairs[idx][0], entity_pairs[idx][1], score)
                       for idx, score in enumerate(inverted)]
        ir_metrics = InformationRetrievalMetrics(result_prob, true_pairs)
        ir_metrics.log_metrics(logger, params)
    def _test_seea(self, dataset, params):
        """Run SEEA iterations on the EAR graph and evaluate the alignment.

        The pairs returned by ``seea_iterate`` are scored against the true
        pairs. In addition, every candidate pair is scored with the absolute
        cosine distance of its entity embeddings for export and for the
        ranking metrics.

        Args:
            dataset: Dataset class. It is called with no arguments for the
                model name and passed to ``Graph_EAR``.
            params: Dict read for 'dimension', 'learning_rate', 'batchSize',
                'margin', 'regularizer_scale', 'neg_rate', 'neg_rel_rate',
                'beta', 'max_iter' and 'max_epochs'.

        Returns:
            Tuple ``(fscore, prec_at_1)``. ``fscore`` is the value returned
            by ``log_quality_results``, or 0.0 when no aligned pairs could be
            evaluated. ``prec_at_1`` is the value returned by
            ``InformationRetrievalMetrics.log_metrics``.
        """
        model = dataset()
        graph = Graph_EAR(dataset)
        logger = get_logger('RL.Test.ear.SEEA.' + str(model))

        seea = SEEA(graph,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    batchSize=params['batchSize'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    neg_rate=params['neg_rate'],
                    neg_rel_rate=params['neg_rel_rate'])

        # Begin SEEA iterations.
        results = seea.seea_iterate(beta=params['beta'],
                                    max_iter=params['max_iter'],
                                    max_epochs=params['max_epochs'])

        # Defaults so the later export and the return value stay defined
        # when no pairs were aligned (or scoring failed).
        fscore = 0.0
        result_pairs = None
        try:
            result_pairs = pd.MultiIndex.from_tuples(results)
            fscore = log_quality_results(logger, result_pairs,
                                         graph.true_pairs,
                                         len(graph.entity_pairs), params)
        except Exception as e:
            logger.error(e)
            logger.info("No Aligned pairs found.")

        ent_embeddings = seea.get_ent_embeddings()
        export_embeddings('ear', str(model), 'SEEA', graph.entity,
                          ent_embeddings)

        result_prob = [
            (e1, e2,
             abs(spatial.distance.cosine(ent_embeddings[e1],
                                         ent_embeddings[e2])))
            for (e1, e2) in graph.entity_pairs
        ]
        export_result_prob(dataset, 'ear', str(model), 'SEEA', graph.entity,
                           result_prob, graph.true_pairs)

        # False negatives/positives can only be derived from a prediction.
        if result_pairs is not None:
            try:
                export_false_negatives(dataset, 'ear', str(model), 'SEEA',
                                       graph.entity, result_prob,
                                       graph.true_pairs, result_pairs,
                                       graph.entity)
                export_false_positives(dataset, 'ear', str(model), 'SEEA',
                                       graph.entity, result_prob,
                                       graph.true_pairs, result_pairs,
                                       graph.entity)
            except Exception as e:
                logger.error(e)

        # Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        prec_at_1 = ir_metrics.log_metrics(logger, params)

        seea.close_tf_session()
        return (fscore, prec_at_1)
    def test_cora(self):
        """Fit an ECM classifier on Cora and log quality on all three splits.

        The classifier is fitted (unsupervised) on the training comparison
        features and evaluated on the training, validation and test pairs.
        Ranking metrics are logged for the test pairs only.
        """
        log = get_logger('RL.Test.ECMClassifier.CORA')

        # Loads the data for both sides of the linkage.
        dataset = Cora()

        # Training split: extract features and fit.
        train_features = dataset.get_comparision_object().compute(
            dataset.candidate_links, dataset.trainDataA, dataset.trainDataB)
        log.info("Train Features %s", str(train_features.describe()))

        ecm = recordlinkage.ECMClassifier()
        ecm.fit(train_features)

        train_matches = ecm.predict(train_features)
        log_quality_results(log, train_matches, dataset.true_links,
                            len(dataset.candidate_links))

        # Validation split.
        val_features = dataset.get_comparision_object().compute(
            dataset.val_links, dataset.valDataA, dataset.valDataB)
        log.info("Validation Features %s", str(val_features.describe()))

        val_matches = ecm.predict(val_features)
        log_quality_results(log, val_matches, dataset.true_val_links,
                            len(dataset.val_links))

        # Test split.
        test_features = dataset.get_comparision_object().compute(
            dataset.test_links, dataset.testDataA, dataset.testDataB)
        log.info("Test Features %s", str(test_features.describe()))

        test_matches = ecm.predict(test_features)
        log_quality_results(log, test_matches, dataset.true_test_links,
                            len(dataset.test_links))

        # IR stats (MRR, MAP, MP@K) on the test pairs. The metrics take a
        # distance-like score, so the match probability is inverted.
        inverted = [(1 - p) for p in ecm.prob(test_features).tolist()]
        result_prob = [(dataset.test_links[idx][0],
                        dataset.test_links[idx][1], score)
                       for idx, score in enumerate(inverted)]
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 dataset.true_test_links)
        ir_metrics.log_metrics(log)
Exemple #11
0
    def test_febrl(self):
        """Fit logistic regression on FEBRL and log quality on all splits.

        The classifier is trained (supervised, with the true training links)
        on the training comparison features and evaluated on the training,
        validation and test pairs. Ranking metrics are logged for the test
        pairs only.
        """
        log = get_logger('RL.Test.LogisticRegression.FEBRL')

        dataset = FEBRL()

        # Training split: extract features and fit.
        train_features = dataset.get_comparision_object().compute(
            dataset.candidate_links, dataset.trainDataA, dataset.trainDataB)
        log.info("Train Features %s", str(train_features.describe()))

        # Train the logistic regression classifier on the labelled links.
        classifier = recordlinkage.LogisticRegressionClassifier()
        classifier.fit(train_features, dataset.true_links)

        train_matches = classifier.predict(train_features)
        log_quality_results(log, train_matches, dataset.true_links,
                            len(dataset.candidate_links))

        # Validation split.
        val_features = dataset.get_comparision_object().compute(
            dataset.val_links, dataset.valDataA, dataset.valDataB)
        log.info("Validation Features %s", str(val_features.describe()))
        val_matches = classifier.predict(val_features)
        log_quality_results(log, val_matches, dataset.true_val_links,
                            len(dataset.val_links))

        # Test split.
        test_features = dataset.get_comparision_object().compute(
            dataset.test_links, dataset.testDataA, dataset.testDataB)
        log.info("Test Features %s", str(test_features.describe()))

        test_matches = classifier.predict(test_features)
        log_quality_results(log, test_matches, dataset.true_test_links,
                            len(dataset.test_links))

        # IR stats (MRR, MAP, MP@K) on the test pairs. The metrics take a
        # distance-like score, so the match probability is inverted.
        inverted = [(1 - p) for p in classifier.prob(test_features).tolist()]
        result_prob = [(dataset.test_links[idx][0],
                        dataset.test_links[idx][1], score)
                       for idx, score in enumerate(inverted)]
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 dataset.true_test_links)
        ir_metrics.log_metrics(log)
    def test_febrl(self, params=None):
        """Household-aware TransE matching experiment on FEBRL.

        For each candidate pair, the entities sharing the postcode of each
        person are collected as that person's neighbourhood. A cost matrix of
        absolute cosine distances between the two neighbourhoods is solved
        with ``linear_sum_assignment``. The pair score is the total
        assignment cost plus the direct cost between the two persons.

        Args:
            params: Optional dict of hyper-parameters. When falsy,
                ``self.get_default_params()`` is used. Read for 'dimension',
                'learning_rate', 'margin', 'regularizer_scale', 'batchSize',
                'neg_rate', 'neg_rel_rate' and 'epochs'. 'threshold' is
                written.

        Returns:
            The maximum F-score returned by ``get_optimal_threshold``.
        """
        if not params:
            params = self.get_default_params()

        # Load graph data
        graph = Graph_ER(FEBRL)
        model = FEBRL()
        logger = get_logger('RL.Test.TransE.Household.' + str(model))

        transe = TransE(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transe.get_ent_embeddings()

        # Experimenting household matching
        postcode_rel_id = graph.relation.index("postcode")
        result_prob = []
        for (person_A, person_B) in graph.entity_pairs:
            # Neighbourhood of A: every head linked to A's postcode entity.
            postcode_A = [
                t for (h, t, r) in graph.triples
                if h == person_A and r == postcode_rel_id
            ][0]
            neighbours_A = [
                h for (h, t, r) in graph.triples if t == postcode_A
            ]
            # Neighbourhood of B, built the same way.
            postcode_B = [
                t for (h, t, r) in graph.triples
                if h == person_B and r == postcode_rel_id
            ][0]
            neighbours_B = [
                h for (h, t, r) in graph.triples if t == postcode_B
            ]

            cost_matrix = np.zeros(shape=(len(neighbours_A),
                                          len(neighbours_B)))
            for row, member_A in enumerate(neighbours_A):
                for col, member_B in enumerate(neighbours_B):
                    if member_A == member_B:
                        # Large fixed cost so an entity is not aligned
                        # with itself.
                        cost_matrix[row][col] = 100
                    else:
                        cost_matrix[row][col] = abs(
                            spatial.distance.cosine(
                                ent_embeddings[member_A],
                                ent_embeddings[member_B]))

            row_ind, col_ind = linear_sum_assignment(cost_matrix)

            person_A_index = neighbours_A.index(person_A)
            person_B_index = neighbours_B.index(person_B)
            distance = cost_matrix[row_ind, col_ind].sum(
            ) + cost_matrix[person_A_index][person_B_index]

            # Record the score against the pair being evaluated. The matrix
            # loop variables must not be used to look the pair up again.
            result_prob.append((person_A, person_B, distance))

        export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                          ent_embeddings)
        export_result_prob(FEBRL, 'er', str(model), 'TransE.Household',
                           graph.entity, result_prob, graph.true_pairs)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)
        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception as e:
            # Best effort: log the cause instead of hiding it, and do not
            # swallow KeyboardInterrupt/SystemExit as a bare ``except`` would.
            logger.info("Zero Reults")
            logger.error(e)

        # Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
        return max_fscore
    def test_census(self, params=None):
        """Household-aware TransE matching experiment on Census.

        For each candidate pair, the members of each person's household are
        collected and a cost matrix of absolute cosine distances between the
        two households is solved with ``linear_sum_assignment``. The pair
        score is the total assignment cost, plus the direct distance between
        the two persons when the optimal assignment does not pair them with
        each other. Scores are divided by a fixed constant before
        thresholding.

        Args:
            params: Optional dict of hyper-parameters. When falsy,
                ``self.get_default_params()`` is used. Read for 'dimension',
                'learning_rate', 'margin', 'regularizer_scale', 'batchSize',
                'neg_rate', 'neg_rel_rate' and 'epochs'. 'threshold' is
                written.

        Returns:
            The maximum F-score returned by ``get_optimal_threshold``.
        """
        if not params:
            params = self.get_default_params()

        # Load graph data
        graph = Graph_ER(Census)
        model = Census()
        logger = get_logger('RL.Test.TransE.Household.' + str(model))

        transe = TransE(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transe.get_ent_embeddings()

        # Experimenting household matching
        result_prob = []
        for ep_index, pair in enumerate(graph.entity_pairs):
            person_A, person_B = pair[0], pair[1]

            # NOTE(review): relation ids > 6 are presumably the household
            # relations - confirm against the Census graph definition.
            household_A = [
                t for (h, t, r) in graph.triples if h == person_A and r > 6
            ][0]
            family_members_A = [
                h for (h, t, r) in graph.triples if t == household_A
            ]
            household_B = [
                t for (h, t, r) in graph.triples if h == person_B and r > 6
            ][0]
            family_members_B = [
                h for (h, t, r) in graph.triples if t == household_B
            ]

            cost_matrix = np.zeros(shape=(len(family_members_A),
                                          len(family_members_B)))
            for row, member_A in enumerate(family_members_A):
                for col, member_B in enumerate(family_members_B):
                    cost_matrix[row][col] = abs(
                        spatial.distance.cosine(ent_embeddings[member_A],
                                                ent_embeddings[member_B]))

            row_ind, col_ind = linear_sum_assignment(cost_matrix)

            eA_index = family_members_A.index(person_A)
            eB_index = family_members_B.index(person_B)

            # Is (A, B) part of the minimum-cost assignment?
            rowA = np.where(row_ind == eA_index)[0]
            if len(rowA) and col_ind[rowA[0]] == eB_index:
                distance = cost_matrix[row_ind, col_ind].sum()
            else:
                # Not assigned to each other: add their direct distance.
                distance = cost_matrix[row_ind, col_ind].sum() + abs(
                    spatial.distance.cosine(ent_embeddings[person_A],
                                            ent_embeddings[person_B]))

            # Record the score against the pair being evaluated. The matrix
            # loop variables must not be used to look the pair up again.
            result_prob.append((person_A, person_B, distance))
            if ep_index % 1000 == 0:
                logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                            distance, pair in graph.true_pairs)

        # Normalize distance by a fixed constant.
        max_distance = 10
        result_prob = [(r[0], r[1], (r[2] / max_distance))
                       for r in result_prob]

        for r in result_prob[:100]:
            logger.info("distance: %f true_pairs: %s", r[2], (r[0], r[1])
                        in graph.true_pairs)
        export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                          ent_embeddings)
        export_result_prob(Census, 'er', str(model), 'TransE.Household',
                           graph.entity, result_prob, graph.true_pairs)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)
        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception as e:
            # Best effort: log the cause instead of hiding it, and do not
            # swallow KeyboardInterrupt/SystemExit as a bare ``except`` would.
            logger.info("Zero Reults")
            logger.error(e)

        # Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
        return max_fscore
Exemple #14
0
    def test_census_new(self):
        """Classify Census links with logistic regression over RLTransE residuals.

        Trains ``RLTransE`` on the ``Graph_VEG`` graph built from ``Census``,
        converts every candidate pair of the train and test splits into a
        feature vector with one ``params['dimension']``-sized segment per
        mapped field, fits a ``recordlinkage.LogisticRegressionClassifier`` on
        the training features and logs quality results for both splits plus
        the ranking metrics of the test split.

        Segment contents per field:
          * equal values            -> zeros
          * value unknown to graph  -> ones
          * otherwise               -> ``value_a + relation - value_b``
        """
        c = Census()
        graph = Graph_VEG(Census)
        logger = get_logger("RL.Test.LogisticRLTransE.Census")
        logger.info("values for name : %s",
                    str(graph.relation_value_map[graph.relation[1]][:10]))
        logger.info("relation: %s", str(graph.relation))
        logger.info("train_triples: %s", str(graph.train_triples[:10]))
        logger.info("set train_triples size %d", len(set(graph.train_triples)))

        params = self.get_default_params()
        transe = RLTransE(graph,
                          dimension=params['dimension'],
                          learning_rate=params['learning_rate'],
                          margin=params['margin'],
                          regularizer_scale=params['regularizer_scale'],
                          batchSize=params['batchSize'],
                          neg_rate=params['neg_rate'],
                          neg_rel_rate=params['neg_rel_rate'])
        loss, val_loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f val_loss: %f", loss,
                    val_loss)

        value_embeddings = transe.get_val_embeddings()
        relation_embeddings = transe.get_rel_embeddings()

        #Map of fields in census dataFrame to VEG relations.
        field_relation_map = {
            c.field_map[CensusFields.FIRST_NAME]: "name",
            c.field_map[CensusFields.SURNAME_1]: "surname",
            c.field_map[CensusFields.SURNAME_2]: "surname2",
            c.field_map[CensusFields.YOB]: "yob",
            c.field_map[CensusFields.CIVIL_STATUS]: "civil",
            c.field_map[CensusFields.OCCUPATION]: "occupation",
            c.field_map[CensusFields.RELATION]: "relation"
        }

        dimension = params['dimension']
        missing_values = []
        train_features = []  #Size samples*(dimension*rel_count)
        test_features = []
        splits = [
            (c.candidate_links, c.trainDataA, c.trainDataB, train_features),
            (c.test_links, c.testDataA, c.testDataB, test_features),
        ]
        for (candidate_links, dataA, dataB, features) in splits:
            for (a, b) in candidate_links:
                row_a = dataA.loc[a]
                row_b = dataB.loc[b]
                distance = []

                for f, rel in field_relation_map.items():
                    val_a = row_a[f]
                    val_b = row_b[f]
                    if val_a == val_b:
                        # Every field must contribute a segment, otherwise the
                        # columns of different pairs refer to different fields
                        # and short rows get padded with the "far" value 1.
                        distance.extend([0.0] * dimension)
                        continue

                    known_values = graph.relation_value_map[rel]
                    try:
                        val_index_a = known_values.index(val_a)
                    except ValueError:
                        missing_values.append(val_a)
                        distance.extend([1.0] * dimension)
                        continue
                    try:
                        val_index_b = known_values.index(val_b)
                    except ValueError:
                        missing_values.append(val_b)
                        distance.extend([1.0] * dimension)
                        continue
                    rel_index = graph.relation.index(rel)

                    distance.extend(value_embeddings[rel][val_index_a] +
                                    relation_embeddings[rel_index] -
                                    value_embeddings[rel][val_index_b])

                features.append(pd.Series(distance).rename((a, b)))
        logger.info("No. of missing values: %d", len(missing_values))
        logger.info("Unique No. of missing values: %d",
                    len(set(missing_values)))

        # fillna is kept as a safety net for any remaining gaps.
        train_features = pd.DataFrame(data=train_features).fillna(1)
        test_features = pd.DataFrame(data=test_features).fillna(1)
        logger.info("Shape of Train features: %s", str(train_features.shape))
        logger.info("Shape of Test features: %s", str(test_features.shape))

        #Train Logistic Regression Model
        logrg = recordlinkage.LogisticRegressionClassifier()
        logrg.fit(train_features, c.true_links)
        result = logrg.predict(train_features)
        result = pd.MultiIndex.from_tuples(result.to_series())
        log_quality_results(logger, result, c.true_links,
                            len(c.candidate_links), params)

        #Test Classifier
        result = logrg.predict(test_features)
        result = pd.MultiIndex.from_tuples(result.to_series())
        log_quality_results(logger, result, c.true_test_links,
                            len(c.test_links), params)

        # TODO: export the value embeddings and the result probabilities
        # (export_embeddings / export_result_prob), as the other tests do.

        # The metrics rank by distance, so turn match probability p into 1 - p.
        prob_series = logrg.prob(test_features)
        result_prob = [(pair[0], pair[1], 1 - p)
                       for pair, p in zip(c.test_links, prob_series.tolist())]
        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 c.true_test_links)
        ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
Exemple #15
0
    def _test_rl_transe(self, model, field_relation_map, params):
        """Train RLTransE for ``model`` and score its test links by distance.

        For every test pair the distance is the sum, over the fields in
        ``field_relation_map``, of: 0 when both values are equal, 1 when a
        value is not present in the graph, otherwise the absolute cosine
        distance between ``value_a + relation`` and ``value_b``.

        Args:
            model: dataset class; instantiated here and passed to ``Graph_VEG``.
            field_relation_map: mapping of data-frame column -> graph relation.
            params: hyper-parameter dict (``dimension``, ``learning_rate``,
                ``margin``, ``regularizer_scale``, ``batchSize``, ``neg_rate``,
                ``neg_rel_rate``, ``epochs``). ``params['threshold']`` is set
                to the optimal threshold found.

        Returns:
            Tuple ``(max_fscore, precison_at_1)`` where the second item is
            whatever ``InformationRetrievalMetrics.log_metrics`` returns.
        """
        dataset = model()
        graph = Graph_VEG(model)
        logger = get_logger("RL.Test.RLTransE." + str(dataset))
        logger.info("values for name : %s",
                    str(graph.relation_value_map[graph.relation[1]][:10]))
        logger.info("relation: %s", str(graph.relation))
        logger.info("train_triples: %s", str(graph.train_triples[:10]))
        logger.info("set train_triples size %d", len(set(graph.train_triples)))

        transe = RLTransE(graph,
                          dimension=params['dimension'],
                          learning_rate=params['learning_rate'],
                          margin=params['margin'],
                          regularizer_scale=params['regularizer_scale'],
                          batchSize=params['batchSize'],
                          neg_rate=params['neg_rate'],
                          neg_rel_rate=params['neg_rel_rate'])
        loss, val_loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f val_loss: %f", loss,
                    val_loss)

        value_embeddings = transe.get_val_embeddings()
        relation_embeddings = transe.get_rel_embeddings()

        result_prob = []
        distance_distribution = []
        missing_values = []
        for (a, b) in dataset.test_links:
            row_a = dataset.testDataA.loc[a]
            row_b = dataset.testDataB.loc[b]

            distance = 0
            dd = []  # per-field contributions, in field_relation_map order
            for f, rel in field_relation_map.items():
                val_a = row_a[f]
                val_b = row_b[f]
                if val_a == val_b:
                    dd.append(0)
                    continue

                known_values = graph.relation_value_map[rel]
                try:
                    val_index_a = known_values.index(val_a)
                except ValueError:
                    missing_values.append(val_a)
                    distance = distance + 1
                    dd.append(1)
                    continue
                try:
                    val_index_b = known_values.index(val_b)
                except ValueError:
                    missing_values.append(val_b)
                    distance = distance + 1
                    dd.append(1)
                    continue
                rel_index = graph.relation.index(rel)

                cur_distance = abs(
                    spatial.distance.cosine(
                        value_embeddings[rel][val_index_a] +
                        relation_embeddings[rel_index],
                        value_embeddings[rel][val_index_b]))
                distance = distance + cur_distance
                dd.append(cur_distance)

            result_prob.append((a, b, distance))
            distance_distribution.append((a, b, dd, distance))
        logger.info("No. of missing values: %d", len(missing_values))
        logger.info("Unique No. of missing values: %d",
                    len(set(missing_values)))
        try:
            entities = ["value\trelation"]
            for r in graph.relation_value_map:
                for v in graph.relation_value_map[r]:
                    entities.append("\t".join([v, r]))

            embeddings = []
            for rel in value_embeddings:
                val_count = len(graph.relation_value_map[rel])
                embeddings.extend(value_embeddings[rel][:val_count])

            #Write Embeddings to file
            export_embeddings('veg', str(dataset), 'RLTransE_val', entities,
                              embeddings)
            export_embeddings('veg', str(dataset), 'RLTransE_rel',
                              graph.relation, relation_embeddings)
        except Exception as e:
            # Export is best effort; the evaluation below does not need it.
            logger.error("Failed to export embeddings")
            logger.error(e)

        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, dataset.true_test_links, max_threshold=3.0, step=0.02)

        params['threshold'] = optimal_threshold
        # Default to "no matches" so the export code below still runs when
        # nothing falls under the threshold (from_tuples rejects empty input).
        result = []
        try:
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, dataset.true_test_links,
                                len(dataset.test_links), params)
        except Exception:
            logger.info("Zero Reults")

        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 dataset.true_test_links)
        precison_at_1 = ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()

        #Export False Positives and result probabilities
        def entity_name(row):
            # Display name of a record: "<individual id>_<DNI>".
            return "_".join([
                str(row[dataset.field_map[CensusFields.ID_INDIVIDUAL]]),
                str(row[dataset.field_map[CensusFields.DNI]])
            ])

        def first_positions(names):
            # name -> first position, i.e. what list.index() would return.
            positions = {}
            for pos, name in enumerate(names):
                positions.setdefault(name, pos)
            return positions

        entitiesA = [
            entity_name(dataset.testDataA.iloc[i])
            for i in range(int(dataset.testDataA.shape[0]))
        ]
        entitiesB = [
            entity_name(dataset.testDataB.iloc[i])
            for i in range(int(dataset.testDataB.shape[0]))
        ]
        positions_a = first_positions(entitiesA)
        positions_b = first_positions(entitiesB)

        def pos_a(label):
            return positions_a[entity_name(dataset.testDataA.loc[int(label)])]

        def pos_b(label):
            return positions_b[entity_name(dataset.testDataB.loc[int(label)])]

        result_prob = [(pos_a(a), pos_b(b), p) for (a, b, p) in result_prob]
        true_links = [(pos_a(a), pos_b(b))
                      for (a, b) in dataset.true_test_links]
        export_result_prob(Census, 'veg', 'census', 'rltranse', entitiesA,
                           result_prob, true_links, entitiesB)

        # Each row maps its own pair (e1, e2), not the leftover loop variables.
        distance_distribution = [
            (pos_a(e1), pos_b(e2), ["%.2f" % float(w) for w in dd], 1 - d)
            for (e1, e2, dd, d) in distance_distribution if (e1, e2) in result
        ]
        export_human_readable_results(Census, 'veg', 'census', 'rltranse',
                                      entitiesA, distance_distribution,
                                      entitiesB)

        result = [(pos_a(a), pos_b(b)) for (a, b) in result]
        export_false_negatives(Census, 'veg', 'census', 'rltranse', entitiesA,
                               result_prob, true_links, result, entitiesB)
        export_false_positives(Census, 'veg', 'census', 'rltranse', entitiesA,
                               result_prob, true_links, result, entitiesB)

        return (max_fscore, precison_at_1)
    def test_ecm(self):
        """Fit an ECM classifier on Census and export its test-split results.

        The classifier is fitted (without labels) on the training candidate
        pairs, evaluated on the train, validation and test splits, and the
        test predictions are exported as result probabilities, false
        negatives/positives and a human readable feature listing. The
        duration of the export steps is logged.
        """
        logger = get_logger('RL.Test.ECMClassifier.Census')

        census = Census()

        # --- Train -----------------------------------------------------
        train_features = census.get_comparision_object().compute(
            census.candidate_links, census.trainDataA, census.trainDataB)
        logger.info("Train Features %s", str(train_features.describe()))

        ecm = recordlinkage.ECMClassifier()
        ecm.fit(train_features)

        predicted = ecm.predict(train_features)
        log_quality_results(logger, predicted, census.true_links,
                            len(census.candidate_links))

        # --- Validate --------------------------------------------------
        val_features = census.get_comparision_object().compute(
            census.val_links, census.valDataA, census.valDataB)
        logger.info("Validation Features %s", str(val_features.describe()))

        predicted = ecm.predict(val_features)
        log_quality_results(logger, predicted, census.true_val_links,
                            len(census.val_links))

        # --- Test ------------------------------------------------------
        test_features = census.get_comparision_object().compute(
            census.test_links, census.testDataA, census.testDataB)
        logger.info("Test Features %s", str(test_features.describe()))

        predicted = ecm.predict(test_features)
        log_quality_results(logger, predicted, census.true_test_links,
                            len(census.test_links))

        logger.info("ECM weights: %s", str(ecm.weights))

        # IR stats (MRR, MAP, MP@K) are computed on 1 - probability.
        result_prob = []
        for i, p in enumerate(ecm.prob(test_features).tolist()):
            pair = census.test_links[i]
            result_prob.append((pair[0], pair[1], 1 - p))
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 census.true_test_links)
        ir_metrics.log_metrics(logger)

        # Feature values of every predicted match, keyed by data-frame labels.
        result_feature_mapping = []
        for (e1, e2, d) in result_prob:
            if (e1, e2) in predicted:
                values = [str(v) for v in test_features.loc[(e1, e2)].values]
                result_feature_mapping.append((e1, e2, values, d))

        def name_of(row):
            # "<individual id>_<DNI>" display name of one record.
            return "_".join([
                str(row[census.field_map[CensusFields.ID_INDIVIDUAL]]),
                str(row[census.field_map[CensusFields.DNI]])
            ])

        def pos_in_a(label):
            # Position in entitiesA of the record labelled `label`.
            return entitiesA.index(name_of(census.testDataA.loc[int(label)]))

        def pos_in_b(label):
            # Position in entitiesB of the record labelled `label`.
            return entitiesB.index(name_of(census.testDataB.loc[int(label)]))

        started = timeit.default_timer()
        entitiesA = []
        for i in range(int(census.testDataA.shape[0])):
            entitiesA.append(name_of(census.testDataA.iloc[i]))
        entitiesB = []
        for i in range(int(census.testDataB.shape[0])):
            entitiesB.append(name_of(census.testDataB.iloc[i]))
        logger.info("Entities built in %s",
                    str(timeit.default_timer() - started))

        started = timeit.default_timer()
        result_prob = [(pos_in_a(a), pos_in_b(b), p)
                       for (a, b, p) in result_prob]
        logger.info("Result prob in %s",
                    str(timeit.default_timer() - started))

        started = timeit.default_timer()
        true_links = [(pos_in_a(a), pos_in_b(b))
                      for (a, b) in census.true_test_links]
        logger.info("true_links in %s",
                    str(timeit.default_timer() - started))

        started = timeit.default_timer()
        export_result_prob(Census, 'ECM', 'census', 'ecm', entitiesA,
                           result_prob, true_links, entitiesB)
        logger.info("Result prob EXPORTED in %s",
                    str(timeit.default_timer() - started))

        started = timeit.default_timer()
        predicted = [(pos_in_a(a), pos_in_b(b)) for (a, b) in predicted]
        export_false_negatives(Census, 'ECM', 'census', 'ecm', entitiesA,
                               result_prob, true_links, predicted, entitiesB)
        export_false_positives(Census, 'ECM', 'census', 'ecm', entitiesA,
                               result_prob, true_links, predicted, entitiesB)
        logger.info("FP & FN EXPORTED in %s",
                    str(timeit.default_timer() - started))

        result_feature_mapping = [(pos_in_a(a), pos_in_b(b), w, p)
                                  for (a, b, w, p) in result_feature_mapping]
        export_human_readable_results(Census, 'ECM', 'census', 'ecm',
                                      entitiesA, result_feature_mapping,
                                      entitiesB)
        logger.info("Exported Human Readable Results")
Exemple #17
0
    def _test_logistic_transh_erer(self, dataset, params):
        """Link two ERER graphs with logistic regression over TransH embeddings.

        One TransH model is trained per graph; the rows of the two embedding
        matrices become the records that ``recordlinkage`` compares dimension
        by dimension. The classifier is fitted on the prior pairs plus one
        randomly sampled negative pair per prior pair, then evaluated on all
        entity pairs.

        Args:
            dataset: dataset class; its instance must provide
                ``get_erer_model()``.
            params: hyper-parameter dict (``dimension``, ``learning_rate``,
                ``margin``, ``regularizer_scale``, ``batchSize``, ``epochs``).
        """
        model = dataset()
        logger = get_logger('RL.Test.erer.LogisticTransH.ERER.' + str(model))
        (entA, entB, relA, relB, triA, triB, entity_pairs, prior_pairs,
         true_pairs) = model.get_erer_model()

        self.assertTrue(all([(tp in entity_pairs) for tp in true_pairs]))
        #Generate embeddings for datasetA
        transh = TransH(entA,
                        relA,
                        triA,
                        prior_pairs,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'])
        loss = transh.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)
        ent_embeddingsA = transh.get_ent_embeddings()
        transh.close_tf_session()
        del transh

        #Generate embeddings for datasetB
        # NOTE(review): graph A is trained with prior_pairs, graph B with
        # entity_pairs - confirm this asymmetry is intended.
        transh = TransH(entB,
                        relB,
                        triB,
                        entity_pairs,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'])
        loss = transh.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)
        ent_embeddingsB = transh.get_ent_embeddings()
        transh.close_tf_session()

        ent_embeddingsA = [
            np.array(ent_embeddingsA[i])
            for i in range(ent_embeddingsA.shape[0])
        ]
        ent_embeddingsB = [
            np.array(ent_embeddingsB[i])
            for i in range(ent_embeddingsB.shape[0])
        ]
        trainDataA = pd.DataFrame(data=ent_embeddingsA)
        trainDataB = pd.DataFrame(data=ent_embeddingsB)

        def build_compare():
            # One numeric comparison per embedding dimension.
            compare = recordlinkage.Compare()
            for i in range(0, params['dimension']):
                compare.numeric(i, i, label=str(i))  #method='exp')
            return compare

        #sample negative pairs
        train_pairs = []
        tuple_pp = set(map(tuple, prior_pairs))
        logger.info("Number of prior_pairs: %d", len(prior_pairs))
        for e1, e2 in prior_pairs:
            train_pairs.append((e1, e2))
            # Rejection-sample one entity of B that is not a known match of e1.
            # randrange replaces the Python-2-only random.choice(xrange(...)).
            while True:
                neg_e2 = random.randrange(len(entB))
                if neg_e2 != e2 and (e1, neg_e2) not in tuple_pp:
                    train_pairs.append((e1, neg_e2))
                    break
        logger.info("Number of Train Pairs: %d", len(train_pairs))
        candidate_links = pd.MultiIndex.from_tuples(train_pairs)
        features = build_compare().compute(candidate_links, trainDataA,
                                           trainDataB)
        logger.info("Train Features %s", str(features.describe()))

        #Train Logistic Regression Model
        logrg = recordlinkage.LogisticRegressionClassifier()
        candidate_links = pd.MultiIndex.from_tuples(prior_pairs)
        logrg.fit(features, candidate_links)

        #Test Classifier
        candidate_links = pd.MultiIndex.from_tuples(entity_pairs)
        features = build_compare().compute(candidate_links, trainDataA,
                                           trainDataB)
        logger.info("Test Features %s", str(features.describe()))
        result = logrg.predict(features)
        log_quality_results(logger, result, true_pairs, len(entity_pairs))

        # The metrics rank by distance, so turn match probability p into 1 - p.
        prob_series = logrg.prob(features)
        result_prob = [(entity_pairs[i][0], entity_pairs[i][1], 1 - p)
                       for i, p in enumerate(prob_series.tolist())]
        ir_metrics = InformationRetrievalMetrics(result_prob, true_pairs)
        # NOTE(review): params is passed twice here while every other caller
        # in this file passes it at most once - verify against log_metrics.
        ir_metrics.log_metrics(logger, params, params)

        #Export results
        export_embeddings('erer', str(model), 'LogTransH', entA,
                          ent_embeddingsA)
        export_embeddings('erer', str(model), 'LogTransH', entB,
                          ent_embeddingsB)
        export_result_prob(dataset, 'erer', str(model), 'LogTransH', entA,
                           result_prob, true_pairs, entB)
    def test_logistic(self):
        """Fit logistic regression on Census and export its test-split results.

        The classifier is trained on the training candidate pairs with
        ``census.true_links`` as labels, evaluated on the train, validation
        and test splits, and the test predictions are exported as result
        probabilities, false negatives/positives and a human readable
        feature listing.
        """
        logger = get_logger('RL.Test.LogisticRegression.Census')

        census = Census()

        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.candidate_links,
                                      census.trainDataA, census.trainDataB)
        logger.info("Train Features %s", str(features.describe()))

        # Train the logistic regression classifier
        logrg = recordlinkage.LogisticRegressionClassifier()
        logrg.fit(features, census.true_links)

        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_links,
                            len(census.candidate_links))

        #Validate the classifier
        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.val_links, census.valDataA,
                                      census.valDataB)
        logger.info("Validation Features %s", str(features.describe()))
        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_val_links,
                            len(census.val_links))

        #Test the classifier
        compare_cl = census.get_comparision_object()
        features = compare_cl.compute(census.test_links, census.testDataA,
                                      census.testDataB)
        logger.info("Test Features %s", str(features.describe()))

        result = logrg.predict(features)
        log_quality_results(logger, result, census.true_test_links,
                            len(census.test_links))

        logger.info("logrg coefficients: %s", str(logrg.coefficients))
        #Log IR Stats: MRR, MAP, MP@K (computed on 1 - probability)
        prob_series = logrg.prob(features)
        result_prob = [(census.test_links[i][0], census.test_links[i][1],
                        1 - p) for i, p in enumerate(prob_series.tolist())]
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 census.true_test_links)
        ir_metrics.log_metrics(logger)

        #Export False Positives and result probabilities
        # Feature values of every predicted match, keyed by data-frame labels.
        result_feature_mapping = [
            (e1, e2, [str(v) for v in features.loc[(e1, e2)].values], d)
            for (e1, e2, d) in result_prob if (e1, e2) in result
        ]

        def entity_name(row):
            # Display name of a record: "<individual id>_<DNI>".
            return "_".join([
                str(row[census.field_map[CensusFields.ID_INDIVIDUAL]]),
                str(row[census.field_map[CensusFields.DNI]])
            ])

        def first_positions(names):
            # name -> first position, i.e. what list.index() would return.
            positions = {}
            for pos, name in enumerate(names):
                positions.setdefault(name, pos)
            return positions

        entitiesA = [
            entity_name(census.testDataA.iloc[i])
            for i in range(int(census.testDataA.shape[0]))
        ]
        entitiesB = [
            entity_name(census.testDataB.iloc[i])
            for i in range(int(census.testDataB.shape[0]))
        ]
        positions_a = first_positions(entitiesA)
        positions_b = first_positions(entitiesB)

        def pos_a(label):
            return positions_a[entity_name(census.testDataA.loc[int(label)])]

        def pos_b(label):
            return positions_b[entity_name(census.testDataB.loc[int(label)])]

        result_prob = [(pos_a(a), pos_b(b), p) for (a, b, p) in result_prob]
        true_links = [(pos_a(a), pos_b(b))
                      for (a, b) in census.true_test_links]
        export_result_prob(Census, 'LogisticRegression', 'census', 'logistic',
                           entitiesA, result_prob, true_links, entitiesB)

        result = [(pos_a(a), pos_b(b)) for (a, b) in result]
        export_false_negatives(Census, 'LogisticRegression', 'census',
                               'logistic', entitiesA, result_prob, true_links,
                               result, entitiesB)
        export_false_positives(Census, 'LogisticRegression', 'census',
                               'logistic', entitiesA, result_prob, true_links,
                               result, entitiesB)

        result_feature_mapping = [(pos_a(a), pos_b(b), w, p)
                                  for (a, b, w, p) in result_feature_mapping]
        export_human_readable_results(Census, 'LogisticRegression', 'census',
                                      'logistic', entitiesA,
                                      result_feature_mapping, entitiesB)
    def test_cora(self, params=None):
        """Score Cora entity pairs with TransE embeddings and author alignment.

        The distance of a pair is the cost of the optimal assignment between
        the two records' author entities (absolute cosine distance of their
        embeddings) plus the absolute cosine distance between the embeddings
        of the two records themselves.

        Args:
            params: hyper-parameter dict (``dimension``, ``learning_rate``,
                ``margin``, ``regularizer_scale``, ``batchSize``, ``neg_rate``,
                ``neg_rel_rate``, ``epochs``); ``self.get_default_params()``
                is used when falsy. ``params['threshold']`` is set to the
                optimal threshold found.

        Returns:
            The maximum f-score reported by ``get_optimal_threshold``.
        """
        if not params:
            params = self.get_default_params()

        #Load Graph Data
        graph = Graph_ER(Cora)
        model = Cora()
        logger = get_logger('RL.Test.TransE.Household.' + str(model))

        transe = TransE(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transe.get_ent_embeddings()

        #Experimenting household matching
        auth_rel_index = graph.relation.index('author')

        # Index the author tails by head once instead of rescanning every
        # triple for each entity pair; triple order is preserved per head.
        authors_by_head = {}
        for (h, t, r) in graph.triples:
            if r == auth_rel_index:
                authors_by_head.setdefault(h, []).append(t)

        result_prob = []
        for ep_index, pair in enumerate(graph.entity_pairs):
            authors_A = authors_by_head.get(pair[0], [])
            authors_B = authors_by_head.get(pair[1], [])

            cost_matrix = np.zeros(shape=(len(authors_A), len(authors_B)))
            for i, author_a in enumerate(authors_A):
                for j, author_b in enumerate(authors_B):
                    cost_matrix[i][j] = abs(
                        spatial.distance.cosine(ent_embeddings[author_a],
                                                ent_embeddings[author_b]))

            # Minimum-cost one-to-one alignment of the two author lists.
            row_ind, col_ind = linear_sum_assignment(cost_matrix)
            distance = cost_matrix[row_ind, col_ind].sum() + abs(
                spatial.distance.cosine(ent_embeddings[pair[0]],
                                        ent_embeddings[pair[1]]))
            result_prob.append((pair[0], pair[1], distance))
            if distance <= 0.05:
                logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                            distance, pair in graph.true_pairs)

        export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                          ent_embeddings)
        export_result_prob(Cora, 'er', str(model), 'TransE.Household',
                           graph.entity, result_prob, graph.true_pairs)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)
        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception:
            # Best effort: e.g. no pair falls under the threshold. Unlike a
            # bare except this lets KeyboardInterrupt/SystemExit propagate.
            logger.info("Zero Reults")

        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
        return max_fscore
    def test_veer(self):
        """Train VEER on the Census data, log its quality and export results.

        Builds a VEER model over a fixed list of Census columns, trains it,
        scores the test pairs and picks the distance threshold returned by
        ``get_optimal_threshold``. Quality and information-retrieval metrics
        are logged, then the value embeddings, the scored pairs, the false
        negatives/positives and a per-column distance breakdown of the
        accepted pairs are exported.

        The model's session is released through ``close_tf_session`` even
        when one of the steps raises.
        """
        logger = get_logger('RL.Test.VEER.Census')

        dataset = Census()

        #Columns of interest for Sant Feliu town
        columns = [
            'Noms_harmo', 'cognom_1', 'cohort', 'estat_civil', 'parentesc_har',
            'ocupacio_hisco'
        ]
        params = {
            'learning_rate': 0.1,
            'margin': 0.1,
            'dimension': 32,
            'epochs': 50,
            'regularizer_scale': 0.1,
            'batchSize': 512
        }

        veer = VEER(Census,
                    columns,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])

        try:
            #Train Model
            loss, val_loss = veer.train(max_epochs=params['epochs'])
            logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                        val_loss)

            #Test Model
            result_prob, accuracy = veer.test()
            logger.info("Predict count: %d", len(result_prob))
            logger.info(
                "Sample Prob: %s",
                str([(c, (a, b) in dataset.true_test_links)
                     for (a, b, c) in result_prob[:20]]))
            logger.info("Column Weights: %s", str(veer.get_col_weights()))
            logger.info("Accuracy: %s", str(accuracy))
            logger.info("Sample embeddings: %s",
                        str(veer.get_val_embeddings()[0]))

            #Compute Performance measures
            optimal_threshold, _ = get_optimal_threshold(
                result_prob, dataset.true_test_links, max_threshold=2.0)

            # MultiIndex.from_tuples raises on an empty list, so when no pair
            # passes the threshold the except branch runs. Start from an empty
            # two-level index so the export steps below see "no matches"
            # instead of failing with NameError on an unbound ``result``.
            result = pd.MultiIndex.from_arrays([[], []])
            try:
                params['threshold'] = optimal_threshold
                result = pd.MultiIndex.from_tuples(
                    [(e1, e2) for (e1, e2, d) in result_prob
                     if d <= optimal_threshold])
                log_quality_results(logger, result, dataset.true_test_links,
                                    len(dataset.test_links), params)
            except Exception as e:
                logger.info("Zero Reults")
                logger.error(e)

            #Log MAP, MRR and Hits@K
            ir_metrics = InformationRetrievalMetrics(result_prob,
                                                     dataset.true_test_links)
            ir_metrics.log_metrics(logger, params)

            #Export embeddings
            embeddings = veer.get_val_embeddings()
            export_embeddings('veg', 'census', 'veer', veer.values, embeddings)

            def column_distance(id_a, id_b, column):
                """Cosine distance between both records' values in a column.

                Each cell is cleaned with ``veer._clean`` and looked up in
                ``veer.values`` to find the row of its embedding.
                """
                emb_a = embeddings[veer.values.index(
                    veer._clean(dataset.testDataA.loc[id_a][column]))]
                emb_b = embeddings[veer.values.index(
                    veer._clean(dataset.testDataB.loc[id_b][column]))]
                return abs(spatial.distance.cosine(emb_a, emb_b))

            #Write Result Prob to file
            # One entry per accepted pair: ids, per-column distances (as
            # strings) and the overall distance.
            result_feature_mapping = [
                (e1, e2, [str(column_distance(e1, e2, c)) for c in columns], d)
                for (e1, e2, d) in result_prob if (e1, e2) in result
            ]

            entitiesA = dataset.get_entity_names(dataset.testDataA)
            entitiesB = dataset.get_entity_names(dataset.testDataB)
            # Map each row label (as a string) to its row position.
            index_dictA = {
                str(label): position
                for position, label in enumerate(dataset.testDataA.index)
            }
            index_dictB = {
                str(label): position
                for position, label in enumerate(dataset.testDataB.index)
            }
            result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p)
                           for (a, b, p) in result_prob]
            # NOTE(review): result_prob is re-keyed to row positions here,
            # while true_test_links and result keep their original ids;
            # _test_veer re-keys all three before exporting -- confirm which
            # form the export helpers expect.
            export_result_prob(dataset, 'veg', str(dataset), 'VEER', entitiesA,
                               result_prob, dataset.true_test_links, entitiesB)
            export_false_negatives(Census, 'veg', str(dataset), 'VEER',
                                   entitiesA, result_prob,
                                   dataset.true_test_links, result, entitiesB)
            export_false_positives(Census, 'veg', str(dataset), 'VEER',
                                   entitiesA, result_prob,
                                   dataset.true_test_links, result, entitiesB)

            result_feature_mapping = [
                (index_dictA[str(a)], index_dictB[str(b)], w, p)
                for (a, b, w, p) in result_feature_mapping
            ]
            export_human_readable_results(Census, 'veg', str(dataset), 'VEER',
                                          entitiesA, result_feature_mapping,
                                          entitiesB)
        finally:
            # Release the session on every exit path, not only on success.
            veer.close_tf_session()
# Example #21 / 0 -- separator residue from the listing this snippet was taken from; not code.
    def _test_veer(self, model, columns, params):
        """Train and evaluate VEER on the dataset produced by ``model``.

        Args:
            model: Dataset class. It is called without arguments to load the
                data and is also handed to ``VEER`` and the export helpers.
            columns: Columns passed to ``VEER``.
            params: Dict providing 'dimension', 'learning_rate', 'margin',
                'regularizer_scale', 'batchSize' and 'epochs'. The selected
                threshold is written back under 'threshold'.

        Returns:
            Tuple ``(max_fscore, precison_at_1)``: the F-score returned by
            ``get_optimal_threshold`` and the value returned by
            ``InformationRetrievalMetrics.log_metrics``.

        The model's session is released through ``close_tf_session`` even
        when one of the steps raises.
        """
        #Load Graph Data
        dataset = model()
        logger = get_logger('RL.Test.VEER.' + str(dataset))

        veer = VEER(model,
                    columns,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])

        try:
            #Train Model
            loss, val_loss = veer.train(max_epochs=params['epochs'])
            logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                        val_loss)

            #Test Model
            result_prob, accuracy = veer.test()
            logger.info("Predict count: %d", len(result_prob))
            logger.info(
                "Sample Prob: %s",
                str([(c, (a, b) in dataset.true_test_links)
                     for (a, b, c) in result_prob[:20]]))
            logger.info("Column Weights: %s", str(veer.get_col_weights()))
            logger.info("Accuracy: %s", str(accuracy))
            logger.info("Sample embeddings: %s",
                        str(veer.get_val_embeddings()[0]))

            #Compute Performance measures
            optimal_threshold, max_fscore = get_optimal_threshold(
                result_prob, dataset.true_test_links, max_threshold=2.0)

            # MultiIndex.from_tuples raises on an empty list, so when no pair
            # passes the threshold the except branch runs. Start from an empty
            # two-level index so the export steps below see "no matches"
            # instead of failing with NameError on an unbound ``result``.
            result = pd.MultiIndex.from_arrays([[], []])
            try:
                params['threshold'] = optimal_threshold
                result = pd.MultiIndex.from_tuples(
                    [(e1, e2) for (e1, e2, d) in result_prob
                     if d <= optimal_threshold])
                log_quality_results(logger, result, dataset.true_test_links,
                                    len(dataset.test_links), params)
            except Exception as e:
                logger.info("Zero Reults")
                logger.error(e)

            #Log MAP, MRR and Hits@K
            ir_metrics = InformationRetrievalMetrics(result_prob,
                                                     dataset.true_test_links)
            precison_at_1 = ir_metrics.log_metrics(logger, params)

            #Write Result Prob to file
            entitiesA = dataset.get_entity_names(dataset.testDataA)
            entitiesB = dataset.get_entity_names(dataset.testDataB)
            # Map each row label (as a string) to its row position.
            index_dictA = {
                str(label): position
                for position, label in enumerate(dataset.testDataA.index)
            }
            index_dictB = {
                str(label): position
                for position, label in enumerate(dataset.testDataB.index)
            }
            # Re-key scored pairs, true links and accepted pairs to row
            # positions before handing them to the export helpers.
            result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p)
                           for (a, b, p) in result_prob]
            true_links = [(index_dictA[str(a)], index_dictB[str(b)])
                          for (a, b) in dataset.true_test_links]
            export_result_prob(dataset, 'veg', str(dataset), 'VEER', entitiesA,
                               result_prob, true_links, entitiesB)

            result = [(index_dictA[str(a)], index_dictB[str(b)])
                      for (a, b) in result]
            export_false_negatives(model, 'veg', str(dataset), 'VEER',
                                   entitiesA, result_prob, true_links, result,
                                   entitiesB)
            export_false_positives(model, 'veg', str(dataset), 'VEER',
                                   entitiesA, result_prob, true_links, result,
                                   entitiesB)
        finally:
            # Release the session on every exit path, not only on success.
            veer.close_tf_session()

        return (max_fscore, precison_at_1)
# Example #22 / 0 -- separator residue from the listing this snippet was taken from; not code.
    def _test_werl(self, model, columns, params):
        """Train WERL on top of pre-trained embeddings and evaluate it.

        The embedding model named by ``params['ea_method']`` is restored
        from its saved file, or trained and saved when restoring fails. Its
        embeddings feed a WERL model, which is trained and then evaluated
        three times: ``werl.test`` (WERL logger), ``werl.test_merl`` (MERL
        logger) and ``werl.test_without_weight`` (NoWT logger).

        Args:
            model: Dataset class. It is called without arguments to load the
                data and is also passed to the graph/embedding constructors
                and to ``WERL``.
            columns: Columns passed to ``VEER`` (VEER method only) and to
                ``WERL``.
            params: Dict providing 'ea_method' (one of TransE, TransH,
                RLTransE, VEER), 'learning_rate', 'margin',
                'regularizer_scale', 'batchSize' and 'epochs' for WERL. Each
                evaluation writes its selected threshold to 'threshold'.

        Returns:
            Tuple ``(max_fscore, precison_at_1)``. ``max_fscore`` is the
            F-score of the ``test_merl`` evaluation; ``precison_at_1`` is
            always None because IR metrics are not computed here.

        Raises:
            ValueError: If ``params['ea_method']`` is not a supported method.

        Every model session is released through ``close_tf_session`` even
        when a step raises.
        """
        #Load Graph Data
        dataset = model()
        logger = get_logger('RL.Test.WERL.' + str(dataset))
        ea_params = self.get_optimal_ea_params(model, params['ea_method'])
        ea_method = params['ea_method']

        if ea_method in (TransE, TransH):
            #ER methods
            graph = Graph_ER(model)
            #Train TransE embedding vectors
            transe = ea_method(
                graph,
                dimension=ea_params['dimension'],
                learning_rate=ea_params['learning_rate'],
                margin=ea_params['margin'],
                regularizer_scale=ea_params['regularizer_scale'],
                batchSize=ea_params['batchSize'],
                neg_rate=ea_params['neg_rate'],
                neg_rel_rate=ea_params['neg_rel_rate'])
            try:
                try:
                    transe.restore_model(
                        self._get_tf_model_filename(dataset, transe))
                except Exception as e:
                    # Restoring failed: train from scratch and save.
                    logger.error(e)
                    loss = transe.train(max_epochs=ea_params['epochs'])
                    logger.info("Training Complete with loss: %f", loss)
                    transe.save_model(
                        self._get_tf_model_filename(dataset, transe))

                ent_embeddings = transe.get_ent_embeddings()
                rel_embeddings = None
                entity = graph.entity
            finally:
                transe.close_tf_session()
        elif ea_method is RLTransE:
            #VEG methods
            graph = Graph_VEG(model)
            #Train TransE embedding vectors
            rltranse = ea_method(
                graph,
                dimension=ea_params['dimension'],
                learning_rate=ea_params['learning_rate'],
                margin=ea_params['margin'],
                regularizer_scale=ea_params['regularizer_scale'],
                batchSize=ea_params['batchSize'],
                neg_rate=ea_params['neg_rate'],
                neg_rel_rate=ea_params['neg_rel_rate'])
            try:
                try:
                    rltranse.restore_model(
                        self._get_tf_model_filename(dataset, rltranse))
                except Exception as e:
                    # Restoring failed: train from scratch and save.
                    logger.error(e)
                    loss, _ = rltranse.train(max_epochs=ea_params['epochs'])
                    logger.info("Training Complete with loss: %f", loss)
                    rltranse.save_model(
                        self._get_tf_model_filename(dataset, rltranse))

                val_embeddings = rltranse.get_val_embeddings()
                rel_embeddings = rltranse.get_rel_embeddings()
                if model == Census:
                    #hack: census veg graph has 8 relations. we need only 6
                    #removing same_as and surname2 embedding
                    #(rows 0 and 3 are dropped).
                    rel_embeddings = np.append(rel_embeddings[1:3],
                                               rel_embeddings[4:],
                                               axis=0)

                # Flatten the per-relation value embeddings into one entity
                # list with a parallel embedding list; only the first
                # len(values) rows of each relation's matrix are kept.
                ent_embeddings = []
                entity = []
                for rel in val_embeddings:
                    rel_values = graph.relation_value_map[rel]
                    entity.extend(rel_values)
                    ent_embeddings.extend(
                        val_embeddings[rel][:len(rel_values)])

                assert len(ent_embeddings) == len(entity)
            finally:
                rltranse.close_tf_session()
        elif ea_method is VEER:
            veer = VEER(model,
                        columns,
                        dimension=ea_params['dimension'],
                        learning_rate=ea_params['learning_rate'],
                        margin=ea_params['margin'],
                        regularizer_scale=ea_params['regularizer_scale'],
                        batchSize=ea_params['batchSize'])
            try:
                try:
                    veer.restore_model(
                        self._get_tf_model_filename(dataset, veer))
                except Exception as e:
                    # Restoring failed: train from scratch and save.
                    logger.error(e)
                    #Train Model
                    loss, val_loss = veer.train(max_epochs=ea_params['epochs'])
                    logger.info("Training Complete with loss: %f, val_loss:%f",
                                loss, val_loss)
                    veer.save_model(self._get_tf_model_filename(dataset, veer))

                ent_embeddings = veer.get_val_embeddings()
                rel_embeddings = None
                entity = veer.get_values()
            finally:
                veer.close_tf_session()
        else:
            raise ValueError("Unknown Entity Alignment method")

        #Train WERL weights
        werl = WERL(model,
                    columns,
                    entity,
                    ent_embeddings,
                    rel_embeddings,
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])

        def evaluate(eval_logger, run_test):
            """Run one WERL test variant, log its quality, return its F-score."""
            result_prob, accuracy = run_test()
            eval_logger.info("Predict count: %d", len(result_prob))
            eval_logger.info(
                "Sample Prob: %s",
                str([(c, (a, b) in dataset.true_test_links)
                     for (a, b, c) in result_prob[:20]]))
            eval_logger.info("Column Weights: %s", str(werl.get_col_weights()))
            eval_logger.info("Accuracy: %s", str(accuracy))

            #Compute Performance measures
            optimal_threshold, fscore = get_optimal_threshold(
                result_prob, dataset.true_test_links, max_threshold=2.0)

            try:
                params['threshold'] = optimal_threshold
                # from_tuples raises when no pair passes the threshold; that
                # case is reported as "Zero Reults" below.
                result = pd.MultiIndex.from_tuples(
                    [(e1, e2) for (e1, e2, d) in result_prob
                     if d <= optimal_threshold])
                log_quality_results(eval_logger, result,
                                    dataset.true_test_links,
                                    len(dataset.test_links), params)
            except Exception as e:
                eval_logger.info("Zero Reults")
                eval_logger.error(e)
            return fscore

        try:
            loss, val_loss = werl.train(max_epochs=params['epochs'])
            logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                        val_loss)

            #Test Model (weighted WERL); its F-score is only logged.
            evaluate(logger, werl.test)

            #Test Model (MERL)
            # NOTE(review): the returned F-score comes from this MERL pass,
            # not from the WERL pass above -- confirm that this is intended.
            max_fscore = evaluate(get_logger('RL.Test.MERL.' + str(dataset)),
                                  werl.test_merl)

            # IR metrics (MAP, MRR, Hits@K) are not computed for WERL, so no
            # precision@1 value is available.
            precison_at_1 = None

            #Test Without Weights = Mean Embedding for Record Linkage
            evaluate(get_logger('RL.Test.NoWT.' + str(dataset)),
                     werl.test_without_weight)
        finally:
            # Release the session on every exit path, not only on success.
            werl.close_tf_session()

        return (max_fscore, precison_at_1)