def _test_transh(self, dataset, params):
        """Train TransH on the ER graph of ``dataset`` and evaluate pair scores.

        Every pair in ``graph.entity_pairs`` is scored with the absolute
        cosine distance between the two entity embeddings. Embeddings and
        scored pairs are exported, the threshold returned by
        ``get_optimal_threshold`` selects the predicted matches, and quality
        and retrieval metrics are logged.

        Args:
            dataset: Callable handed to ``Graph_ER`` and also invoked with no
                arguments; ``str()`` of the resulting object names the logger
                and the export files.
            params: Dict of hyper-parameters. Reads 'dimension',
                'learning_rate', 'margin', 'regularizer_scale', 'batchSize',
                'neg_rate', 'neg_rel_rate' and 'epochs'. The chosen threshold
                is written back under 'threshold'.

        Returns:
            Tuple ``(max_fscore, p_at_1)``; ``p_at_1`` is whatever
            ``InformationRetrievalMetrics.log_metrics`` returns (presumably
            precision@1 -- confirm against that helper).
        """
        graph = Graph_ER(dataset)
        model = dataset()
        logger = get_logger('RL.Test.er.TransH.' + str(model))

        transh = TransH(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transh.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transh.get_ent_embeddings()

        # Score each candidate pair; a smaller distance means "more similar".
        result_prob = []
        for pair in graph.entity_pairs:
            e1, e2 = pair[0], pair[1]
            distance = abs(
                spatial.distance.cosine(ent_embeddings[e1],
                                        ent_embeddings[e2]))
            result_prob.append((e1, e2, distance))

        # Write embeddings and scored pairs to file.
        export_embeddings('er', str(model), 'TransH', graph.entity,
                          ent_embeddings)
        export_result_prob(dataset, 'er', str(model), 'TransH', graph.entity,
                           result_prob, graph.true_pairs)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)

        try:
            logger.info("MAX FSCORE: %f AT : %f", max_fscore,
                        optimal_threshold)
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            params['threshold'] = optimal_threshold
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
            export_false_negatives(dataset, 'er', str(model), 'TransH',
                                   graph.entity, result_prob, graph.true_pairs,
                                   result, graph.entity)
            export_false_positives(dataset, 'er', str(model), 'TransH',
                                   graph.entity, result_prob, graph.true_pairs,
                                   result, graph.entity)
        except Exception as e:
            # Best-effort reporting: keep going, but record why it failed.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            logger.info("Zero Reults")
            logger.error(e)

        # Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        p_at_1 = ir_metrics.log_metrics(logger, params)

        transh.close_tf_session()
        return (max_fscore, p_at_1)
Example #2
0
    def _test_rl_transe(self, dataset, params):
        """Train TransE on the ER graph and score pairs using their neighbours.

        The score of a pair ``(a, b)`` is the absolute cosine distance between
        the embeddings of ``a`` and ``b`` plus, for every triple headed by
        ``a``, the distance between its tail and the first tail of ``b`` that
        shares the same relation (when ``b`` has one).

        Args:
            dataset: Callable handed to ``Graph_ER`` and also invoked with no
                arguments; ``str()`` of the resulting object names the logger
                and the export files.
            params: Dict of hyper-parameters. Reads 'dimension',
                'learning_rate', 'margin', 'regularizer_scale', 'batchSize',
                'neg_rate', 'neg_rel_rate' and 'epochs'. The chosen threshold
                is written back under 'threshold'.

        Returns:
            Tuple ``(max_fscore, precison_at_1)``; the second element is
            whatever ``InformationRetrievalMetrics.log_metrics`` returns.
        """
        #Load Graph Data
        graph = Graph_ER(dataset)
        model = dataset()
        logger = get_logger('RL.Test.er.RLTransE.' + str(model))

        transe = TransE(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transe.get_ent_embeddings()

        # Group (tail, relation) by head once, preserving triple order, so the
        # pair loop does not rescan every triple for every candidate pair.
        edges_by_head = {}
        for (h, t, r) in graph.triples:
            edges_by_head.setdefault(h, []).append((t, r))

        result_prob = []
        for (a, b) in graph.entity_pairs:
            a_edges = edges_by_head.get(a, [])
            b_edges = edges_by_head.get(b, [])

            distance = abs(
                spatial.distance.cosine(ent_embeddings[a], ent_embeddings[b]))
            for (at, ar) in a_edges:
                # Tails of b reached through the same relation; only the
                # first one contributes to the score.
                bt = [t for (t, r) in b_edges if r == ar]
                if bt:
                    distance = distance + abs(
                        spatial.distance.cosine(ent_embeddings[at],
                                                ent_embeddings[bt[0]]))
            result_prob.append((a, b, distance))

        #Write Embeddings to file
        export_embeddings('er', str(model), 'RLTransE', graph.entity,
                          ent_embeddings)
        export_result_prob(dataset, 'er', str(model), 'RLTransE', graph.entity,
                           result_prob, graph.true_pairs)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs, max_threshold=3.0)

        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception as e:
            # Best-effort reporting: record the cause instead of hiding it,
            # and let KeyboardInterrupt / SystemExit propagate.
            logger.info("Zero Reults")
            logger.error(e)

        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        precison_at_1 = ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
        return (max_fscore, precison_at_1)
Example #3
0
    def _test_etranse(self, dataset, params):
        """Train ETransE on the ERER graph of ``dataset`` and evaluate it.

        Each pair in ``graph.entity_pairs`` is scored with the absolute cosine
        distance between the A-side embedding of its first element and the
        B-side embedding of its second element (both cast to ``int`` for the
        lookup). Embeddings and scores are exported and metrics are logged.

        Args:
            dataset: Callable handed to ``Graph_ERER`` and also invoked with
                no arguments; ``str()`` of the result names logger and files.
            params: Dict of hyper-parameters. Reads 'dimension', 'batchSize',
                'learning_rate', 'margin', 'neg_rate', 'neg_rel_rate',
                'regularizer_scale', 'alpha', 'beta' and 'max_epochs'. The
                chosen threshold is written back under 'threshold'.

        Returns:
            Tuple ``(max_fscore, prec_at_1)``; the second element is whatever
            ``InformationRetrievalMetrics.log_metrics`` returns.
        """
        model = dataset()
        graph = Graph_ERER(dataset)
        model_name = str(model)
        logger = get_logger("RL.Test.erer.ETransE." + model_name)

        embedder = ETransE(graph,
                           dimension=params['dimension'],
                           batchSize=params['batchSize'],
                           learning_rate=params['learning_rate'],
                           margin=params['margin'],
                           neg_rate=params['neg_rate'],
                           neg_rel_rate=params['neg_rel_rate'],
                           regularizer_scale=params['regularizer_scale'],
                           alpha=params['alpha'],
                           beta=params['beta'])
        embedder.train(max_epochs=params['max_epochs'])
        emb_a = embedder.get_ent_embeddings_A()
        emb_b = embedder.get_ent_embeddings_B()

        # One (first, second, distance) triple per candidate pair.
        result_prob = [
            (pair[0], pair[1],
             abs(spatial.distance.cosine(emb_a[int(pair[0])],
                                         emb_b[int(pair[1])])))
            for pair in graph.entity_pairs
        ]

        # Persist both embedding tables and the scored pairs.
        export_embeddings('erer', model_name, 'ETransE', graph.entityA, emb_a)
        export_embeddings('erer', model_name, 'ETransE', graph.entityB, emb_b)
        export_result_prob(dataset, 'erer', model_name, 'ETransE',
                           graph.entityA, result_prob, graph.true_pairs,
                           graph.entityB)

        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)

        try:
            params['threshold'] = optimal_threshold
            accepted = [(first, second)
                        for (first, second, dist) in result_prob
                        if dist <= optimal_threshold]
            result = pd.MultiIndex.from_tuples(accepted)
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception as e:
            logger.info("Zero Reults")
            logger.error(e)

        # MAP, MRR and Hits@K are logged by the metrics helper.
        metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        prec_at_1 = metrics.log_metrics(logger, params)

        embedder.close_tf_session()
        return (max_fscore, prec_at_1)
Example #4
0
    def _test_erer(self, dataset, er_algo, params):
        """Train an ER embedding model on the ER view of an ERER graph.

        The ERER graph built from ``dataset`` is converted with
        ``get_er_model()``, ``er_algo`` is trained on it, and every pair in
        ``graph_er.entity_pairs`` is scored with the absolute cosine distance
        between the two entity embeddings.

        Args:
            dataset: Callable handed to ``Graph_ERER`` and also invoked with
                no arguments; ``str()`` of the result names logger and files.
            er_algo: Callable building the embedding model; it is called with
                the ER graph plus the keyword hyper-parameters below.
            params: Dict of hyper-parameters. Reads 'dimension',
                'learning_rate', 'margin', 'regularizer_scale', 'batchSize',
                'neg_rate', 'neg_rel_rate' and 'epochs'. The chosen threshold
                is written back under 'threshold'.

        Returns:
            ``max_fscore`` as returned by ``get_optimal_threshold``.
        """
        model = dataset()
        graph = Graph_ERER(dataset)
        graph_er = graph.get_er_model()

        er_model = er_algo(graph_er,
                           dimension=params['dimension'],
                           learning_rate=params['learning_rate'],
                           margin=params['margin'],
                           regularizer_scale=params['regularizer_scale'],
                           batchSize=params['batchSize'],
                           neg_rate=params['neg_rate'],
                           neg_rel_rate=params['neg_rel_rate'])
        loss = er_model.train(max_epochs=params['epochs'])

        # The logger name includes str(er_model), so it is created only after
        # the model exists.
        logger = get_logger('RL.Test.erer.ERER.' + str(model) + "." +
                            str(er_model))
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = er_model.get_ent_embeddings()
        result_prob = []
        for pair in graph_er.entity_pairs:
            e1, e2 = pair[0], pair[1]
            distance = abs(
                spatial.distance.cosine(ent_embeddings[e1],
                                        ent_embeddings[e2]))
            result_prob.append((e1, e2, distance))

        #Write Embeddings to file
        export_embeddings("erer", str(model), str(er_model), graph_er.entity,
                          ent_embeddings)
        export_result_prob(dataset, 'erer', str(model), str(er_model),
                           graph_er.entity, result_prob, graph_er.true_pairs)

        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph_er.true_pairs)

        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph_er.true_pairs,
                                len(graph_er.entity_pairs), params)
        except Exception as e:
            # Best-effort reporting: record the cause instead of hiding it,
            # and let KeyboardInterrupt / SystemExit propagate.
            logger.info("Zero Reults")
            logger.error(e)

        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 graph_er.true_pairs)
        # NOTE(review): sibling tests pass ``params`` as a second argument
        # here; confirm whether omitting it is intentional.
        ir_metrics.log_metrics(logger)

        er_model.close_tf_session()
        return max_fscore
Example #5
0
    def _test_veer(self, model, columns, params):
        """Train and test a VEER model, then log and export its results.

        Args:
            model: Dataset callable; passed to ``VEER`` and to the
                false-negative / false-positive exporters, and invoked with
                no arguments to obtain the dataset instance used here.
            columns: Forwarded unchanged to ``VEER``.
            params: Dict of hyper-parameters. Reads 'dimension',
                'learning_rate', 'margin', 'regularizer_scale', 'batchSize'
                and 'epochs'. The chosen threshold is written back under
                'threshold'.

        Returns:
            Tuple ``(max_fscore, precison_at_1)``; the second element is
            whatever ``InformationRetrievalMetrics.log_metrics`` returns.
        """
        #Load Graph Data
        dataset = model()
        logger = get_logger('RL.Test.VEER.' + str(dataset))

        veer = VEER(model,
                    columns,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])

        #Train Model
        loss, val_loss = veer.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                    val_loss)

        #Test Model
        result_prob, accuracy = veer.test()
        logger.info("Predict count: %d", len(result_prob))
        logger.info(
            "Sample Prob: %s",
            str([(c, (a, b) in dataset.true_test_links)
                 for (a, b, c) in result_prob[:20]]))
        logger.info("Column Weights: %s", str(veer.get_col_weights()))
        logger.info("Accuracy: %s", str(accuracy))
        logger.info("Sample embeddings: %s", str(veer.get_val_embeddings()[0]))

        #Compute Performance measures
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, dataset.true_test_links, max_threshold=2.0)

        # Bound up front: the export section below reads ``result`` even when
        # the try block fails (e.g. no pair falls under the threshold, which
        # makes MultiIndex.from_tuples raise on an empty list).
        result = []
        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, dataset.true_test_links,
                                len(dataset.test_links), params)
        except Exception as e:
            logger.info("Zero Reults")
            logger.error(e)

        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 dataset.true_test_links)
        precison_at_1 = ir_metrics.log_metrics(logger, params)

        #Write Result Prob to file
        entitiesA = dataset.get_entity_names(dataset.testDataA)
        entitiesB = dataset.get_entity_names(dataset.testDataB)
        # Map str(index label) -> row position for each test frame, so the
        # label-based links can be rewritten as positions for the exporters.
        index_dictA = {
            str(label): pos
            for pos, label in enumerate(dataset.testDataA.index)
        }
        index_dictB = {
            str(label): pos
            for pos, label in enumerate(dataset.testDataB.index)
        }
        result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p)
                       for (a, b, p) in result_prob]
        true_links = [(index_dictA[str(a)], index_dictB[str(b)])
                      for (a, b) in dataset.true_test_links]
        # NOTE(review): this call passes the dataset instance while the two
        # exporters below pass ``model``; confirm which one is expected.
        export_result_prob(dataset, 'veg', str(dataset), 'VEER', entitiesA,
                           result_prob, true_links, entitiesB)

        result = [(index_dictA[str(a)], index_dictB[str(b)])
                  for (a, b) in result]
        export_false_negatives(model, 'veg', str(dataset), 'VEER', entitiesA,
                               result_prob, true_links, result, entitiesB)
        export_false_positives(model, 'veg', str(dataset), 'VEER', entitiesA,
                               result_prob, true_links, result, entitiesB)

        veer.close_tf_session()
        return (max_fscore, precison_at_1)
    def test_cora(self, params=None):
        """Run the TransE "household" matching experiment on Cora.

        For each candidate pair, the tails of the 'author' relation of both
        entities are collected, a cost matrix of absolute cosine distances
        between those tails is built, and ``linear_sum_assignment`` picks the
        cheapest one-to-one alignment. The pair score is the alignment cost
        plus the distance between the two entities themselves.

        Args:
            params: Dict of hyper-parameters; when falsy,
                ``self.get_default_params()`` is used. Reads 'dimension',
                'learning_rate', 'margin', 'regularizer_scale', 'batchSize',
                'neg_rate', 'neg_rel_rate' and 'epochs'. The chosen threshold
                is written back under 'threshold'.

        Returns:
            ``max_fscore`` as returned by ``get_optimal_threshold``.
        """
        if not params:
            params = self.get_default_params()

        #Load Graph Data
        graph = Graph_ER(Cora)
        model = Cora()
        logger = get_logger('RL.Test.TransE.Household.' + str(model))

        transe = TransE(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transe.get_ent_embeddings()

        #Experimenting household matching
        auth_rel_index = graph.relation.index('author')

        # Collect the 'author' tails of every head once (triple order kept)
        # instead of rescanning all triples for each candidate pair.
        authors_by_head = {}
        for (h, t, r) in graph.triples:
            if r == auth_rel_index:
                authors_by_head.setdefault(h, []).append(t)

        result_prob = []
        for ep_index, pair in enumerate(graph.entity_pairs):
            entity_a, entity_b = pair[0], pair[1]
            authors_A = authors_by_head.get(entity_a, [])
            authors_B = authors_by_head.get(entity_b, [])

            cost_matrix = np.zeros(shape=(len(authors_A), len(authors_B)))
            for i, author_a in enumerate(authors_A):
                for j, author_b in enumerate(authors_B):
                    cost_matrix[i][j] = abs(
                        spatial.distance.cosine(ent_embeddings[author_a],
                                                ent_embeddings[author_b]))

            # Minimum-cost one-to-one alignment of the two author lists.
            row_ind, col_ind = linear_sum_assignment(cost_matrix)
            distance = cost_matrix[row_ind, col_ind].sum() + abs(
                spatial.distance.cosine(ent_embeddings[entity_a],
                                        ent_embeddings[entity_b]))
            result_prob.append((entity_a, entity_b, distance))
            if distance <= 0.05:
                logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                            distance, pair in graph.true_pairs)

        export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                          ent_embeddings)
        export_result_prob(Cora, 'er', str(model), 'TransE.Household',
                           graph.entity, result_prob, graph.true_pairs)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)
        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception as e:
            # Best-effort reporting: record the cause instead of hiding it,
            # and let KeyboardInterrupt / SystemExit propagate.
            logger.info("Zero Reults")
            logger.error(e)

        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
        return max_fscore
    def test_febrl(self, params=None):
        """Run the TransE "household" matching experiment on FEBRL.

        For each candidate pair, the first 'postcode' tail of each person is
        looked up and every head of a triple pointing at that tail is treated
        as a neighbour. A cost matrix over the two neighbour lists is built
        (cost 100 when both sides are the same entity, absolute cosine
        distance otherwise) and ``linear_sum_assignment`` picks the cheapest
        alignment. The pair score is the alignment cost plus the matrix entry
        for the pair itself.

        Args:
            params: Dict of hyper-parameters; when falsy,
                ``self.get_default_params()`` is used. Reads 'dimension',
                'learning_rate', 'margin', 'regularizer_scale', 'batchSize',
                'neg_rate', 'neg_rel_rate' and 'epochs'. The chosen threshold
                is written back under 'threshold'.

        Returns:
            ``max_fscore`` as returned by ``get_optimal_threshold``.

        Raises:
            IndexError: If a person in a candidate pair has no 'postcode'
                triple.
        """
        if not params:
            params = self.get_default_params()

        #Load Graph Data
        graph = Graph_ER(FEBRL)
        model = FEBRL()
        logger = get_logger('RL.Test.TransE.Household.' + str(model))

        transe = TransE(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transe.get_ent_embeddings()

        #Experimenting household matching
        postcode_rel_id = graph.relation.index("postcode")

        # Index the triples once (order preserved) so the pair loop does not
        # rescan all of them for every candidate pair.
        edges_by_head = {}
        heads_by_tail = {}
        for (h, t, r) in graph.triples:
            edges_by_head.setdefault(h, []).append((t, r))
            heads_by_tail.setdefault(t, []).append(h)

        result_prob = []
        for pair in graph.entity_pairs:
            person_A = pair[0]
            person_B = pair[1]

            postcode_A = [
                t for (t, r) in edges_by_head.get(person_A, [])
                if r == postcode_rel_id
            ][0]
            neighbours_A = heads_by_tail[postcode_A]
            postcode_B = [
                t for (t, r) in edges_by_head.get(person_B, [])
                if r == postcode_rel_id
            ][0]
            neighbours_B = heads_by_tail[postcode_B]

            cost_matrix = np.zeros(shape=(len(neighbours_A),
                                          len(neighbours_B)))
            # Dedicated loop names: reusing the pair-loop index here used to
            # clobber it and record the wrong entity pair below.
            for row, member_a in enumerate(neighbours_A):
                for col, member_b in enumerate(neighbours_B):
                    if member_a == member_b:
                        cost_matrix[row][col] = 100
                    else:
                        cost_matrix[row][col] = abs(
                            spatial.distance.cosine(
                                ent_embeddings[member_a],
                                ent_embeddings[member_b]))

            row_ind, col_ind = linear_sum_assignment(cost_matrix)

            person_A_index = neighbours_A.index(person_A)
            person_B_index = neighbours_B.index(person_B)
            distance = cost_matrix[row_ind, col_ind].sum(
            ) + cost_matrix[person_A_index][person_B_index]

            result_prob.append((person_A, person_B, distance))

        export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                          ent_embeddings)
        export_result_prob(FEBRL, 'er', str(model), 'TransE.Household',
                           graph.entity, result_prob, graph.true_pairs)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)
        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception as e:
            # Best-effort reporting: record the cause instead of hiding it,
            # and let KeyboardInterrupt / SystemExit propagate.
            logger.info("Zero Reults")
            logger.error(e)

        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
        return max_fscore
    def test_census(self, params=None):
        """Run the TransE "household" matching experiment on Census.

        For each candidate pair, the household of each entity is the first
        tail reached through a relation id greater than 6, and every head of a
        triple pointing at that tail is treated as a family member. A cost
        matrix of absolute cosine distances between the two families is built
        and ``linear_sum_assignment`` picks the cheapest alignment. The pair
        score is the alignment cost, plus the distance between the two
        entities themselves when the alignment did not match them to each
        other. Scores are finally divided by a fixed constant of 10.

        Args:
            params: Dict of hyper-parameters; when falsy,
                ``self.get_default_params()`` is used. Reads 'dimension',
                'learning_rate', 'margin', 'regularizer_scale', 'batchSize',
                'neg_rate', 'neg_rel_rate' and 'epochs'. The chosen threshold
                is written back under 'threshold'.

        Returns:
            ``max_fscore`` as returned by ``get_optimal_threshold``.

        Raises:
            IndexError: If an entity in a candidate pair has no triple with a
                relation id greater than 6.
        """
        if not params:
            params = self.get_default_params()

        #Load Graph Data
        graph = Graph_ER(Census)
        model = Census()
        logger = get_logger('RL.Test.TransE.Household.' + str(model))

        transe = TransE(graph,
                        dimension=params['dimension'],
                        learning_rate=params['learning_rate'],
                        margin=params['margin'],
                        regularizer_scale=params['regularizer_scale'],
                        batchSize=params['batchSize'],
                        neg_rate=params['neg_rate'],
                        neg_rel_rate=params['neg_rel_rate'])
        loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f", loss)

        ent_embeddings = transe.get_ent_embeddings()

        # Index the triples once (order preserved) so the pair loop does not
        # rescan all of them for every candidate pair.
        edges_by_head = {}
        heads_by_tail = {}
        for (h, t, r) in graph.triples:
            edges_by_head.setdefault(h, []).append((t, r))
            heads_by_tail.setdefault(t, []).append(h)

        #Experimenting household matching
        result_prob = []
        for ep_index, pair in enumerate(graph.entity_pairs):
            entity_a, entity_b = pair[0], pair[1]

            # Relation ids above 6 presumably denote the household link --
            # TODO confirm against the Census graph's relation list.
            household_A = [
                t for (t, r) in edges_by_head.get(entity_a, []) if r > 6
            ][0]
            family_members_A = heads_by_tail[household_A]
            household_B = [
                t for (t, r) in edges_by_head.get(entity_b, []) if r > 6
            ][0]
            family_members_B = heads_by_tail[household_B]

            cost_matrix = np.zeros(shape=(len(family_members_A),
                                          len(family_members_B)))
            for i, member_a in enumerate(family_members_A):
                for j, member_b in enumerate(family_members_B):
                    cost_matrix[i][j] = abs(
                        spatial.distance.cosine(ent_embeddings[member_a],
                                                ent_embeddings[member_b]))

            row_ind, col_ind = linear_sum_assignment(cost_matrix)

            eA_index = family_members_A.index(entity_a)
            eB_index = family_members_B.index(entity_b)

            # Was entity_a assigned to entity_b by the optimal alignment?
            rowA = np.where(row_ind == eA_index)[0]
            if len(rowA) and col_ind[rowA[0]] == eB_index:
                distance = cost_matrix[row_ind, col_ind].sum()
            else:
                # Not aligned with each other: add their direct distance.
                distance = cost_matrix[row_ind, col_ind].sum() + abs(
                    spatial.distance.cosine(ent_embeddings[entity_a],
                                            ent_embeddings[entity_b]))

            # Record the pair being scored. The previous code indexed
            # entity_pairs with the cost-matrix loop variable ``i`` and so
            # stored an unrelated pair.
            result_prob.append((entity_a, entity_b, distance))
            if ep_index % 1000 == 0:
                logger.info("i: %d, distance: %f true_pairs: %s", ep_index,
                            distance, pair in graph.true_pairs)

        #Normalize distance
        # Fixed divisor rather than the observed maximum distance.
        max_distance = 10
        result_prob = [(r[0], r[1], (r[2] / max_distance))
                       for r in result_prob]

        for r in result_prob[:100]:
            logger.info("distance: %f true_pairs: %s", r[2], (r[0], r[1])
                        in graph.true_pairs)
        export_embeddings('er', str(model), 'TransE.Household', graph.entity,
                          ent_embeddings)
        export_result_prob(Census, 'er', str(model), 'TransE.Household',
                           graph.entity, result_prob, graph.true_pairs)
        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, graph.true_pairs)
        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, graph.true_pairs,
                                len(graph.entity_pairs), params)
        except Exception as e:
            # Best-effort reporting: record the cause instead of hiding it,
            # and let KeyboardInterrupt / SystemExit propagate.
            logger.info("Zero Reults")
            logger.error(e)

        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob, graph.true_pairs)
        ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()
        return max_fscore
Example #9
0
    def _test_rl_transe(self, model, field_relation_map, params):
        dataset = model()
        graph = Graph_VEG(model)
        logger = get_logger("RL.Test.RLTransE." + str(dataset))
        logger.info("values for name : %s",
                    str(graph.relation_value_map[graph.relation[1]][:10]))
        logger.info("relation: %s", str(graph.relation))
        logger.info("train_triples: %s", str(graph.train_triples[:10]))
        logger.info("set train_triples size %d", len(set(graph.train_triples)))

        transe = RLTransE(graph,
                          dimension=params['dimension'],
                          learning_rate=params['learning_rate'],
                          margin=params['margin'],
                          regularizer_scale=params['regularizer_scale'],
                          batchSize=params['batchSize'],
                          neg_rate=params['neg_rate'],
                          neg_rel_rate=params['neg_rel_rate'])
        loss, val_loss = transe.train(max_epochs=params['epochs'])
        logger.info("Training Complete with loss: %f val_loss: %f", loss,
                    val_loss)

        value_embeddings = transe.get_val_embeddings()
        relation_embeddings = transe.get_rel_embeddings()

        result_prob = []
        distance_distribution = []
        missing_values = []
        for (a, b) in dataset.test_links:
            row_a = dataset.testDataA.loc[a]
            row_b = dataset.testDataB.loc[b]

            distance = 0
            dd = []
            for f in field_relation_map:
                val_a = row_a[f]
                val_b = row_b[f]
                if val_a == val_b:
                    dd.append(0)
                else:
                    rel = field_relation_map[f]
                    try:
                        val_index_a = graph.relation_value_map[rel].index(
                            val_a)
                    except ValueError:
                        missing_values.append(val_a)
                        distance = distance + 1
                        dd.append(1)
                        continue
                    try:
                        val_index_b = graph.relation_value_map[rel].index(
                            val_b)
                    except ValueError:
                        missing_values.append(val_b)
                        distance = distance + 1
                        dd.append(1)
                        continue
                    rel_index = graph.relation.index(field_relation_map[f])

                    cur_distance = abs(
                        spatial.distance.cosine(
                            value_embeddings[rel][val_index_a] +
                            relation_embeddings[rel_index],
                            value_embeddings[rel][val_index_b]))
                    distance = distance + cur_distance
                    dd.append(cur_distance)

            result_prob.append((a, b, distance))
            distance_distribution.append((a, b, dd, distance))
            #logger.info("a: %d, b: %d distance: %f true_pairs: %s", a, b, distance, (a,b) in dataset.true_test_links)
        logger.info("No. of missing values: %d", len(missing_values))
        logger.info("Unique No. of missing values: %d",
                    len(set(missing_values)))
        try:
            entities = ["value\trelation"]
            for r in graph.relation_value_map:
                for v in graph.relation_value_map[r]:
                    entities.append("\t".join([v, r]))

            embeddings = []
            for rel in value_embeddings:
                val_count = len(graph.relation_value_map[rel])
                embeddings.extend(value_embeddings[rel][:val_count])

            #Write Embeddings to file
            export_embeddings('veg', str(dataset), 'RLTransE_val', entities,
                              embeddings)
            export_embeddings('veg', str(dataset), 'RLTransE_rel',
                              graph.relation, relation_embeddings)
        except Exception as e:
            logger.error("Failed to export embeddings")
            logger.error(e)

        optimal_threshold, max_fscore = get_optimal_threshold(
            result_prob, dataset.true_test_links, max_threshold=3.0, step=0.02)

        try:
            params['threshold'] = optimal_threshold
            result = pd.MultiIndex.from_tuples([(e1, e2)
                                                for (e1, e2, d) in result_prob
                                                if d <= optimal_threshold])
            log_quality_results(logger, result, dataset.true_test_links,
                                len(dataset.test_links), params)
        except:
            logger.info("Zero Reults")

        #Log MAP, MRR and Hits@K
        ir_metrics = InformationRetrievalMetrics(result_prob,
                                                 dataset.true_test_links)
        precison_at_1 = ir_metrics.log_metrics(logger, params)

        transe.close_tf_session()

        #Export False Positives and result porobabilities
        get_entity_name = lambda d, i: "_".join([
            str(d.iloc[i][dataset.field_map[CensusFields.ID_INDIVIDUAL]]),
            str(d.iloc[i][dataset.field_map[CensusFields.DNI]])
        ])
        get_entity_name_loc = lambda d, i: "_".join([
            str(d.loc[i][dataset.field_map[CensusFields.ID_INDIVIDUAL]]),
            str(d.loc[i][dataset.field_map[CensusFields.DNI]])
        ])
        entitiesA = [
            get_entity_name(dataset.testDataA, i)
            for i in range(int(dataset.testDataA.shape[0]))
        ]
        entitiesB = [
            get_entity_name(dataset.testDataB, i)
            for i in range(int(dataset.testDataB.shape[0]))
        ]
        result_prob = [
            (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
             entitiesB.index(get_entity_name_loc(dataset.testDataB,
                                                 int(b))), p)
            for (a, b, p) in result_prob
        ]
        true_links = [
            (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
             entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))))
            for (a, b) in dataset.true_test_links
        ]
        export_result_prob(Census, 'veg', 'census', 'rltranse', entitiesA,
                           result_prob, true_links, entitiesB)

        distance_distribution = [
            (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
             entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))),
             [str("%.2f" % (float(w))) for w in dd], 1 - d)
            for (e1, e2, dd, d) in distance_distribution if (e1, e2) in result
        ]
        export_human_readable_results(Census, 'veg', 'census', 'rltranse',
                                      entitiesA, distance_distribution,
                                      entitiesB)

        result = [
            (entitiesA.index(get_entity_name_loc(dataset.testDataA, int(a))),
             entitiesB.index(get_entity_name_loc(dataset.testDataB, int(b))))
            for (a, b) in result
        ]
        export_false_negatives(Census, 'veg', 'census', 'rltranse', entitiesA,
                               result_prob, true_links, result, entitiesB)
        export_false_positives(Census, 'veg', 'census', 'rltranse', entitiesA,
                               result_prob, true_links, result, entitiesB)

        return (max_fscore, precison_at_1)
    def test_veer(self):
        """Train VEER on the Census dataset, evaluate it and export the results.

        The method trains a ``VEER`` model, scores the candidate pairs with
        ``veer.test()``, asks ``get_optimal_threshold`` for a threshold, logs
        quality and ranking metrics, and finally exports the embeddings, the
        scored pairs, the false negatives / positives and a per-column
        distance breakdown of the accepted pairs.

        ``veer.close_tf_session()`` is always called, even when a step raises.
        """
        logger = get_logger('RL.Test.VEER.Census')

        dataset = Census()

        #Columns of interest for Sant Feliu town
        columns = [
            'Noms_harmo', 'cognom_1', 'cohort', 'estat_civil', 'parentesc_har',
            'ocupacio_hisco'
        ]
        params = {
            'learning_rate': 0.1,
            'margin': 0.1,
            'dimension': 32,
            'epochs': 50,
            'regularizer_scale': 0.1,
            'batchSize': 512
        }

        veer = VEER(Census,
                    columns,
                    dimension=params['dimension'],
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])

        try:
            #Train Model
            loss, val_loss = veer.train(max_epochs=params['epochs'])
            logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                        val_loss)

            #Test Model
            result_prob, accuracy = veer.test()
            logger.info("Predict count: %d", len(result_prob))
            logger.info(
                "Sample Prob: %s",
                str([(c, (a, b) in dataset.true_test_links)
                     for (a, b, c) in result_prob[:20]]))
            logger.info("Column Weights: %s", str(veer.get_col_weights()))
            logger.info("Accuracy: %s", str(accuracy))
            logger.info("Sample embeddings: %s",
                        str(veer.get_val_embeddings()[0]))

            #Compute Performance measures
            optimal_threshold, _ = get_optimal_threshold(
                result_prob, dataset.true_test_links, max_threshold=2.0)

            # Start from an empty selection so the export steps below still
            # work when building the MultiIndex fails (the "Zero Reults" path).
            result = []
            try:
                params['threshold'] = optimal_threshold
                result = pd.MultiIndex.from_tuples(
                    [(e1, e2) for (e1, e2, d) in result_prob
                     if d <= optimal_threshold])
                log_quality_results(logger, result, dataset.true_test_links,
                                    len(dataset.test_links), params)
            except Exception as e:
                logger.info("Zero Reults")
                logger.error(e)

            #Log MAP, MRR and Hits@K
            ir_metrics = InformationRetrievalMetrics(result_prob,
                                                     dataset.true_test_links)
            ir_metrics.log_metrics(logger, params)

            #Export embeddings
            embeddings = veer.get_val_embeddings()
            export_embeddings('veg', 'census', 'veer', veer.values, embeddings)

            def column_distances(e1, e2):
                """Return, per column, the cosine distance between the value
                embeddings of record e1 (test set A) and e2 (test set B)."""
                row_a = dataset.testDataA.loc[e1]
                row_b = dataset.testDataB.loc[e2]
                return [
                    str(
                        abs(
                            spatial.distance.cosine(
                                embeddings[veer.values.index(
                                    veer._clean(row_a[c]))],
                                embeddings[veer.values.index(
                                    veer._clean(row_b[c]))])))
                    for c in columns
                ]

            #Write Result Prob to file
            # Built before result_prob is re-indexed: membership in `result`
            # is checked with the original record labels.
            result_feature_mapping = [(e1, e2, column_distances(e1, e2), d)
                                      for (e1, e2, d) in result_prob
                                      if (e1, e2) in result]

            entitiesA = dataset.get_entity_names(dataset.testDataA)
            entitiesB = dataset.get_entity_names(dataset.testDataB)
            # Map each record label (as a string) to its row position.
            index_dictA = {
                str(label): position
                for position, label in enumerate(dataset.testDataA.index)
            }
            index_dictB = {
                str(label): position
                for position, label in enumerate(dataset.testDataB.index)
            }
            result_prob = [(index_dictA[str(a)], index_dictB[str(b)], p)
                           for (a, b, p) in result_prob]
            # NOTE(review): result_prob now holds row positions, while
            # dataset.true_test_links and result are passed on with their
            # original labels - confirm the export helpers expect that.
            # The dataset class (not the instance) is passed, as the other
            # export calls do.
            export_result_prob(Census, 'veg', str(dataset), 'VEER', entitiesA,
                               result_prob, dataset.true_test_links, entitiesB)
            export_false_negatives(Census, 'veg', str(dataset), 'VEER',
                                   entitiesA, result_prob,
                                   dataset.true_test_links, result, entitiesB)
            export_false_positives(Census, 'veg', str(dataset), 'VEER',
                                   entitiesA, result_prob,
                                   dataset.true_test_links, result, entitiesB)

            result_feature_mapping = [
                (index_dictA[str(a)], index_dictB[str(b)], w, p)
                for (a, b, w, p) in result_feature_mapping
            ]
            export_human_readable_results(Census, 'veg', str(dataset), 'VEER',
                                          entitiesA, result_feature_mapping,
                                          entitiesB)
        finally:
            # Release the model's session even if training/evaluation failed.
            veer.close_tf_session()
    # Example #11
    # 0
    def _test_werl(self, model, columns, params):
        """Train WERL on top of a pre-trained embedding model and evaluate it.

        The embedding model named by ``params['ea_method']`` is restored from
        disk, or trained and saved when restoring fails. Its embeddings feed a
        ``WERL`` model, which is then evaluated three times: ``werl.test()``,
        ``werl.test_merl()`` and ``werl.test_without_weight()``.

        Args:
            model: dataset class; instantiated here and also passed to the
                graph / model constructors.
            columns: columns handed to ``VEER`` and ``WERL``.
            params: dict read for 'ea_method', 'learning_rate', 'margin',
                'regularizer_scale', 'batchSize' and 'epochs'. Its
                'threshold' entry is overwritten by every evaluation pass.

        Returns:
            ``(max_fscore, precison_at_1)``: ``max_fscore`` is the value
            obtained for the ``werl.test_merl()`` pass and ``precison_at_1``
            is always ``None`` (the IR metrics are commented out).

        Raises:
            Exception: if ``params['ea_method']`` is not TransE, TransH,
                RLTransE or VEER.
        """
        #Load Graph Data
        dataset = model()
        logger = get_logger('RL.Test.WERL.' + str(dataset))
        ea_method = params['ea_method']
        ea_params = self.get_optimal_ea_params(model, ea_method)
        if ea_method in [TransE, TransH]:
            #ER methods
            graph = Graph_ER(model)
            #Train TransE embedding vectors
            transe = ea_method(
                graph,
                dimension=ea_params['dimension'],
                learning_rate=ea_params['learning_rate'],
                margin=ea_params['margin'],
                regularizer_scale=ea_params['regularizer_scale'],
                batchSize=ea_params['batchSize'],
                neg_rate=ea_params['neg_rate'],
                neg_rel_rate=ea_params['neg_rel_rate'])
            try:
                try:
                    #raise Exception("Reset")
                    transe.restore_model(
                        self._get_tf_model_filename(dataset, transe))
                except Exception as e:
                    # No usable saved model: train from scratch and save it.
                    logger.error(e)
                    loss = transe.train(max_epochs=ea_params['epochs'])
                    logger.info("Training Complete with loss: %f", loss)
                    transe.save_model(
                        self._get_tf_model_filename(dataset, transe))

                ent_embeddings = transe.get_ent_embeddings()
                rel_embeddings = None
                entity = graph.entity
            finally:
                transe.close_tf_session()
        elif ea_method in [RLTransE]:
            #VEG methods
            graph = Graph_VEG(model)
            #Train TransE embedding vectors
            rltranse = ea_method(
                graph,
                dimension=ea_params['dimension'],
                learning_rate=ea_params['learning_rate'],
                margin=ea_params['margin'],
                regularizer_scale=ea_params['regularizer_scale'],
                batchSize=ea_params['batchSize'],
                neg_rate=ea_params['neg_rate'],
                neg_rel_rate=ea_params['neg_rel_rate'])

            try:
                try:
                    #raise Exception("Reset")
                    rltranse.restore_model(
                        self._get_tf_model_filename(dataset, rltranse))
                except Exception as e:
                    # No usable saved model: train from scratch and save it.
                    logger.error(e)
                    loss, _ = rltranse.train(max_epochs=ea_params['epochs'])
                    logger.info("Training Complete with loss: %f", loss)
                    rltranse.save_model(
                        self._get_tf_model_filename(dataset, rltranse))

                val_embeddings = rltranse.get_val_embeddings()
                rel_embeddings = rltranse.get_rel_embeddings()
                if model == Census:
                    #hack: census veg graph has 8 relations. we need only 6
                    #removing same_as and surname2 embedding.
                    rel_embeddings = np.append(rel_embeddings[1:3],
                                               rel_embeddings[4:],
                                               axis=0)
                ent_embeddings = []
                entity = []

                # Flatten the per-relation values and their embeddings into
                # two parallel lists.
                for rel in val_embeddings:
                    val_count = len(graph.relation_value_map[rel])
                    entity.extend(graph.relation_value_map[rel])
                    ent_embeddings.extend(val_embeddings[rel][:val_count])

                assert len(ent_embeddings) == len(entity)
            finally:
                rltranse.close_tf_session()
        elif ea_method in [VEER]:
            veer = VEER(model,
                        columns,
                        dimension=ea_params['dimension'],
                        learning_rate=ea_params['learning_rate'],
                        margin=ea_params['margin'],
                        regularizer_scale=ea_params['regularizer_scale'],
                        batchSize=ea_params['batchSize'])
            try:
                try:
                    veer.restore_model(
                        self._get_tf_model_filename(dataset, veer))
                except Exception as e:
                    logger.error(e)
                    #Train Model
                    loss, val_loss = veer.train(max_epochs=ea_params['epochs'])
                    logger.info("Training Complete with loss: %f, val_loss:%f",
                                loss, val_loss)
                    veer.save_model(self._get_tf_model_filename(dataset, veer))

                ent_embeddings = veer.get_val_embeddings()
                rel_embeddings = None
                entity = veer.get_values()
            finally:
                veer.close_tf_session()
        else:
            raise Exception("Unknown Entity Alignment method")

        #Train WERL weights
        werl = WERL(model,
                    columns,
                    entity,
                    ent_embeddings,
                    rel_embeddings,
                    learning_rate=params['learning_rate'],
                    margin=params['margin'],
                    regularizer_scale=params['regularizer_scale'],
                    batchSize=params['batchSize'])

        def evaluate(stage_logger, run_test):
            """Run one test pass, log its results and return its max F-score."""
            result_prob, accuracy = run_test()
            stage_logger.info("Predict count: %d", len(result_prob))
            stage_logger.info(
                "Sample Prob: %s",
                str([(c, (a, b) in dataset.true_test_links)
                     for (a, b, c) in result_prob[:20]]))
            stage_logger.info("Column Weights: %s",
                              str(werl.get_col_weights()))
            stage_logger.info("Accuracy: %s", str(accuracy))

            #Compute Performance measures
            optimal_threshold, stage_fscore = get_optimal_threshold(
                result_prob, dataset.true_test_links, max_threshold=2.0)

            try:
                params['threshold'] = optimal_threshold
                result = pd.MultiIndex.from_tuples(
                    [(e1, e2) for (e1, e2, d) in result_prob
                     if d <= optimal_threshold])
                log_quality_results(stage_logger, result,
                                    dataset.true_test_links,
                                    len(dataset.test_links), params)
            except Exception as e:
                stage_logger.info("Zero Reults")
                stage_logger.error(e)
            return stage_fscore

        try:
            loss, val_loss = werl.train(max_epochs=params['epochs'])
            logger.info("Training Complete with loss: %f, val_loss:%f", loss,
                        val_loss)

            #Test Model
            evaluate(logger, werl.test)

            #Test Model
            # NOTE(review): the returned F-score is the one of this MERL pass,
            # not of the WERL pass above - confirm that is intended.
            logger = get_logger('RL.Test.MERL.' + str(dataset))
            max_fscore = evaluate(logger, werl.test_merl)

            #Log MAP, MRR and Hits@K
            #ir_metrics = InformationRetrievalMetrics(result_prob, dataset.true_test_links)
            precison_at_1 = None  #ir_metrics.log_metrics(logger, params)

            #Test Without Weights = Mean Emebedding for Record Linkage
            logger = get_logger('RL.Test.NoWT.' + str(dataset))
            evaluate(logger, werl.test_without_weight)

            #Log MAP, MRR and Hits@K
            #ir_metrics = InformationRetrievalMetrics(result_prob, dataset.true_test_links)
            #nowt_precison_at_1 = ir_metrics.log_metrics(logger, params)
        finally:
            # Release the WERL session even if training/evaluation failed.
            werl.close_tf_session()

        return (max_fscore, precison_at_1)