Ejemplo n.º 1
0
    def __init__(self,
                 kg1: KG,
                 kg2: KG,
                 train_links,
                 test_links,
                 valid_links=None,
                 mode='mapping',
                 ordered=True):
        """Build id-based versions of two KGs and of their alignment links.

        Id dictionaries are generated for the entities, relations and
        attributes of both KGs, the relation/attribute triples are converted
        with them, and two new ``KG`` objects are built from the converted
        triples. The KGs passed in are kept as ``self.uri_kg1`` /
        ``self.uri_kg2``; the rebuilt ones are stored as ``self.kg1`` /
        ``self.kg2``.

        Args:
            kg1: First knowledge graph. Its ``relation_triples_set``,
                ``attribute_triples_set``, ``entities_set``,
                ``relations_set`` and ``attributes_set`` are read.
            kg2: Second knowledge graph, read the same way.
            train_links: Training alignment links. Stored as
                ``self.uri_train_links`` and converted with
                ``uris_pair_2ids`` into ``self.train_links``.
            test_links: Test alignment links, handled like ``train_links``.
            valid_links: Optional validation links. When ``None``, all
                validation attributes (including ``self.uri_valid_links``)
                are empty lists.
            mode: ``"sharing"`` generates ids with ``generate_sharing_id``;
                every other value uses ``generate_mapping_id``.
                ``"swapping"`` additionally adds supplementary relation and
                attribute triples derived from the training links.
            ordered: Forwarded unchanged to the id generators.
        """
        if mode == "sharing":
            # Only the entity ids are generated with the training links;
            # relations and attributes get an empty link list.
            ent_ids1, ent_ids2 = generate_sharing_id(train_links,
                                                     kg1.relation_triples_set,
                                                     kg1.entities_set,
                                                     kg2.relation_triples_set,
                                                     kg2.entities_set,
                                                     ordered=ordered)
            rel_ids1, rel_ids2 = generate_sharing_id([],
                                                     kg1.relation_triples_set,
                                                     kg1.relations_set,
                                                     kg2.relation_triples_set,
                                                     kg2.relations_set,
                                                     ordered=ordered)
            attr_ids1, attr_ids2 = generate_sharing_id(
                [],
                kg1.attribute_triples_set,
                kg1.attributes_set,
                kg2.attribute_triples_set,
                kg2.attributes_set,
                ordered=ordered)
        else:
            # NOTE: any mode other than "sharing" (including "swapping" and
            # unrecognised values) takes this branch.
            ent_ids1, ent_ids2 = generate_mapping_id(kg1.relation_triples_set,
                                                     kg1.entities_set,
                                                     kg2.relation_triples_set,
                                                     kg2.entities_set,
                                                     ordered=ordered)
            rel_ids1, rel_ids2 = generate_mapping_id(kg1.relation_triples_set,
                                                     kg1.relations_set,
                                                     kg2.relation_triples_set,
                                                     kg2.relations_set,
                                                     ordered=ordered)
            attr_ids1, attr_ids2 = generate_mapping_id(
                kg1.attribute_triples_set,
                kg1.attributes_set,
                kg2.attribute_triples_set,
                kg2.attributes_set,
                ordered=ordered)

        # Convert the relation triples of each KG with its own id dicts.
        id_relation_triples1 = uris_relation_triple_2ids(
            kg1.relation_triples_set, ent_ids1, rel_ids1)
        id_relation_triples2 = uris_relation_triple_2ids(
            kg2.relation_triples_set, ent_ids2, rel_ids2)

        # Convert the attribute triples of each KG with its own id dicts.
        id_attribute_triples1 = uris_attribute_triple_2ids(
            kg1.attribute_triples_set, ent_ids1, attr_ids1)
        id_attribute_triples2 = uris_attribute_triple_2ids(
            kg2.attribute_triples_set, ent_ids2, attr_ids2)

        # Keep the KGs exactly as they were passed in before the local names
        # are rebound to the id-based KGs below.
        self.uri_kg1 = kg1
        self.uri_kg2 = kg2

        # Rebuild both KGs from the converted triples and attach the id dicts.
        kg1 = KG(id_relation_triples1, id_attribute_triples1)
        kg2 = KG(id_relation_triples2, id_attribute_triples2)
        kg1.set_id_dict(ent_ids1, rel_ids1, attr_ids1)
        kg2.set_id_dict(ent_ids2, rel_ids2, attr_ids2)

        self.uri_train_links = train_links
        self.uri_test_links = test_links
        self.train_links = uris_pair_2ids(self.uri_train_links, ent_ids1,
                                          ent_ids2)
        self.test_links = uris_pair_2ids(self.uri_test_links, ent_ids1,
                                         ent_ids2)
        # Split each converted link into its KG1 side and its KG2 side.
        self.train_entities1 = [link[0] for link in self.train_links]
        self.train_entities2 = [link[1] for link in self.train_links]
        self.test_entities1 = [link[0] for link in self.test_links]
        self.test_entities2 = [link[1] for link in self.test_links]

        if mode == 'swapping':
            # Supplementary relation triples are generated from the training
            # links and the rt/hr dictionaries of both id-based KGs.
            sup_triples1, sup_triples2 = generate_sup_relation_triples(
                self.train_links, kg1.rt_dict, kg1.hr_dict, kg2.rt_dict,
                kg2.hr_dict)
            kg1.add_sup_relation_triples(sup_triples1)
            kg2.add_sup_relation_triples(sup_triples2)

            # Same for attribute triples, using the av dictionaries.
            sup_triples1, sup_triples2 = generate_sup_attribute_triples(
                self.train_links, kg1.av_dict, kg2.av_dict)
            kg1.add_sup_attribute_triples(sup_triples1)
            kg2.add_sup_attribute_triples(sup_triples2)

        self.kg1 = kg1
        self.kg2 = kg2

        # Validation attributes always exist; they stay empty when no
        # validation links are supplied, so reading `uri_valid_links` never
        # raises AttributeError.
        self.uri_valid_links = list()
        self.valid_links = list()
        self.valid_entities1 = list()
        self.valid_entities2 = list()
        if valid_links is not None:
            self.uri_valid_links = valid_links
            self.valid_links = uris_pair_2ids(self.uri_valid_links, ent_ids1,
                                              ent_ids2)
            self.valid_entities1 = [link[0] for link in self.valid_links]
            self.valid_entities2 = [link[1] for link in self.valid_links]

        # Entities that appear in any link, in train / valid / test order.
        self.useful_entities_list1 = self.train_entities1 + self.valid_entities1 + self.test_entities1
        self.useful_entities_list2 = self.train_entities2 + self.valid_entities2 + self.test_entities2

        # Sizes are taken over the union of both KGs' element sets.
        self.entities_num = len(self.kg1.entities_set | self.kg2.entities_set)
        self.relations_num = len(self.kg1.relations_set
                                 | self.kg2.relations_set)
        self.attributes_num = len(self.kg1.attributes_set
                                  | self.kg2.attributes_set)
Ejemplo n.º 2
0
    def __init__(self,
                 kg1: KG,
                 kg2: KG,
                 train_links,
                 test_links,
                 valid_links=None,
                 mode='mapping',
                 ordered=True,
                 extra_entities_percentage_valid=0.0):
        """Build id-based versions of two KGs and of their alignment links.

        Id dictionaries are generated for the entities, relations and
        attributes of both KGs, the relation/attribute triples are converted
        with them, and two new ``KG`` objects are built from the converted
        triples. The KGs passed in are kept as ``self.uri_kg1`` /
        ``self.uri_kg2``; the rebuilt ones are stored as ``self.kg1`` /
        ``self.kg2``. Entities of each KG that appear in no train/valid/test
        link are stored as "extra" entities and split randomly into a
        validation part and a test part.

        Args:
            kg1: First knowledge graph. Its ``relation_triples_set``,
                ``attribute_triples_set``, ``entities_set``,
                ``relations_set`` and ``attributes_set`` are read.
            kg2: Second knowledge graph, read the same way.
            train_links: Training alignment links. Stored as
                ``self.uri_train_links`` and converted with
                ``uris_pair_2ids`` into ``self.train_links``.
            test_links: Test alignment links, handled like ``train_links``.
            valid_links: Optional validation links. When ``None``, all
                validation link attributes (including
                ``self.uri_valid_links``) are empty lists.
            mode: ``"sharing"`` generates ids with ``generate_sharing_id``;
                every other value uses ``generate_mapping_id``.
                ``"swapping"`` additionally adds supplementary relation and
                attribute triples derived from the training links.
            ordered: Forwarded unchanged to the id generators.
            extra_entities_percentage_valid: Fraction in ``[0, 1]`` of the
                extra entities of each KG sampled into
                ``extra_entities_valid1`` / ``extra_entities_valid2``; the
                remainder goes to ``extra_entities_test1`` /
                ``extra_entities_test2``.

        Raises:
            ValueError: If ``extra_entities_percentage_valid`` is outside
                ``[0, 1]``.
        """
        # Fail fast, before the KG conversion work, instead of letting
        # random.sample reject the sample size at the very end (or silently
        # truncating an out-of-range fraction to 0 on small inputs).
        if not 0.0 <= extra_entities_percentage_valid <= 1.0:
            raise ValueError(
                "extra_entities_percentage_valid must be in [0, 1], got "
                "{!r}".format(extra_entities_percentage_valid))

        # BootEA: swapping (swap entities to generate extra triples), RDGCN: mapping (calibration?? -> min ||e_1-e_2||)
        if mode == "sharing":
            # Only the entity ids are generated with the training links;
            # relations and attributes get an empty link list.
            ent_ids1, ent_ids2 = generate_sharing_id(train_links,
                                                     kg1.relation_triples_set,
                                                     kg1.entities_set,
                                                     kg2.relation_triples_set,
                                                     kg2.entities_set,
                                                     ordered=ordered)
            rel_ids1, rel_ids2 = generate_sharing_id([],
                                                     kg1.relation_triples_set,
                                                     kg1.relations_set,
                                                     kg2.relation_triples_set,
                                                     kg2.relations_set,
                                                     ordered=ordered)
            attr_ids1, attr_ids2 = generate_sharing_id(
                [],
                kg1.attribute_triples_set,
                kg1.attributes_set,
                kg2.attribute_triples_set,
                kg2.attributes_set,
                ordered=ordered)
        else:
            # generate unique id for each entity, relation, attribute (note id for same element is different in two KGs)
            # NOTE: any mode other than "sharing" (including "swapping" and
            # unrecognised values) takes this branch.
            ent_ids1, ent_ids2 = generate_mapping_id(kg1.relation_triples_set,
                                                     kg1.entities_set,
                                                     kg2.relation_triples_set,
                                                     kg2.entities_set,
                                                     ordered=ordered)
            rel_ids1, rel_ids2 = generate_mapping_id(kg1.relation_triples_set,
                                                     kg1.relations_set,
                                                     kg2.relation_triples_set,
                                                     kg2.relations_set,
                                                     ordered=ordered)
            attr_ids1, attr_ids2 = generate_mapping_id(
                kg1.attribute_triples_set,
                kg1.attributes_set,
                kg2.attribute_triples_set,
                kg2.attributes_set,
                ordered=ordered)

        # convert to triples (id_ent, id_rel, id_ent)
        id_relation_triples1 = uris_relation_triple_2ids(
            kg1.relation_triples_set, ent_ids1, rel_ids1)
        id_relation_triples2 = uris_relation_triple_2ids(
            kg2.relation_triples_set, ent_ids2, rel_ids2)

        # convert to triples (id_ent, id_prop, literal)
        id_attribute_triples1 = uris_attribute_triple_2ids(
            kg1.attribute_triples_set, ent_ids1, attr_ids1)
        id_attribute_triples2 = uris_attribute_triple_2ids(
            kg2.attribute_triples_set, ent_ids2, attr_ids2)

        # Keep the KGs exactly as they were passed in before the local names
        # are rebound to the id-based KGs below.
        self.uri_kg1 = kg1
        self.uri_kg2 = kg2

        # rebuild kgs using ids, add dict generated before
        kg1 = KG(id_relation_triples1, id_attribute_triples1)
        kg2 = KG(id_relation_triples2, id_attribute_triples2)
        kg1.set_id_dict(ent_ids1, rel_ids1, attr_ids1)
        kg2.set_id_dict(ent_ids2, rel_ids2, attr_ids2)

        self.uri_train_links = train_links
        self.uri_test_links = test_links
        # convert link to (id_ent1, id_ent2)
        self.train_links = uris_pair_2ids(self.uri_train_links, ent_ids1,
                                          ent_ids2)
        self.test_links = uris_pair_2ids(self.uri_test_links, ent_ids1,
                                         ent_ids2)
        # TODO: the entities used for testing here always come from the
        # ground-truth links only...
        self.train_entities1 = [link[0] for link in self.train_links]
        self.train_entities2 = [link[1] for link in self.train_links]
        self.test_entities1 = [link[0] for link in self.test_links]
        self.test_entities2 = [link[1] for link in self.test_links]

        if mode == 'swapping':
            # generate new triples by swapping (see function for detail)
            sup_triples1, sup_triples2 = generate_sup_relation_triples(
                self.train_links, kg1.rt_dict, kg1.hr_dict, kg2.rt_dict,
                kg2.hr_dict)
            # add to the KGs
            kg1.add_sup_relation_triples(sup_triples1)
            kg2.add_sup_relation_triples(sup_triples2)

            # generate new attribute triples by adding all literals of KG1 to corresponding entity in KG2 and viceversa
            sup_triples1, sup_triples2 = generate_sup_attribute_triples(
                self.train_links, kg1.av_dict, kg2.av_dict)
            kg1.add_sup_attribute_triples(sup_triples1)
            kg2.add_sup_attribute_triples(sup_triples2)

        self.kg1 = kg1
        self.kg2 = kg2

        # Validation attributes always exist; they stay empty when no
        # validation links are supplied, so reading `uri_valid_links` never
        # raises AttributeError.
        self.uri_valid_links = list()
        self.valid_links = list()
        self.valid_entities1 = list()
        self.valid_entities2 = list()
        # save validation links and entities (converted to ids)
        if valid_links is not None:
            self.uri_valid_links = valid_links
            self.valid_links = uris_pair_2ids(self.uri_valid_links, ent_ids1,
                                              ent_ids2)
            self.valid_entities1 = [link[0] for link in self.valid_links]
            self.valid_entities2 = [link[1] for link in self.valid_links]

        # Entities that appear in any link, in train / valid / test order.
        self.useful_entities_list1 = self.train_entities1 + self.valid_entities1 + self.test_entities1
        self.useful_entities_list2 = self.train_entities2 + self.valid_entities2 + self.test_entities2

        # Save the entities which are outside the truth for both KGs
        self.extra_entities1 = list(self.kg1.entities_set -
                                    set(self.useful_entities_list1))
        self.extra_entities2 = list(self.kg2.entities_set -
                                    set(self.useful_entities_list2))
        # Randomly pick the validation share of the extra entities (sample
        # size is truncated towards zero); everything not picked is the test
        # share. Uses the module-level `random` state, so results depend on
        # the caller's seeding.
        self.extra_entities_valid1 = random.sample(
            self.extra_entities1,
            int(len(self.extra_entities1) * extra_entities_percentage_valid))
        self.extra_entities_valid2 = random.sample(
            self.extra_entities2,
            int(len(self.extra_entities2) * extra_entities_percentage_valid))
        self.extra_entities_test1 = list(
            set(self.extra_entities1) - set(self.extra_entities_valid1))
        self.extra_entities_test2 = list(
            set(self.extra_entities2) - set(self.extra_entities_valid2))

        # Sizes are taken over the union of both KGs' element sets.
        self.entities_num = len(self.kg1.entities_set | self.kg2.entities_set)
        self.relations_num = len(self.kg1.relations_set
                                 | self.kg2.relations_set)
        self.attributes_num = len(self.kg1.attributes_set
                                  | self.kg2.attributes_set)