Esempio n. 1
0
 def test_triples(self):
     """Check ID contiguity and triple mapping of the factory."""
     factory = TriplesFactory(triples=triples)
     # entity and relation IDs must form the contiguous ranges 0..num-1
     expected_entity_ids = set(range(factory.num_entities))
     self.assertEqual(expected_entity_ids, set(factory.entity_to_id.values()))
     expected_relation_ids = set(range(factory.num_relations))
     self.assertEqual(expected_relation_ids, set(factory.relation_to_id.values()))
     # re-mapping the labeled triples must reproduce the stored ID triples
     remapped = factory.map_triples_to_id(triples)
     self.assertTrue((factory.mapped_triples == remapped).all())
Esempio n. 2
0
    def train_embedding(self, g, model="SimplE"):
        """Train a pykeen KG embedding on the concept graph and store it.

        :param g: graph object exposing ``concepts`` (with ``index``, ``text``,
            ``pleasentness``, ``sensitivity``) and ``get_semantic_ids`` —
            assumed interface, confirm against callers
        :param model: name of the pykeen interaction model to train
        :return: the pykeen pipeline results object
        """
        # pykeen
        from pykeen.pipeline import pipeline
        from pykeen.triples import TriplesFactory

        # create pseudo-nodes to encode node attributes
        pleasent, not_pleasent = len(g.concepts), len(g.concepts) + 1
        sensitiv, not_sensitive = len(g.concepts) + 2, len(g.concepts) + 3
        # build triples
        triples = []
        for c in g.concepts:
            # actual connections
            triples.extend(([c.index, 'semantic', j] for j in g.get_semantic_ids(c)))
            # encode attributes by binning into positive/negative pseudo-nodes
            if c.pleasentness != 0:
                triples.append([c.index, 'pleasent', pleasent if c.pleasentness > 0 else not_pleasent])
            if c.sensitivity != 0:
                triples.append([c.index, 'sensitiv', sensitiv if c.sensitivity > 0 else not_sensitive])
        triples, n = np.asarray(triples), len(triples)
        # fix: the message previously reported an 80% split although the mask
        # below actually selects 90% of the triples for training
        n_train = int(n * 0.9)
        print("Number of Triples (Train/Total): %i/%i" % (n_train, n))
        # create mask for training and testing separation
        train_mask = np.full(n, False)
        train_mask[:n_train] = True
        np.random.shuffle(train_mask)
        # separate into training and testing
        train_triples = triples[train_mask]
        test_triples = triples[~train_mask]
        # create triples factories
        train_factory = TriplesFactory(triples=train_triples)
        test_factory = TriplesFactory(triples=test_triples)
        # create and run pipeline
        results = pipeline(
            # data
            training_triples_factory=train_factory,
            testing_triples_factory=test_factory,
            # model
            model=model,
            model_kwargs={
                "embedding_dim": self.embedd_dim,
                "automatic_memory_optimization": True
            }
        )
        # get embedding tensor - remove pseudo nodes
        weight = results.model.entity_embeddings.weight[:len(g.concepts), ...].cpu()

        # update word2id; the 0th element is reserved for padding
        words = [c.text for c in g.concepts]
        self.word2id = OrderedDict( zip(words, range(1, len(words) + 1)) )  # 0th element is padding
        # update embeddings - add padding embedding at position 0
        self.embedding = nn.Embedding(
            num_embeddings=len(words) + 1,
            embedding_dim=self.embedd_dim,
            _weight=torch.cat((torch.zeros((1, self.embedd_dim)), weight), dim=0)
        )
        # return results
        return results
    def test_inverse_triples(self):
        """Test that the right number of entities and triples exist after inverting them."""
        triples_factory = TriplesFactory(triples=triples, create_inverse_triples=True)
        # inverse creation doubles the relations, so the count must be even
        self.assertEqual(0, triples_factory.num_relations % 2)
        self.assertEqual(
            set(range(triples_factory.num_entities)),
            set(triples_factory.entity_to_id.values()),
            msg='wrong number entities',
        )
        self.assertEqual(
            set(range(triples_factory.num_relations)),
            set(triples_factory.relation_to_id.values()),
            msg='wrong number relations',
        )

        # compare against the raw label arrays
        entities = set(triples[:, 0]) | set(triples[:, 2])
        relations = set(triples[:, 1])
        self.assertEqual(len(entities), triples_factory.num_entities, msg='wrong number entities')
        self.assertEqual(2, len(relations), msg='Wrong number of relations in set')
        # each base relation contributes itself plus one inverse
        self.assertEqual(
            2 * len(relations),
            triples_factory.num_relations,
            msg='Wrong number of relations in factory',
        )

        # the inverse of 'likes' must be registered under its own label
        self.assertIn(f'likes{INVERSE_SUFFIX}', triples_factory.relation_to_id)
Esempio n. 4
0
    def test_inverse_triples(self):
        """Test that the right number of entities and triples exist after inverting them."""
        triples_factory = TriplesFactory.from_labeled_triples(
            triples=triples, create_inverse_triples=True)
        self.assertEqual(4, triples_factory.num_relations)
        # entity IDs must form the contiguous range 0..num_entities-1
        self.assertEqual(
            set(range(triples_factory.num_entities)),
            set(triples_factory.entity_to_id.values()),
            msg="wrong number entities",
        )
        # only the non-inverse relations carry explicit IDs in the mapping
        self.assertEqual(
            set(range(triples_factory.real_num_relations)),
            set(triples_factory.relation_to_id.values()),
            msg="wrong number relations",
        )

        # compare against the raw label arrays
        entities = set(triples[:, 0]) | set(triples[:, 2])
        relations = set(triples[:, 1])
        self.assertEqual(len(entities),
                         triples_factory.num_entities,
                         msg="wrong number entities")
        self.assertEqual(2,
                         len(relations),
                         msg="Wrong number of relations in set")
        # each base relation contributes itself plus one inverse
        self.assertEqual(
            2 * len(relations),
            triples_factory.num_relations,
            msg="Wrong number of relations in factory",
        )
Esempio n. 5
0
    def test_count_inverse_frequencies(self):
        """Test counting inverse frequencies.

        Note, for r3, there are three triples, but the inverse triples are only counted once.
        """
        t = [
            ['a', 'r1', 'b'],
            #
            ['b', 'r2', 'c'],
            ['c', 'r2_inverse', 'b'],
            ['d', 'r2', 'e'],
            ['e', 'r2_inverse', 'd'],
            #
            ['g', 'r3', 'h'],
            ['h', 'r3_inverse', 'g'],
            ['i', 'r3', 'j'],
            ['k', 'r3', 'l'],
        ]
        # use the builtin ``str``: the ``np.str`` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24
        triples_factory = TriplesFactory.from_labeled_triples(
            triples=np.array(t, dtype=str))
        frequencies = get_candidate_inverse_relations(triples_factory,
                                                      minimum_frequency=0.0,
                                                      symmetric=False)
        # each frequency is (# matched inverse pairs) / (# triples with the
        # source relation)
        self.assertEqual(
            {
                ('r2', 'r2_inverse'): (2 / 2),
                ('r2_inverse', 'r2'): (2 / 2),
                ('r3', 'r3_inverse'): (1 / 3),
                ('r3_inverse', 'r3'): (1 / 1),
            },
            dict(frequencies),
        )
 def test_right_sorting(self):
     """Test if the triples and the corresponding inverses are sorted correctly."""
     t = [
         ['e1', 'a', 'e1'],
         ['e1', 'a.', 'e1'],
         ['e1', f'a.{INVERSE_SUFFIX}', 'e1'],
         ['e1', 'a.bc', 'e1'],
         ['e1', f'a.bc{INVERSE_SUFFIX}', 'e1'],
         ['e1', f'a{INVERSE_SUFFIX}', 'e1'],
         ['e1', 'abc', 'e1'],
         ['e1', f'abc{INVERSE_SUFFIX}', 'e1'],
     ]
     # use the builtin ``str``: the ``np.str`` alias was deprecated in
     # NumPy 1.20 and removed in NumPy 1.24
     t = np.array(t, dtype=str)
     factory = TriplesFactory(triples=t, create_inverse_triples=False)
     # each base relation must be immediately followed by its inverse in the
     # ID ordering
     reference_relation_to_id = {
         'a': 0,
         f'a{INVERSE_SUFFIX}': 1,
         'a.': 2,
         f'a.{INVERSE_SUFFIX}': 3,
         'a.bc': 4,
         f'a.bc{INVERSE_SUFFIX}': 5,
         'abc': 6,
         f'abc{INVERSE_SUFFIX}': 7,
     }
     self.assertEqual(reference_relation_to_id, factory.relation_to_id)
Esempio n. 7
0
 def test_triples(self):
     """Test properties of the triples factory."""
     factory = TriplesFactory.from_labeled_triples(triples=triples)
     # entity and relation IDs must form the contiguous ranges 0..num-1
     self.assertEqual(set(range(factory.num_entities)), set(factory.entity_to_id.values()))
     self.assertEqual(set(range(factory.num_relations)), set(factory.relation_to_id.values()))
     # re-mapping the labeled triples must reproduce the stored ID triples
     remapped = _map_triples_elements_to_ids(
         triples=triples,
         entity_to_id=factory.entity_to_id,
         relation_to_id=factory.relation_to_id,
     )
     assert (remapped == factory.mapped_triples).all()
Esempio n. 8
0
 def test_correct_inverse_creation(self):
     """Test if the triples and the corresponding inverses are created."""
     t = [
         ['e1', 'a.', 'e5'],
         ['e1', 'a', 'e2'],
     ]
     # use the builtin ``str``: the ``np.str`` alias was removed in NumPy 1.24
     t = np.array(t, dtype=str)
     factory = TriplesFactory.from_labeled_triples(triples=t, create_inverse_triples=True)
     instances = factory.create_slcwa_instances()
     # 2 original triples + 2 generated inverses
     assert len(instances) == 4
 def test_correct_inverse_creation(self):
     """Test if the triples and the corresponding inverses are created and sorted correctly."""
     t = [
         ['e1', 'a.', 'e5'],
         ['e1', 'a', 'e2'],
     ]
     # use the builtin ``str``: the ``np.str`` alias was removed in NumPy 1.24
     t = np.array(t, dtype=str)
     factory = TriplesFactory(triples=t, create_inverse_triples=True)
     # each relation is directly followed by its inverse in the ID ordering
     reference_relation_to_id = {'a': 0, f'a{INVERSE_SUFFIX}': 1, 'a.': 2, f'a.{INVERSE_SUFFIX}': 3}
     self.assertEqual(reference_relation_to_id, factory.relation_to_id)
Esempio n. 10
0
    def __load_train_data(self, train_data_path, validation_path, test_data_path):
        """Build triples factories for the train/validation/test files.

        The training factory defines the entity/relation label-to-ID mappings;
        the validation and test factories reuse them so IDs stay consistent.
        """
        self.__training = TriplesFactory(path=train_data_path)
        # shared ID mappings derived from the training split
        shared_ids = dict(
            entity_to_id=self.__training.entity_to_id,
            relation_to_id=self.__training.relation_to_id,
        )
        self.__valid = TriplesFactory(path=validation_path, **shared_ids)
        self.__testing = TriplesFactory(path=test_data_path, **shared_ids)
Esempio n. 11
0
 def test_automatic_incomplete_inverse_detection(self):
     """Test if the TriplesFactory detects that the triples contain incomplete inverses and creates correct ids."""
     t = [
         ['e3', f'a.{INVERSE_SUFFIX}', 'e10'],
         ['e1', 'a', 'e2'],
         ['e1', 'a.', 'e5'],
     ]
     # use the builtin ``str``: the ``np.str`` alias was removed in NumPy 1.24
     t = np.array(t, dtype=str)
     factory = TriplesFactory(triples=t, create_inverse_triples=False)
     reference_relation_to_id = {'a': 0, f'a{INVERSE_SUFFIX}': 1, 'a.': 2, f'a.{INVERSE_SUFFIX}': 3}
     self.assertEqual(reference_relation_to_id, factory.relation_to_id)
     # the flag is switched on automatically despite create_inverse_triples=False
     self.assertTrue(factory.create_inverse_triples)
Esempio n. 12
0
    def _test_restriction(
        self,
        original_triples_factory: TriplesFactory,
        entity_restriction: Optional[Collection[str]],
        invert_entity_selection: bool,
        relation_restriction: Optional[Collection[str]],
        invert_relation_selection: bool,
    ):
        """Run the actual test for new_with_restriction."""
        # apply restriction
        restricted = original_triples_factory.new_with_restriction(
            entities=entity_restriction,
            relations=relation_restriction,
            invert_entity_selection=invert_entity_selection,
            invert_relation_selection=invert_relation_selection,
        )

        # the very same object must come back iff there was nothing to restrict
        nothing_to_apply = entity_restriction is None and relation_restriction is None
        assert nothing_to_apply == (restricted is original_triples_factory)

        # the inverse-triples flag must be carried over
        assert restricted.create_inverse_triples == original_triples_factory.create_inverse_triples

        # the label-to-ID mappings must be untouched by restriction
        assert restricted.entity_to_id == original_triples_factory.entity_to_id
        assert restricted.relation_to_id == original_triples_factory.relation_to_id

        # entity filtering: every remaining entity must be among the allowed ones
        if entity_restriction is not None:
            present_entities = set(restricted.triples[:, 0]) | set(restricted.triples[:, 2])
            if invert_entity_selection:
                expected_entities = set(
                    original_triples_factory.entity_id_to_label.values()
                ).difference(entity_restriction)
            else:
                expected_entities = entity_restriction
            assert expected_entities.issuperset(present_entities)

        # relation filtering: every remaining relation must be among the allowed ones
        if relation_restriction is not None:
            present_relations = set(restricted.triples[:, 1])
            if invert_relation_selection:
                expected_relations = set(original_triples_factory.relation_id_to_label.values())
            else:
                expected_relations = set(relation_restriction)
            assert expected_relations.issuperset(present_relations)
Esempio n. 13
0
    def _load_helper(self, relative_path) -> TriplesFactory:
        """Load a triples CSV relative to the dataset path into a factory.

        Optional entity/relation mapping files are read and passed through so
        that the factory reuses externally defined IDs.

        :param relative_path: path of the triples file, relative to ``self.dataset_path``
        :return: a triples factory over the loaded triples, with ``path`` attached
        """
        relative_path = path.join(self.dataset_path, relative_path)

        with open(relative_path) as file:
            df = pd.read_csv(
                file,
                usecols=[
                    self.head_column, self.relation_column, self.tail_column
                ],
                header=self.header,
                sep=self.sep,
            )

            entity_to_id = None
            relation_to_id = None

            if self.entity_to_id_path:
                node_mapping = pd.read_csv(self.entity_to_id_path,
                                           sep=self.entity_to_id_sep,
                                           header=None)
                # dict(zip(...)) replaces a comprehension whose loop variable
                # shadowed the builtin ``id``
                entity_to_id = dict(
                    zip(node_mapping[self.entity_to_id_label_col],
                        node_mapping[self.entity_to_id_id_col]))

            if self.relation_to_id_path:
                relation_mapping = pd.read_csv(self.relation_to_id_path,
                                               sep=self.relation_to_id_sep,
                                               header=None)
                relation_to_id = dict(
                    zip(relation_mapping[self.relation_to_id_label_col],
                        relation_mapping[self.relation_to_id_id_col]))

            rv = TriplesFactory.from_labeled_triples(
                triples=df.values,
                entity_to_id=entity_to_id,
                relation_to_id=relation_to_id)

            # remember where the triples came from
            rv.path = relative_path
            return rv
Esempio n. 14
0
def deteriorate(
    reference: TriplesFactory,
    *others: TriplesFactory,
    n: Union[int, float],
    random_state: TorchRandomHint = None,
) -> List[TriplesFactory]:
    """Remove n triples from the reference set.

    :param reference: the factory to remove triples from
    :param others: factories that receive the removed triples, distributed
        roughly evenly between them
    :param n: either an absolute number of triples, or a float in [0, 1)
        interpreted as a fraction of the reference factory's triples
    :param random_state: seed or generator controlling which triples are removed
    :return: the reduced reference factory first, followed by augmented copies
        of ``others``
    :raises NotImplementedError: if the reference factory has inverse triples
    :raises ValueError: if a float ``n`` lies outside [0, 1)

    TODO: take care that triples aren't removed that are the only ones with any given entity
    """
    if reference.create_inverse_triples:
        raise NotImplementedError

    # a float n is interpreted as a fraction of the reference triples
    if isinstance(n, float):
        if n < 0 or 1 <= n:
            raise ValueError
        n = int(n * reference.num_triples)

    generator = ensure_torch_random_state(random_state)
    logger.debug("random state %s", random_state)
    logger.debug("generator %s %s", generator, generator.get_state())
    # random permutation of triple indices; first part is kept, last n removed
    idx = torch.randperm(reference.num_triples, generator=generator)
    logger.debug("idx %s", idx)
    reference_idx, deteriorated_idx = idx.split(split_size=[reference.num_triples - n, n], dim=0)

    first = reference.clone_and_exchange_triples(
        mapped_triples=reference.mapped_triples[reference_idx],
    )

    # distribute the deteriorated triples across the remaining factories
    # NOTE(review): if ``others`` is empty this raises ZeroDivisionError —
    # confirm callers always pass at least one other factory.
    # zip_longest pads with None when there are fewer chunks than factories;
    # those factories keep their triples unchanged.
    didxs = deteriorated_idx.split(math.ceil(n / len(others)), dim=0)
    rest = [
        tf.clone_and_exchange_triples(
            mapped_triples=(
                torch.cat([tf.mapped_triples, reference.mapped_triples[didx]], dim=0)
                if didx is not None
                else tf.mapped_triples  # maybe just give same tf? should it be copied?
            ),
        )
        for didx, tf in zip_longest(didxs, others)
    ]

    return [first, *rest]
Esempio n. 15
0
 def _pre_instantiation_hook(self, kwargs: MutableMapping[str, Any]) -> MutableMapping[str, Any]:  # noqa: D102
     kwargs = super()._pre_instantiation_hook(kwargs=kwargs)
     # TODO: use triple generation
     # generate random ID triples of shape (num_triples, 3)
     mapped_triples = numpy.stack([
         numpy.random.randint(max_id, size=(self.num_triples,))
         for max_id in (self.num, self.num_relations, self.num)
     ], axis=-1)
     entity_names = [f"e_{i}" for i in range(self.num)]
     relation_names = [f"r_{i}" for i in range(self.num_relations)]
     # translate the ID columns back to labels; fix: stack along the last axis
     # so the result has shape (num_triples, 3) like ``mapped_triples`` —
     # without ``axis=-1`` the array would be (3, num_triples)
     triples = numpy.stack([
         [names[i] for i in col.tolist()]
         for col, names in zip(
             mapped_triples.T,
             (entity_names, relation_names, entity_names),
         )
     ], axis=-1)
     kwargs["triples_factory"] = TriplesFactory.from_labeled_triples(triples=triples)
     return kwargs
Esempio n. 16
0
    def test_custom_tf(self):
        """Test using a custom triples factories with HPO.

        .. seealso:: https://github.com/pykeen/pykeen/issues/230
        """
        tf = TriplesFactory.from_path(path=NATIONS_TRAIN_PATH)
        # 80/10/10 split with a fixed seed for reproducibility
        training, testing, validation = tf.split([0.8, 0.1, 0.1], random_state=0)

        hpo_pipeline_result = hpo_pipeline(
            model='TransE',
            training=training,
            testing=testing,
            validation=validation,
            n_trials=2,
            training_kwargs=dict(num_epochs=2),
        )

        # the result must be serializable to a directory
        with tempfile.TemporaryDirectory() as directory:
            hpo_pipeline_result.save_to_directory(directory)
Esempio n. 17
0
 def test_automatic_incomplete_inverse_detection(self):
     """Test detecting that the triples contain inverses, warns about them, and filters them out."""
     # comment(mberr): from my pov this behaviour is faulty: the triples factory is expected to say it contains
     # inverse relations, although the triples contained in it are not the same we would have when removing the
     # first triple, and passing create_inverse_triples=True.
     t = [
         ['e3', f'a.{INVERSE_SUFFIX}', 'e10'],
         ['e1', 'a', 'e2'],
         ['e1', 'a.', 'e5'],
     ]
     # use the builtin ``str``: the ``np.str`` alias was removed in NumPy 1.24
     t = np.array(t, dtype=str)
     for create_inverse_triples in (False, True):
         with patch("pykeen.triples.triples_factory.logger.warning") as warning:
             factory = TriplesFactory.from_labeled_triples(triples=t, create_inverse_triples=create_inverse_triples)
             # check for warning
             warning.assert_called()
             # check for filtered triples
             assert factory.num_triples == 2
             # check for correct inverse triples flag
             assert factory.create_inverse_triples == create_inverse_triples
Esempio n. 18
0
    def test_custom_tf_object(self):
        """Test using a custom triples factories with HPO.

        .. seealso:: https://github.com/pykeen/pykeen/issues/230
        """
        tf = TriplesFactory.from_path(path=NATIONS_TRAIN_PATH)
        # 80/10/10 split with a fixed seed for reproducibility
        training, testing, validation = tf.split([0.8, 0.1, 0.1], random_state=0)

        hpo_pipeline_result = self._help_test_hpo(
            study_name='HPO with custom triples factories',
            training=training,
            testing=testing,
            validation=validation,
        )
        # Since there's no source path information, none of these user
        # attributes should be added to the study, even if it might be
        # possible to infer path information from the triples factories
        for key in ('dataset', 'training', 'testing', 'validation'):
            self.assertNotIn(key, hpo_pipeline_result.study.user_attrs)
Esempio n. 19
0
    def test_lcwa_margin_ranking_loss_helper(self):
        """Test if output is correct for the LCWA training loop use case."""
        factory = TriplesFactory.from_labeled_triples(triples=self.triples)

        # the same setup is exercised for both reductions; only the expected
        # aggregate value differs (deduplicates the two previously copy-pasted
        # halves of this test)
        for reduction, expected_loss in (('sum', 14), ('mean', 1)):
            loss_cls = MarginRankingLoss(
                margin=0,
                reduction=reduction,
            )

            model = TransE(
                triples_factory=factory,
                embedding_dim=8,
                preferred_device='cpu',
                loss=loss_cls,
            )

            loop = LCWATrainingLoop(model=model, triples_factory=factory)
            loss = loop._mr_loss_helper(predictions=self.predictions,
                                        labels=self.labels)
            self.assertEqual(expected_loss, loss)
Esempio n. 20
0
    def test_find_leak_assymetric(self):
        """Test finding test leakages with an asymmetric metric."""
        n = 100
        test_relation, test_relation_inverse = 'r', 'r_inverse'

        # n forward triples, each paired with an explicit inverse triple
        train_generated = list(
            itt.chain.from_iterable(([
                [str(i), test_relation, str(j + 1 + n)],
                [str(j + 1 + n), test_relation_inverse,
                 str(i)],
            ] for i, j in zip(range(n), range(n)))))
        train_non_inverses = [
            ['a', 'fine', 'b'],
            ['b', 'fine', 'c'],
        ]
        forwards_extras = [
            ['-1', test_relation, '-2'],  # this one leaks!
            ['-3', test_relation, '-4'],
        ]
        inverse_extras = [
            ['-5', test_relation_inverse, '-6'],
        ]
        train = train_generated + train_non_inverses + forwards_extras + inverse_extras
        test = [
            ['-2', test_relation_inverse, '-1'],  # this one was leaked!
        ]
        # use the builtin ``str``: the ``np.str`` alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24
        train_factory = TriplesFactory.from_labeled_triples(
            triples=np.array(train, dtype=str))
        test_factory = TriplesFactory.from_labeled_triples(
            triples=np.array(test, dtype=str))

        sealant = Sealant(train_factory, symmetric=False)

        expected_forwards_frequency = n / (n + len(forwards_extras))
        expected_inverse_frequency = n / (n + len(inverse_extras))
        self.assertGreater(len(forwards_extras), len(inverse_extras))
        self.assertLess(
            expected_forwards_frequency,
            expected_inverse_frequency,
            msg='Forwards frequency should be higher than inverse frequency',
        )
        self.assertEqual(
            {
                (test_relation, test_relation_inverse):
                expected_forwards_frequency,
                (test_relation_inverse, test_relation):
                expected_inverse_frequency,
            },
            dict(sealant.candidate_inverse_relations),
        )

        self.assertIn(test_relation, sealant.inverses)
        self.assertEqual(test_relation_inverse,
                         sealant.inverses[test_relation])
        self.assertIn(test_relation_inverse, sealant.inverses)
        self.assertEqual(test_relation,
                         sealant.inverses[test_relation_inverse])

        self.assertIn(
            test_relation_inverse,
            sealant.inverse_relations_to_delete,
            msg='The wrong relation was picked for deletion',
        )

        test_leaked = sealant.get_inverse_triples(test_factory)
        self.assertEqual(1, len(test_leaked))
        self.assertEqual(('-2', test_relation_inverse, '-1'),
                         tuple(test_leaked[0]))
Esempio n. 21
0
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

# file locations of the pre-split knowledge graph
training_path: str = "kg/train.hrt.txt"
validation_path: str = "kg/valid.hrt.txt"
testing_path: str = "kg/test.hrt.txt"

# the training factory defines the label-to-ID mappings; validation and test
# factories must reuse them so that IDs stay consistent across the splits
training = TriplesFactory(path=training_path)
shared_ids = dict(
    entity_to_id=training.entity_to_id,
    relation_to_id=training.relation_to_id,
)
valid = TriplesFactory(path=validation_path, **shared_ids)
testing = TriplesFactory(path=testing_path, **shared_ids)

# train a TransE model and persist it together with its metadata
result = pipeline(
    training=training,
    validation=valid,
    testing=testing,
    model='TransE',
    training_kwargs=dict(num_epochs=2, batch_size=512),
    evaluation_kwargs=dict(batch_size=128),
)
result.save_to_directory('saved-model')

import torch

# reload the trained model and query it for head predictions
model = torch.load('saved-model/trained_model.pkl')
print(model.predict_heads('VARIANT_DISEASE_associated', 'Leigh_syndrome'))
Esempio n. 22
0
def do_kge(edgelist: pd.DataFrame,
           design: pd.DataFrame,
           out: str,
           model_config: Dict,
           return_patients: Optional[bool] = True,
           train_size: Optional[float] = 0.8,
           validation_size: Optional[float] = 0.1) -> pd.DataFrame:
    """Carry out KGE on the given data.

    :param edgelist: Dataframe containing the patient-feature graph in edgelist format
    :param design: Dataframe containing the design table for the data
    :param out: Output folder for the results
    :param model_config: Configuration file for the KGE models, in JSON format.
    :param return_patients: Flag to indicate if the final data should contain only patients or even the features
    :param train_size: Size of the training data for KGE ranging from 0 - 1
    :param validation_size: Size of the validation data for KGE ranging from 0 - 1. It must be lower than training size
    :return: Dataframe containing the embedding from the KGE
    """
    # stringify the design table so its values can be matched against node names later
    design_norm_df = design.astype(str, copy=True)

    # rows with a non-null label are the labeled nodes; keep one row per source
    unique_nodes = edgelist[~edgelist['label'].isna()].drop_duplicates(
        'source')

    # node name -> class label, used to annotate the returned embedding
    label_mapping = {
        patient: label
        for patient, label in zip(unique_nodes['source'],
                                  unique_nodes['label'])
    }

    # the label column must not become part of the triples
    edgelist = edgelist.drop(columns='label')

    # Split the edgelist into training, validation and testing data
    train, validation, test = _weighted_splitter(
        edgelist=edgelist,
        train_size=train_size,
        validation_size=validation_size)

    # persist the splits; the TriplesFactory instances below read them from disk
    train.to_csv(f'{out}/train.edgelist', sep='\t', index=False, header=False)
    validation.to_csv(f'{out}/validation.edgelist',
                      sep='\t',
                      index=False,
                      header=False)
    test.to_csv(f'{out}/test.edgelist', sep='\t', index=False, header=False)

    create_inverse_triples = False  # In a second HPO configuration, this can be set to true
    training_factory = TriplesFactory(
        path=f'{out}/train.edgelist',
        create_inverse_triples=create_inverse_triples,
    )
    validation_factory = TriplesFactory(
        path=f'{out}/validation.edgelist',
        create_inverse_triples=create_inverse_triples,
    )
    testing_factory = TriplesFactory(
        path=f'{out}/test.edgelist',
        create_inverse_triples=create_inverse_triples,
    )

    # hyper-parameter optimization, then a final run with the best configuration
    run_optimization(dataset=(training_factory, validation_factory,
                              testing_factory),
                     model_config=model_config,
                     out_dir=out)

    best_model = run_pipeline(dataset=(training_factory, validation_factory,
                                       testing_factory),
                              out_dir=out).model

    # Get the embedding as a numpy array
    embedding_values = _model_to_numpy(best_model)

    # Create columns as component names
    embedding_columns = [
        f'Component_{i}' for i in range(1, embedding_values.shape[1] + 1)
    ]

    # Get the nodes of the training triples as index
    node_list = list(best_model.triples_factory.entity_to_id.keys())
    # order node names by entity ID so rows align with the embedding matrix
    embedding_index = sorted(
        node_list, key=lambda x: best_model.triples_factory.entity_to_id[x])

    embedding = pd.DataFrame(data=embedding_values,
                             columns=embedding_columns,
                             index=embedding_index)

    if return_patients:
        # TODO: Use clustering before classification to see if embeddings are already good enough
        # keep only rows whose node name appears in the design table
        embedding = embedding[embedding.index.isin(design_norm_df['FileName'])]

        for index in embedding.index:
            embedding.at[index, 'label'] = label_mapping[index]

    return embedding
Esempio n. 23
0
    def test_find_leak_assymetric(self):
        """Test finding test leakages with an asymmetric metric."""
        n = 100
        min_frequency = 0.97
        test_relation, test_relation_inverse = 'r', 'r_inverse'

        # n forward triples, each paired with an explicit inverse triple
        train_generated = list(
            itt.chain.from_iterable(([
                [str(i), test_relation, str(j + 1 + n)],
                [str(j + 1 + n), test_relation_inverse,
                 str(i)],
            ] for i, j in zip(range(n), range(n)))))
        train_non_inverses = [
            ['a', 'fine', 'b'],
            ['b', 'fine', 'c'],
        ]
        forwards_extras = [
            ['-1', test_relation, '-2'],  # this one leaks!
            ['-3', test_relation, '-4'],
        ]
        inverse_extras = [
            ['-5', test_relation_inverse, '-6'],
        ]
        train = train_generated + train_non_inverses + forwards_extras + inverse_extras
        test = [
            ['-2', test_relation_inverse, '-1'],  # this one was leaked!
        ]
        # use the builtin ``str``: the ``np.str`` alias was removed in NumPy 1.24
        train_factory = TriplesFactory.from_labeled_triples(
            triples=np.array(train, dtype=str),
            filter_out_candidate_inverse_relations=False,
        )
        test_factory = TriplesFactory.from_labeled_triples(
            triples=np.array(test, dtype=str),
            entity_to_id=train_factory.entity_to_id,
            relation_to_id=train_factory.relation_to_id,
            filter_out_candidate_inverse_relations=False,
        )

        expected_forwards_frequency = n / (n + len(forwards_extras))
        expected_inverse_frequency = n / (n + len(inverse_extras))
        # expected_frequency = n / (n + len(forwards_extras) + len(inverse_extras))
        # self.assertLessEqual(min_frequency, expected_frequency)

        self.assertGreater(len(forwards_extras), len(inverse_extras))
        self.assertLess(
            expected_forwards_frequency,
            expected_inverse_frequency,
            msg='Forwards frequency should be higher than inverse frequency',
        )

        sealant = Sealant(train_factory,
                          symmetric=False,
                          minimum_frequency=min_frequency)
        test_relation_id, test_relation_inverse_id = [
            train_factory.relation_to_id[r]
            for r in (test_relation, test_relation_inverse)
        ]
        self.assertNotEqual(
            0,
            len(sealant.candidate_inverse_relations),
            msg=
            f'did not find any candidate inverse relations at frequency>={min_frequency}',
        )
        self.assertEqual(
            {
                (test_relation_id, test_relation_inverse_id):
                expected_forwards_frequency,
                (test_relation_inverse_id, test_relation_id):
                expected_inverse_frequency,
            },
            dict(sealant.candidate_inverse_relations),
        )

        # fix: ``sealant.inverses`` is keyed by relation *IDs* (as the assertIn
        # checks show), so lookups and expected values must use IDs, not labels
        self.assertIn(test_relation_id, sealant.inverses)
        self.assertEqual(test_relation_inverse_id,
                         sealant.inverses[test_relation_id])
        self.assertIn(test_relation_inverse_id, sealant.inverses)
        self.assertEqual(test_relation_id,
                         sealant.inverses[test_relation_inverse_id])

        self.assertIn(
            test_relation_inverse_id,
            sealant.inverse_relations_to_delete,
            msg='The wrong relation was picked for deletion',
        )

        # Test looking up inverse triples
        test_leaked = test_factory.mapped_triples[
            test_factory.get_mask_for_relations(
                relations=sealant.inverse_relations_to_delete, invert=False)]
        self.assertEqual(1, len(test_leaked))
        # fix: mapped triples are ID-based, so compare against the relation ID
        self.assertEqual(
            (train_factory.entity_to_id['-2'], test_relation_inverse_id,
             train_factory.entity_to_id['-1']),
            tuple(test_leaked[0]),
        )
Esempio n. 24
0
from pykeen.triples import TriplesFactory
from pykeen.evaluation import RankBasedEvaluator
from pykeen.pipeline import pipeline
import json

# dataset selection parameters
n_tokeep = 300
minimum = 500

# load the triples and make an 80/20 train/test split
tf = TriplesFactory.from_path(f'data/rare/rare_{minimum}_{n_tokeep}.csv')
training, testing = tf.split([0.8, 0.2])

# train a RESCAL model; only num_epochs is active in the training kwargs,
# the checkpoint/sampler/batch-size options are intentionally disabled
result_pipeline = pipeline(
    training=training,
    testing=testing,
    model='RESCAL',
    model_kwargs=dict(embedding_dim=300),
    training_kwargs=dict(
        # sampler="schlichtkrull",
        # checkpoint_name='RGCN_checkpointt.pt',
        # checkpoint_frequency=5,
        num_epochs=200,
        # batch_size=128,
    ),
    evaluator=RankBasedEvaluator,
    evaluator_kwargs=dict(ks=[50]),
)

# inspect the training behaviour and evaluation results
result_pipeline.plot_losses()

result_pipeline.plot()