def __init__(self,
                 df=None,
                 kg=None,
                 ent2ix=None,
                 rel2ix=None,
                 dict_of_heads=None,
                 dict_of_tails=None):
        """Build the graph from a dataframe or from a dict of index tensors.

        Exactly one of `df` and `kg` must be provided.

        :param df: `pandas.DataFrame`, optional
            Facts to load. The columns `from`, `to` and `rel` are read and
            mapped to indices through `ent2ix` / `rel2ix`.
        :param kg: dict, optional
            Must contain the keys 'heads', 'tails' and 'relations'. The
            values are stored as-is and `kg['heads'].shape[0]` is used as the
            number of facts (presumably index tensors - confirm with callers).
        :param ent2ix: dict, optional
            Mapping from entity label to index. Mandatory when building from
            `kg`; computed with `get_dictionaries(df, ent=True)` if omitted.
        :param rel2ix: dict, optional
            Mapping from relation label to index. Mandatory when building
            from `kg`; computed with `get_dictionaries(df, ent=False)` if
            omitted.
        :param dict_of_heads: dict, optional
            Stored as-is when `dict_of_tails` is provided too; otherwise both
            dictionaries are rebuilt by `self.evaluate_dicts()`.
        :param dict_of_tails: dict, optional
            See `dict_of_heads`.
        :raises WrongArgumentsError: if neither or both of `df` and `kg` are
            given, if `kg` is not a dict holding the three required keys, or
            if `kg` is given without both `ent2ix` and `rel2ix`.
        :raises SanityError: if `self.sanity_check()` raises an
            `AssertionError`.
        """

        # Explicit checks instead of `assert`: assertions are stripped when
        # Python runs with -O, which would silently disable the validation.
        if df is None and kg is None:
            raise WrongArgumentsError(
                "Please provide at least one argument of `df` and kg`")
        if df is not None and kg is not None:
            raise WrongArgumentsError(
                "`df` and kg` arguments should not both provided.")

        if kg is not None:
            # `and` short-circuits, so a non-dict `kg` is reported through
            # WrongArgumentsError rather than failing on a missing `.keys()`.
            has_required_keys = (isinstance(kg, dict)
                                 and 'heads' in kg
                                 and 'tails' in kg
                                 and 'relations' in kg)
            if not has_required_keys:
                raise WrongArgumentsError(
                    "Keys in the `kg` dict should contain `heads`, `tails`, `relations`."
                )
            if rel2ix is None or ent2ix is None:
                raise WrongArgumentsError(
                    "Please provide the two dictionaries ent2ix and rel2ix if building from `kg`."
                )

        # From here on, a missing dictionary implies that `df` is available.
        if ent2ix is None:
            self.ent2ix = get_dictionaries(df, ent=True)
        else:
            self.ent2ix = ent2ix

        if rel2ix is None:
            self.rel2ix = get_dictionaries(df, ent=False)
        else:
            self.rel2ix = rel2ix

        # Indices are taken to start at 0, hence the +1 on the largest one.
        self.n_ent = max(self.ent2ix.values()) + 1
        self.n_rel = max(self.rel2ix.values()) + 1

        if df is not None:
            # build kg from a pandas dataframe
            self.n_facts = len(df)
            self.head_idx = tensor(df['from'].map(self.ent2ix).values).long()
            self.tail_idx = tensor(df['to'].map(self.ent2ix).values).long()
            self.relations = tensor(df['rel'].map(self.rel2ix).values).long()
        else:
            # build kg from another kg
            self.n_facts = kg['heads'].shape[0]
            self.head_idx = kg['heads']
            self.tail_idx = kg['tails']
            self.relations = kg['relations']

        if dict_of_heads is None or dict_of_tails is None:
            # Both dictionaries are rebuilt as soon as one of them is missing.
            self.dict_of_heads = defaultdict(set)
            self.dict_of_tails = defaultdict(set)
            self.evaluate_dicts()
        else:
            self.dict_of_heads = dict_of_heads
            self.dict_of_tails = dict_of_tails

        try:
            self.sanity_check()
        except AssertionError:
            raise SanityError("Please check the sanity of arguments.")
Ejemplo n.º 2
0
    def __init__(self,
                 df=None,
                 kg=None,
                 ent2ix=None,
                 rel2ix=None,
                 dict_of_heads=None,
                 dict_of_tails=None,
                 dict_of_rel=None,
                 id2point=None,
                 geo=None):
        """Build the graph from a dataframe or from a dict of index tensors.

        Exactly one of `df` and `kg` must be provided.

        Parameters
        ----------
        df: `pandas.DataFrame`, optional
            Facts to load. The columns `from`, `to` and `rel` are read and
            mapped to indices through `ent2ix` / `rel2ix`.
        kg: dict, optional
            Must contain the keys 'heads', 'tails' and 'relations'; an
            optional 'point' entry is copied to `self.point`. Values are
            stored as-is (presumably index tensors - confirm with callers).
        ent2ix: dict, optional
            Mapping from entity label to index. Mandatory when building from
            `kg`; computed with `get_dictionaries(df, ent=True)` if omitted.
        rel2ix: dict, optional
            Mapping from relation label to index. Mandatory when building
            from `kg`; computed with `get_dictionaries(df, ent=False)` if
            omitted.
        dict_of_heads, dict_of_tails, dict_of_rel: dict, optional
            Stored as-is when all three are provided; otherwise all three are
            rebuilt by `self.evaluate_dicts()`.
        id2point: optional
            Stored as `self.id2point` when not None. It is overwritten by the
            result of `self.load_point(geo)` when both `geo` and `df` are
            given.
        geo: optional
            Stored as `self.geo` and handed to `self.load_point` when `df` is
            given.

        Raises
        ------
        WrongArgumentsError
            If neither or both of `df` and `kg` are given, if `kg` is not a
            dict holding the three required keys, or if `kg` is given without
            both `ent2ix` and `rel2ix`.
        SanityError
            If `self.sanity_check()` raises an `AssertionError`.

        """

        # Explicit checks instead of `assert`: assertions are stripped when
        # Python runs with -O, which would silently disable the validation.
        if df is None and kg is None:
            raise WrongArgumentsError("Please provide at least one "
                                      "argument of `df` and kg`")
        if df is not None and kg is not None:
            raise WrongArgumentsError("`df` and kg` arguments should not "
                                      "both be provided.")

        if kg is not None:
            # `and` short-circuits, so a non-dict `kg` is reported through
            # WrongArgumentsError rather than failing on a missing `.keys()`.
            has_required_keys = (isinstance(kg, dict)
                                 and 'heads' in kg
                                 and 'tails' in kg
                                 and 'relations' in kg)
            if not has_required_keys:
                raise WrongArgumentsError("Keys in the `kg` dict should "
                                          "contain `heads`, `tails`, "
                                          "`relations`.")
            if rel2ix is None or ent2ix is None:
                raise WrongArgumentsError("Please provide the two "
                                          "dictionaries ent2ix and rel2ix "
                                          "if building from `kg`.")

        # From here on, a missing dictionary implies that `df` is available.
        if ent2ix is None:
            self.ent2ix = get_dictionaries(df, ent=True)
        else:
            self.ent2ix = ent2ix

        if rel2ix is None:
            self.rel2ix = get_dictionaries(df, ent=False)
        else:
            self.rel2ix = rel2ix

        # NOTE: `self.id2point` is left undefined when no value is supplied.
        if id2point is not None:
            self.id2point = id2point

        # Indices are taken to start at 0, hence the +1 on the largest one.
        self.n_ent = max(self.ent2ix.values()) + 1
        self.n_rel = max(self.rel2ix.values()) + 1
        self.geo = geo

        if df is not None:
            # build kg from a pandas dataframe
            self.n_facts = len(df)
            self.head_idx = tensor(df['from'].map(self.ent2ix).values).long()
            self.tail_idx = tensor(df['to'].map(self.ent2ix).values).long()
            self.relations = tensor(df['rel'].map(self.rel2ix).values).long()
        else:
            # build kg from another kg
            self.n_facts = kg['heads'].shape[0]
            self.head_idx = kg['heads']
            self.tail_idx = kg['tails']
            self.relations = kg['relations']
            # 'point' is optional: `self.point` stays undefined without it.
            if 'point' in kg:
                self.point = kg['point']

        if (geo is not None) and (df is not None):  # Geo
            self.entity2point, self.id2point = self.load_point(geo)
            # One (head point, tail point) pair per row; assumes the first
            # and third columns of `df` hold head and tail - TODO confirm.
            self.point = np.array([[
                self.entity2point[triplet[0]], self.entity2point[triplet[2]]
            ] for triplet in df.values])

        if dict_of_heads is None or dict_of_tails is None or dict_of_rel is None:
            # All three dictionaries are rebuilt as soon as one is missing.
            self.dict_of_heads = defaultdict(set)
            self.dict_of_tails = defaultdict(set)
            self.dict_of_rel = defaultdict(set)
            self.evaluate_dicts()
        else:
            self.dict_of_heads = dict_of_heads
            self.dict_of_tails = dict_of_tails
            self.dict_of_rel = dict_of_rel

        try:
            self.sanity_check()
        except AssertionError:
            raise SanityError("Please check the sanity of arguments.")
    def split_kg(self, share=0.8, sizes=None, validation=False):
        """Split the knowledge graph into train, (validation) and test graphs.

        If `sizes` is provided the facts are split by position, in their
        stored order, and `validation` is ignored. Otherwise the masks come
        from `self.get_mask(share, validation=...)`, which is expected to
        draw the split at random while keeping at least one fact involving
        each entity and relation in the training subset (that guarantee lives
        in `get_mask`, not in this method).

        Every returned graph receives this graph's `ent2ix`, `rel2ix`,
        `dict_of_heads` and `dict_of_tails` objects (no copy is made).

        Parameters
        ----------
        share: float
            Percentage to allocate to train set. Only used when `sizes` is
            None.
        sizes: tuple
            Tuple of ints of length 2 or 3. If len(sizes) == 2, then the
            first sizes[0] values of the knowledge graph will be used as
            training set and the rest as test set. If len(sizes) == 3, the
            first sizes[0] values of the knowledge graph will be used as
            training set, the following sizes[1] as validation set and the
            last sizes[2] as testing set.
        validation: bool
            Indicate if a validation set should be produced along with train
            and test sets. Only used when `sizes` is None.

        Returns
        -------
        train_kg: `torchkge.data.KnowledgeGraph.KnowledgeGraph`
        val_kg: `torchkge.data.KnowledgeGraph.KnowledgeGraph`, optional
        test_kg: `torchkge.data.KnowledgeGraph.KnowledgeGraph`

        Raises
        ------
        SizeMismatchError
            If `sizes` is given and its length is neither 2 nor 3.
        WrongArgumentsError
            If `sizes` does not sum to the number of facts.
        AssertionError
            If `sizes` is None and `share` is not strictly below 1.

        """
        # TODO: assert that all relations in test appear as well in validation (for triplet classification)
        if sizes is not None:
            # Explicit checks: `assert` would be stripped under `python -O`.
            if len(sizes) not in (2, 3):
                raise SizeMismatchError(
                    'Tuple `sizes` should be of length 2 or 3.')
            if sum(sizes) != self.n_facts:
                raise WrongArgumentsError(
                    'Sizes should sum to the number of facts.')
        else:
            # Kept as an assertion so callers still see an AssertionError.
            assert share < 1

        def subgraph(mask):
            # Graph restricted to the facts selected by the boolean `mask`.
            return KnowledgeGraph(
                kg={
                    'heads': self.head_idx[mask],
                    'tails': self.tail_idx[mask],
                    'relations': self.relations[mask]
                },
                ent2ix=self.ent2ix,
                rel2ix=self.rel2ix,
                dict_of_heads=self.dict_of_heads,
                dict_of_tails=self.dict_of_tails)

        if sizes is None:
            if validation:
                mask_tr, mask_val, mask_te = self.get_mask(share,
                                                           validation=True)
                return subgraph(mask_tr), subgraph(mask_val), subgraph(mask_te)
            mask_tr, mask_te = self.get_mask(share, validation=False)
            return subgraph(mask_tr), subgraph(mask_te)

        if len(sizes) == 3:
            # return training, validation and a testing graphs
            mask_tr = cat([
                tensor([1 for _ in range(sizes[0])]),
                tensor([0 for _ in range(sizes[1] + sizes[2])])
            ]).bool()
            mask_val = cat([
                tensor([0 for _ in range(sizes[0])]),
                tensor([1 for _ in range(sizes[1])]),
                tensor([0 for _ in range(sizes[2])])
            ]).bool()
            # Whatever is neither train nor validation goes to test.
            mask_te = ~(mask_tr | mask_val)
            return subgraph(mask_tr), subgraph(mask_val), subgraph(mask_te)

        # return training and testing graphs
        mask_tr = cat([
            tensor([1 for _ in range(sizes[0])]),
            tensor([0 for _ in range(sizes[1])])
        ]).bool()
        mask_te = ~mask_tr
        return subgraph(mask_tr), subgraph(mask_te)
Ejemplo n.º 4
0
    def __init__(
        self,
        df=None,
        kg=None,
        ent2ix=None,
        rel2ix=None,
        dict_of_heads=None,
        dict_of_tails=None,
    ):
        """Build the graph from a dataframe or from a dict of index tensors.

        Exactly one of `df` and `kg` must be provided.

        Parameters
        ----------
        df: `pandas.DataFrame`, optional
            Facts to load. The columns `from`, `to` and `rel` are mapped to
            indices through `ent2ix` / `rel2ix`; the column `how-much` is
            stored as a float64 tensor in `self.magnitudes`.
        kg: dict, optional
            Must contain the keys "heads", "tails" and "relations". A
            "magnitudes" entry is read as well but is not part of the
            up-front validation, so omitting it raises `KeyError`. Values
            are stored as-is (presumably tensors - confirm with callers).
        ent2ix: dict, optional
            Mapping from entity label to index. Mandatory when building from
            `kg`; computed with `get_dictionaries(df, ent=True)` if omitted.
        rel2ix: dict, optional
            Mapping from relation label to index. Mandatory when building
            from `kg`; computed with `get_dictionaries(df, ent=False)` if
            omitted.
        dict_of_heads, dict_of_tails: dict, optional
            Stored as-is when both are provided; otherwise both are rebuilt
            by `self.evaluate_dicts()`.

        Raises
        ------
        WrongArgumentsError
            If neither or both of `df` and `kg` are given, if `kg` is not a
            dict holding the three required keys, or if `kg` is given without
            both `ent2ix` and `rel2ix`.
        KeyError
            If `kg` is given without a "magnitudes" entry.
        SanityError
            If `self.sanity_check()` raises an `AssertionError`.

        """

        # Explicit checks instead of `assert`: assertions are stripped when
        # Python runs with -O, which would silently disable the validation.
        if df is None and kg is None:
            raise WrongArgumentsError(
                "Please provide at least one " "argument of `df` and kg`"
            )
        if df is not None and kg is not None:
            raise WrongArgumentsError(
                "`df` and kg` arguments should not " "both be provided."
            )

        if kg is not None:
            # `and` short-circuits, so a non-dict `kg` is reported through
            # WrongArgumentsError rather than failing on a missing `.keys()`.
            has_required_keys = (
                isinstance(kg, dict)
                and "heads" in kg
                and "tails" in kg
                and "relations" in kg
            )
            if not has_required_keys:
                raise WrongArgumentsError(
                    "Keys in the `kg` dict should "
                    "contain `heads`, `tails`, "
                    "`relations`."
                )
            if rel2ix is None or ent2ix is None:
                raise WrongArgumentsError(
                    "Please provide the two "
                    "dictionaries ent2ix and rel2ix "
                    "if building from `kg`."
                )

        # From here on, a missing dictionary implies that `df` is available.
        if ent2ix is None:
            self.ent2ix = get_dictionaries(df, ent=True)
        else:
            self.ent2ix = ent2ix

        if rel2ix is None:
            self.rel2ix = get_dictionaries(df, ent=False)
        else:
            self.rel2ix = rel2ix

        # Indices are taken to start at 0, hence the +1 on the largest one.
        self.n_ent = max(self.ent2ix.values()) + 1
        self.n_rel = max(self.rel2ix.values()) + 1

        if df is not None:
            # build kg from a pandas dataframe
            self.n_facts = len(df)
            self.head_idx = tensor(df["from"].map(self.ent2ix).values).long()
            self.tail_idx = tensor(df["to"].map(self.ent2ix).values).long()
            self.relations = tensor(df["rel"].map(self.rel2ix).values).long()
            # Hand torch the underlying array, like the columns above: the
            # conversion then never goes through the Series' label index.
            self.magnitudes = tensor(df["how-much"].values, dtype=float64)

        else:
            # build kg from another kg
            self.n_facts = kg["heads"].shape[0]
            self.head_idx = kg["heads"]
            self.tail_idx = kg["tails"]
            self.relations = kg["relations"]
            self.magnitudes = kg["magnitudes"]

        if dict_of_heads is None or dict_of_tails is None:
            # Both dictionaries are rebuilt as soon as one of them is missing.
            self.dict_of_heads = defaultdict(set)
            self.dict_of_tails = defaultdict(set)
            self.evaluate_dicts()
        else:
            self.dict_of_heads = dict_of_heads
            self.dict_of_tails = dict_of_tails

        try:
            self.sanity_check()
        except AssertionError:
            raise SanityError("Please check the sanity of arguments.")