Exemple #1
0
class YagoTypeSimilarity(WordNetSimilarity):
    """Extend the WordNet synset to linked data through YAGO mappings.

    The mapping records read from ``mappings`` are indexed twice: by their
    ``'offset'`` id and by their ``'yago_dbpedia'`` URI, so a synset can be
    translated to its YAGO / DBpedia link and a YAGO URI back to a synset.
    The ``*_graph`` metrics use the information content (IC) that ``GraphIC``
    reports for the YAGO concept mapped to a synset.
    """

    def __init__(self,
                 graph_ic='models/yago_type_ic.txt',
                 mappings="models/type-linkings.txt"):
        """
        :param graph_ic: path handed to ``GraphIC``
        :param mappings: path read with ``FileIO.read_json_file``; every
            record is indexed by its ``'offset'`` and ``'yago_dbpedia'`` keys
        """
        WordNetSimilarity.__init__(self)
        self._graph_ic = GraphIC(graph_ic)
        self._mappings = FileIO.read_json_file(mappings)
        self._id2mappings = {data['offset']: data for data in self._mappings}
        self._yago2id = {
            data['yago_dbpedia']: data['offset']
            for data in self._mappings
        }

    def synset2id(self, synset):
        """
        Build the mapping id of a synset.
        :param synset: object exposing ``offset()``
        :return: the offset plus 100000000, as a string
        """
        return str(synset.offset() + 100000000)

    def id2synset(self, offset):
        """
        Inverse of synset2id: look up the noun synset for a mapping id.
        :param offset: mapping id string; its first character is dropped and
            the remainder is parsed as the WordNet offset
        :return: the noun synset with that offset
        """
        wordnet_offset = int(offset[1:])
        return wn._synset_from_pos_and_offset('n', wordnet_offset)

    def synset2mapping(self, synset, key):
        """
        Read one field of the mapping record that belongs to a synset.
        :param synset: synset to look up
        :param key: field name inside the mapping record
        :return: the field value, or None if the synset has no record or the
            record has no such field
        """
        mapping = self._id2mappings.get(self.synset2id(synset))
        if mapping is None:
            return None
        return mapping.get(key)

    def synset2yago(self, synset):
        """Return the 'yago_dbpedia' link of a synset, or None."""
        return self.synset2mapping(synset, 'yago_dbpedia')

    def synset2dbpedia(self, synset):
        """Return the 'dbpedia' link of a synset, or None."""
        return self.synset2mapping(synset, 'dbpedia')

    def yago2synset(self, yago):
        """
        Map a YAGO concept uri back to its synset.
        :param yago: yago concept uri
        :return: the synset, or None if the uri is not in the mappings
        """
        if yago in self._yago2id:
            return self.id2synset(self._yago2id[yago])
        return None

    def word2dbpedia(self, word):
        """Return the DBpedia links of all synsets of a word, skipping synsets without one."""
        # Resolve each synset once instead of once for the filter and once
        # for the value.
        links = (self.synset2dbpedia(s) for s in self.word2synset(word))
        return [link for link in links if link]

    def word2yago(self, word):
        """Return the YAGO links of all synsets of a word, skipping synsets without one."""
        links = (self.synset2yago(s) for s in self.word2synset(word))
        return [link for link in links if link]

    def yago_similarity(self, yago1, yago2, name='wpath'):
        """
        Compute semantic similarity of two yago concepts by mapping concept uri to wordnet synset.
        :param yago1: yago concept uri
        :param yago2: yago concept uri
        :param name: name of semantic similarity metric
        :return: semantic similarity score if both uri can be mapped to synsets, otherwise 0.
        """
        s1 = self.yago2synset(yago1)
        s2 = self.yago2synset(yago2)
        if s1 and s2:
            return self.similarity(s1, s2, name)
        return 0.0

    def word_similarity_wpath_graph(self, w1, w2, k):
        """
        Best wpath_graph score over every synset pair of two words.
        :param w1: first word
        :param w2: second word
        :param k: weighting parameter forwarded to wpath_graph
        :return: the maximum pairwise score, or 0 if there is no pair
        """
        s1 = self.word2synset(w1)
        s2 = self.word2synset(w2)
        # The trailing [0] keeps max() defined when either word has no synset.
        return max([self.wpath_graph(c1, c2, k) for c1 in s1
                    for c2 in s2] + [0])

    def _synset_ic(self, synset):
        """Return the graph-based IC of the YAGO concept mapped to a synset."""
        return self._graph_ic.concept_ic(self.synset2yago(synset))

    def res_graph(self, c1, c2):
        """
        Resnik-style similarity: the graph-based IC of the least common subsumer.
        :param c1: first synset
        :param c2: second synset
        :return: IC of the least common subsumer's YAGO concept
        """
        lcs = self.least_common_subsumer(c1, c2)
        return self._synset_ic(lcs)

    def lin_graph(self, c1, c2):
        """
        Lin-style similarity on graph-based IC: 2 * IC(lcs) / (IC(c1) + IC(c2)).
        :param c1: first synset
        :param c2: second synset
        :return: the score, or 0.0 when either concept has an IC of 0
        """
        lcs = self.least_common_subsumer(c1, c2)
        lcs_ic = self._synset_ic(lcs)
        c1_ic = self._synset_ic(c1)
        c2_ic = self._synset_ic(c2)
        if c1_ic == 0.0 or c2_ic == 0.0:
            return 0.0
        return 2.0 * lcs_ic / (c1_ic + c2_ic)

    def jcn_graph(self, c1, c2):
        """
        Jiang-Conrath-style similarity on graph-based IC:
        1 / (1 + IC(c1) + IC(c2) - 2 * IC(lcs)).
        :param c1: first synset
        :param c2: second synset
        :return: the score, or 0.0 when either concept has an IC of 0
        """
        lcs = self.least_common_subsumer(c1, c2)
        lcs_ic = self._synset_ic(lcs)
        c1_ic = self._synset_ic(c1)
        c2_ic = self._synset_ic(c2)
        if c1_ic == 0.0 or c2_ic == 0.0:
            return 0.0
        distance = c1_ic + c2_ic - 2.0 * lcs_ic
        # The whole "1 + distance" must be the divisor. Without the
        # parentheses "/" binds first and the result is 1.0 + distance,
        # which grows with the distance instead of shrinking.
        return 1.0 / (1 + distance)

    def wpath_graph(self, c1, c2, k=0.9):
        """
        wpath similarity: the shortest path between the two synsets, weighted
        by k raised to the graph-based IC of their least common subsumer.
        :param c1: first synset
        :param c2: second synset
        :param k: base of the IC weight
        :return: 1 / (1 + path * k ** IC(lcs))
        """
        lcs = self.least_common_subsumer(c1, c2)
        # NOTE(review): NLTK documents that shortest_path_distance may return
        # None when no path exists; that would fail in the product below.
        # Confirm callers only pass connected synsets.
        path = c1.shortest_path_distance(c2)
        weight = k**self._synset_ic(lcs)
        return 1.0 / (1 + path * weight)
Exemple #2
0
class YagoTypeSimilarity(WordNetSimilarity):

    """Extend the WordNet synset to linked data through YAGO mappings.

    The mapping records read from ``mappings`` are indexed twice: by their
    ``'offset'`` id and by their ``'yago_dbpedia'`` URI, so a synset can be
    translated to its YAGO / DBpedia link and a YAGO URI back to a synset.
    The ``*_graph`` metrics use the information content (IC) that ``GraphIC``
    reports for the YAGO concept mapped to a synset.
    """

    def __init__(self, graph_ic='models/yago_type_ic.txt', mappings="models/type-linkings.txt"):
        """
        :param graph_ic: path handed to ``GraphIC``
        :param mappings: path read with ``FileIO.read_json_file``; every
            record is indexed by its ``'offset'`` and ``'yago_dbpedia'`` keys
        """
        WordNetSimilarity.__init__(self)
        self._graph_ic = GraphIC(graph_ic)
        self._mappings = FileIO.read_json_file(mappings)
        self._id2mappings = {data['offset']: data for data in self._mappings}
        self._yago2id = {data['yago_dbpedia']: data['offset'] for data in self._mappings}

    def synset2id(self, synset):
        """
        Build the mapping id of a synset.
        :param synset: object exposing ``offset()``
        :return: the offset plus 100000000, as a string
        """
        return str(synset.offset() + 100000000)

    def id2synset(self, offset):
        """
        Inverse of synset2id: look up the noun synset for a mapping id.
        :param offset: mapping id string; its first character is dropped and
            the remainder is parsed as the WordNet offset
        :return: the noun synset with that offset
        """
        wordnet_offset = int(offset[1:])
        return wn._synset_from_pos_and_offset('n', wordnet_offset)

    def synset2mapping(self, synset, key):
        """
        Read one field of the mapping record that belongs to a synset.
        :param synset: synset to look up
        :param key: field name inside the mapping record
        :return: the field value, or None if the synset has no record or the
            record has no such field
        """
        mapping = self._id2mappings.get(self.synset2id(synset))
        if mapping is None:
            return None
        return mapping.get(key)

    def synset2yago(self, synset):
        """Return the 'yago_dbpedia' link of a synset, or None."""
        return self.synset2mapping(synset, 'yago_dbpedia')

    def synset2dbpedia(self, synset):
        """Return the 'dbpedia' link of a synset, or None."""
        return self.synset2mapping(synset, 'dbpedia')

    def yago2synset(self, yago):
        """
        Map a YAGO concept uri back to its synset.
        :param yago: yago concept uri
        :return: the synset, or None if the uri is not in the mappings
        """
        if yago in self._yago2id:
            return self.id2synset(self._yago2id[yago])
        return None

    def word2dbpedia(self, word):
        """Return the DBpedia links of all synsets of a word, skipping synsets without one."""
        # Resolve each synset once instead of once for the filter and once
        # for the value.
        links = (self.synset2dbpedia(s) for s in self.word2synset(word))
        return [link for link in links if link]

    def word2yago(self, word):
        """Return the YAGO links of all synsets of a word, skipping synsets without one."""
        links = (self.synset2yago(s) for s in self.word2synset(word))
        return [link for link in links if link]

    def yago_similarity(self, yago1, yago2, name='wpath'):
        """
        Compute semantic similarity of two yago concepts by mapping concept uri to wordnet synset.
        :param yago1: yago concept uri
        :param yago2: yago concept uri
        :param name: name of semantic similarity metric
        :return: semantic similarity score if both uri can be mapped to synsets, otherwise 0.
        """
        s1 = self.yago2synset(yago1)
        s2 = self.yago2synset(yago2)
        if s1 and s2:
            return self.similarity(s1, s2, name)
        return 0.0

    def word_similarity_wpath_graph(self, w1, w2, k):
        """
        Best wpath_graph score over every synset pair of two words.
        :param w1: first word
        :param w2: second word
        :param k: weighting parameter forwarded to wpath_graph
        :return: the maximum pairwise score, or 0 if there is no pair
        """
        s1 = self.word2synset(w1)
        s2 = self.word2synset(w2)
        # The trailing [0] keeps max() defined when either word has no synset.
        return max([self.wpath_graph(c1, c2, k) for c1 in s1 for c2 in s2] + [0])

    def _synset_ic(self, synset):
        """Return the graph-based IC of the YAGO concept mapped to a synset."""
        return self._graph_ic.concept_ic(self.synset2yago(synset))

    def res_graph(self, c1, c2):
        """
        Resnik-style similarity: the graph-based IC of the least common subsumer.
        :param c1: first synset
        :param c2: second synset
        :return: IC of the least common subsumer's YAGO concept
        """
        lcs = self.least_common_subsumer(c1, c2)
        return self._synset_ic(lcs)

    def lin_graph(self, c1, c2):
        """
        Lin-style similarity on graph-based IC: 2 * IC(lcs) / (IC(c1) + IC(c2)).
        :param c1: first synset
        :param c2: second synset
        :return: the score, or 0.0 when either concept has an IC of 0
        """
        lcs = self.least_common_subsumer(c1, c2)
        lcs_ic = self._synset_ic(lcs)
        c1_ic = self._synset_ic(c1)
        c2_ic = self._synset_ic(c2)
        if c1_ic == 0.0 or c2_ic == 0.0:
            return 0.0
        return 2.0 * lcs_ic / (c1_ic + c2_ic)

    def jcn_graph(self, c1, c2):
        """
        Jiang-Conrath-style similarity on graph-based IC:
        1 / (1 + IC(c1) + IC(c2) - 2 * IC(lcs)).
        :param c1: first synset
        :param c2: second synset
        :return: the score, or 0.0 when either concept has an IC of 0
        """
        lcs = self.least_common_subsumer(c1, c2)
        lcs_ic = self._synset_ic(lcs)
        c1_ic = self._synset_ic(c1)
        c2_ic = self._synset_ic(c2)
        if c1_ic == 0.0 or c2_ic == 0.0:
            return 0.0
        distance = c1_ic + c2_ic - 2.0 * lcs_ic
        # The whole "1 + distance" must be the divisor. Without the
        # parentheses "/" binds first and the result is 1.0 + distance,
        # which grows with the distance instead of shrinking.
        return 1.0 / (1 + distance)

    def wpath_graph(self, c1, c2, k=0.9):
        """
        wpath similarity: the shortest path between the two synsets, weighted
        by k raised to the graph-based IC of their least common subsumer.
        :param c1: first synset
        :param c2: second synset
        :param k: base of the IC weight
        :return: 1 / (1 + path * k ** IC(lcs))
        """
        lcs = self.least_common_subsumer(c1, c2)
        # NOTE(review): NLTK documents that shortest_path_distance may return
        # None when no path exists; that would fail in the product below.
        # Confirm callers only pass connected synsets.
        path = c1.shortest_path_distance(c2)
        weight = k ** self._synset_ic(lcs)
        return 1.0 / (1 + path * weight)
Exemple #3
0
class ConceptSimilarity:
    """
    This class is used to compute taxonomical semantic similarity scores between
    concepts that are located in a concept taxonomy. A taxonomy object needs to be passed into
    this class in order to find the structural information of concepts such as depth, path length,
    and so on. The graph-based IC is needed for semantic similarity measures wpath, res, lin, jcn.

    Concepts are translated to taxonomy node ids through ``taxonomy._node2id``
    and back through ``taxonomy._nodes``. The string ``'root'`` stands for a
    least common subsumer whose node id lies outside ``taxonomy._nodes``.
    """
    def __init__(self, taxonomy, ic_file):
        """
        :param taxonomy: taxonomy object providing ``_nodes``, ``_node2id``,
            ``_labels`` and the structural queries used below
        :param ic_file: path handed to ``GraphIC``
        """
        self._taxonomy = taxonomy
        self._concepts = taxonomy._nodes
        self._concept2node = taxonomy._node2id
        # The i-th label names the i-th concept.
        self._label2concepts = {
            label: self._concepts[i]
            for i, label in enumerate(taxonomy._labels)
        }
        self._graph_ic = GraphIC(ic_file)

    def hyponyms(self, concept):
        """
        :param concept: a concept of the taxonomy
        :return: the concepts the taxonomy reports as hyponyms, or [] if the
            concept is unknown
        """
        if concept in self._concept2node:
            nodes = self._taxonomy.hyponyms(self._concept2node[concept])
            return [self._concepts[n] for n in nodes]
        return []

    def hypernyms(self, concept):
        """
        :param concept: a concept of the taxonomy
        :return: the concepts the taxonomy reports as hypernyms, or [] if the
            concept is unknown
        """
        if concept in self._concept2node:
            nodes = self._taxonomy.hypernyms(self._concept2node[concept])
            return [self._concepts[n] for n in nodes]
        return []

    def shortest_path_length(self, concept1, concept2):
        """
        :param concept1: a known concept
        :param concept2: a known concept
        :return: the taxonomy's shortest path length between the two nodes
        :raises KeyError: if either concept is not in the taxonomy
        """
        n1 = self._concept2node[concept1]
        n2 = self._concept2node[concept2]
        return self._taxonomy.shortest_path_length(n1, n2)

    def depth(self, concept):
        """
        :param concept: a known concept, or 'root'
        :return: 1 for 'root', otherwise the depth reported by the taxonomy
        """
        if concept == 'root':
            return 1
        n = self._concept2node[concept]
        return self._taxonomy.depth(n)

    def least_common_subsumer(self, concept1, concept2):
        """
        :param concept1: a known concept
        :param concept2: a known concept
        :return: the least common subsumer concept, or 'root' when the
            taxonomy returns a node id that is not an index of the concepts
        """
        n1 = self._concept2node[concept1]
        n2 = self._concept2node[concept2]
        n = self._taxonomy.least_common_subsumer(n1, n2)
        # ">=" rather than ">": an id equal to len() is already past the last
        # valid index and would otherwise raise IndexError below.
        if n >= len(self._concepts):
            return 'root'
        return self._concepts[n]

    def method(self, name):
        """
        :param name: name of a metric method of this class (e.g. 'wpath')
        :return: a function of two concepts returning the absolute value of
            that metric's score
        """
        def function(c1, c2):
            score = getattr(self, name)(c1, c2)
            return abs(score)

        return function

    def name2concept(self, name):
        """
        :param name: a concept label
        :return: the concept registered for the label, or [] if there is none
        """
        return self._label2concepts.get(name, [])

    def concept_ic(self, concept):
        """
        Get the graph-based IC of a concept. the ic of virtual root is 0
        :param concept: a concept, or 'root' for the virtual root
        :return: the ic value of concept
        """
        if concept == 'root':
            return 0.0
        return self._graph_ic.concept_ic(concept)

    @memoized
    def similarity(self, c1, c2, name='wpath'):
        """
        Compute semantic similarity between two concepts
        :param c1: first concept
        :param c2: second concept
        :param name: name of the metric method to apply
        :return: the absolute metric score, or the string 'link error' when
            either concept is not in the taxonomy
        """
        if c1 not in self._concept2node or c2 not in self._concept2node:
            return 'link error'
        return self.method(name)(c1, c2)

    def path(self, c1, c2):
        """
        Rada's shortest path based similarity metric
        :param c1: first concept
        :param c2: second concept
        :return: reciprocal of the shortest path length
        """
        return 1.0 / self.shortest_path_length(c1, c2)

    def wup(self, c1, c2):
        """
        Wu and Palm's similarity metric
        :param c1: first concept
        :param c2: second concept
        :return: 2 * depth(lcs) / (depth(c1) + depth(c2))
        """
        lcs = self.least_common_subsumer(c1, c2)
        depth_c1 = self.depth(c1)
        depth_c2 = self.depth(c2)
        depth_lcs = self.depth(lcs)
        return 2.0 * depth_lcs / (depth_c1 + depth_c2)

    def li(self, c1, c2, alpha=0.2, beta=0.6):
        """
        Li-style metric combining path length and subsumer depth
        :param c1: first concept
        :param c2: second concept
        :param alpha: decay rate applied to the path length
        :param beta: scaling applied to the depth of the least common subsumer
        :return: exp(-alpha * path) * tanh(beta * depth)
        """
        path = self.shortest_path_length(c1, c2) - 1
        lcs = self.least_common_subsumer(c1, c2)
        depth = self.depth(lcs)
        length_factor = math.exp(-alpha * path)
        grow = math.exp(beta * depth)
        decay = math.exp(-beta * depth)
        # (e^x - e^-x) / (e^x + e^-x) is tanh(x) with x = beta * depth.
        return length_factor * ((grow - decay) / (grow + decay))

    def res(self, c1, c2):
        """
        Resnik-style metric
        :param c1: first concept
        :param c2: second concept
        :return: the IC of the least common subsumer
        """
        lcs = self.least_common_subsumer(c1, c2)
        return self.concept_ic(lcs)

    def lin(self, c1, c2):
        """
        Lin-style metric: 2 * IC(lcs) / (IC(c1) + IC(c2))
        :param c1: first concept
        :param c2: second concept
        :return: the score, or 0.0 when either concept has an IC of 0
        """
        lcs = self.least_common_subsumer(c1, c2)
        lcs_ic = self.concept_ic(lcs)
        c1_ic = self.concept_ic(c1)
        c2_ic = self.concept_ic(c2)
        if c1_ic == 0.0 or c2_ic == 0.0:
            return 0.0
        return 2.0 * lcs_ic / (c1_ic + c2_ic)

    def jcn(self, c1, c2):
        """
        Jiang-Conrath-style metric: 1 / (1 + IC(c1) + IC(c2) - 2 * IC(lcs))
        :param c1: first concept
        :param c2: second concept
        :return: the score, or 0.0 when either concept has an IC of 0
        """
        lcs = self.least_common_subsumer(c1, c2)
        lcs_ic = self.concept_ic(lcs)
        c1_ic = self.concept_ic(c1)
        c2_ic = self.concept_ic(c2)
        if c1_ic == 0.0 or c2_ic == 0.0:
            return 0.0
        distance = c1_ic + c2_ic - 2.0 * lcs_ic
        # The whole "1 + distance" must be the divisor. Without the
        # parentheses "/" binds first and the result is 1.0 + distance,
        # which grows with the distance instead of shrinking.
        return 1.0 / (1 + distance)

    def wpath(self, c1, c2, k=0.8):
        """
        wpath metric: path length weighted by k raised to the IC of the
        least common subsumer
        :param c1: first concept
        :param c2: second concept
        :param k: base of the IC weight
        :return: 1 / (1 + path * k ** IC(lcs))
        """
        lcs = self.least_common_subsumer(c1, c2)
        path = self.shortest_path_length(c1, c2) - 1
        weight = k**self.concept_ic(lcs)
        return 1.0 / (1 + path * weight)
Exemple #4
0
class ConceptSimilarity:
    """
    This class is used to compute taxonomical semantic similarity scores between
    concepts that are located in a concept taxonomy. A taxonomy object needs to be passed into
    this class in order to find the structural information of concepts such as depth, path length,
    and so on. The graph-based IC is needed for semantic similarity measures wpath, res, lin, jcn.

    Concepts are translated to taxonomy node ids through ``taxonomy._node2id``
    and back through ``taxonomy._nodes``. The string ``'root'`` stands for a
    least common subsumer whose node id lies outside ``taxonomy._nodes``.
    """
    def __init__(self, taxonomy, ic_file):
        """
        :param taxonomy: taxonomy object providing ``_nodes``, ``_node2id``,
            ``_labels`` and the structural queries used below
        :param ic_file: path handed to ``GraphIC``
        """
        self._taxonomy = taxonomy
        self._concepts = taxonomy._nodes
        self._concept2node = taxonomy._node2id
        # The i-th label names the i-th concept.
        self._label2concepts = {label: self._concepts[i] for i, label in enumerate(taxonomy._labels)}
        self._graph_ic = GraphIC(ic_file)

    def hyponyms(self, concept):
        """
        :param concept: a concept of the taxonomy
        :return: the concepts the taxonomy reports as hyponyms, or [] if the
            concept is unknown
        """
        if concept in self._concept2node:
            nodes = self._taxonomy.hyponyms(self._concept2node[concept])
            return [self._concepts[n] for n in nodes]
        return []

    def hypernyms(self, concept):
        """
        :param concept: a concept of the taxonomy
        :return: the concepts the taxonomy reports as hypernyms, or [] if the
            concept is unknown
        """
        if concept in self._concept2node:
            nodes = self._taxonomy.hypernyms(self._concept2node[concept])
            return [self._concepts[n] for n in nodes]
        return []

    def shortest_path_length(self, concept1, concept2):
        """
        :param concept1: a known concept
        :param concept2: a known concept
        :return: the taxonomy's shortest path length between the two nodes
        :raises KeyError: if either concept is not in the taxonomy
        """
        n1 = self._concept2node[concept1]
        n2 = self._concept2node[concept2]
        return self._taxonomy.shortest_path_length(n1, n2)

    def depth(self, concept):
        """
        :param concept: a known concept, or 'root'
        :return: 1 for 'root', otherwise the depth reported by the taxonomy
        """
        if concept == 'root':
            return 1
        n = self._concept2node[concept]
        return self._taxonomy.depth(n)

    def least_common_subsumer(self, concept1, concept2):
        """
        :param concept1: a known concept
        :param concept2: a known concept
        :return: the least common subsumer concept, or 'root' when the
            taxonomy returns a node id that is not an index of the concepts
        """
        n1 = self._concept2node[concept1]
        n2 = self._concept2node[concept2]
        n = self._taxonomy.least_common_subsumer(n1, n2)
        # ">=" rather than ">": an id equal to len() is already past the last
        # valid index and would otherwise raise IndexError below.
        if n >= len(self._concepts):
            return 'root'
        return self._concepts[n]

    def method(self, name):
        """
        :param name: name of a metric method of this class (e.g. 'wpath')
        :return: a function of two concepts returning the absolute value of
            that metric's score
        """
        def function(c1, c2):
            score = getattr(self, name)(c1, c2)
            return abs(score)
        return function

    def name2concept(self, name):
        """
        :param name: a concept label
        :return: the concept registered for the label, or [] if there is none
        """
        return self._label2concepts.get(name, [])

    def concept_ic(self, concept):
        """
        Get the graph-based IC of a concept. the ic of virtual root is 0
        :param concept: a concept, or 'root' for the virtual root
        :return: the ic value of concept
        """
        if concept == 'root':
            return 0.0
        return self._graph_ic.concept_ic(concept)

    @memoized
    def similarity(self, c1, c2, name='wpath'):
        """
        Compute semantic similarity between two concepts
        :param c1: first concept
        :param c2: second concept
        :param name: name of the metric method to apply
        :return: the absolute metric score, or the string 'link error' when
            either concept is not in the taxonomy
        """
        if c1 not in self._concept2node or c2 not in self._concept2node:
            return 'link error'
        return self.method(name)(c1, c2)

    def path(self, c1, c2):
        """
        Rada's shortest path based similarity metric
        :param c1: first concept
        :param c2: second concept
        :return: reciprocal of the shortest path length
        """
        return 1.0 / self.shortest_path_length(c1, c2)

    def wup(self, c1, c2):
        """
        Wu and Palm's similarity metric
        :param c1: first concept
        :param c2: second concept
        :return: 2 * depth(lcs) / (depth(c1) + depth(c2))
        """
        lcs = self.least_common_subsumer(c1, c2)
        depth_c1 = self.depth(c1)
        depth_c2 = self.depth(c2)
        depth_lcs = self.depth(lcs)
        return 2.0 * depth_lcs / (depth_c1 + depth_c2)

    def li(self, c1, c2, alpha=0.2, beta=0.6):
        """
        Li-style metric combining path length and subsumer depth
        :param c1: first concept
        :param c2: second concept
        :param alpha: decay rate applied to the path length
        :param beta: scaling applied to the depth of the least common subsumer
        :return: exp(-alpha * path) * tanh(beta * depth)
        """
        path = self.shortest_path_length(c1, c2) - 1
        lcs = self.least_common_subsumer(c1, c2)
        depth = self.depth(lcs)
        length_factor = math.exp(-alpha * path)
        grow = math.exp(beta * depth)
        decay = math.exp(-beta * depth)
        # (e^x - e^-x) / (e^x + e^-x) is tanh(x) with x = beta * depth.
        return length_factor * ((grow - decay) / (grow + decay))

    def res(self, c1, c2):
        """
        Resnik-style metric
        :param c1: first concept
        :param c2: second concept
        :return: the IC of the least common subsumer
        """
        lcs = self.least_common_subsumer(c1, c2)
        return self.concept_ic(lcs)

    def lin(self, c1, c2):
        """
        Lin-style metric: 2 * IC(lcs) / (IC(c1) + IC(c2))
        :param c1: first concept
        :param c2: second concept
        :return: the score, or 0.0 when either concept has an IC of 0
        """
        lcs = self.least_common_subsumer(c1, c2)
        lcs_ic = self.concept_ic(lcs)
        c1_ic = self.concept_ic(c1)
        c2_ic = self.concept_ic(c2)
        if c1_ic == 0.0 or c2_ic == 0.0:
            return 0.0
        return 2.0 * lcs_ic / (c1_ic + c2_ic)

    def jcn(self, c1, c2):
        """
        Jiang-Conrath-style metric: 1 / (1 + IC(c1) + IC(c2) - 2 * IC(lcs))
        :param c1: first concept
        :param c2: second concept
        :return: the score, or 0.0 when either concept has an IC of 0
        """
        lcs = self.least_common_subsumer(c1, c2)
        lcs_ic = self.concept_ic(lcs)
        c1_ic = self.concept_ic(c1)
        c2_ic = self.concept_ic(c2)
        if c1_ic == 0.0 or c2_ic == 0.0:
            return 0.0
        distance = c1_ic + c2_ic - 2.0 * lcs_ic
        # The whole "1 + distance" must be the divisor. Without the
        # parentheses "/" binds first and the result is 1.0 + distance,
        # which grows with the distance instead of shrinking.
        return 1.0 / (1 + distance)

    def wpath(self, c1, c2, k=0.8):
        """
        wpath metric: path length weighted by k raised to the IC of the
        least common subsumer
        :param c1: first concept
        :param c2: second concept
        :param k: base of the IC weight
        :return: 1 / (1 + path * k ** IC(lcs))
        """
        lcs = self.least_common_subsumer(c1, c2)
        path = self.shortest_path_length(c1, c2) - 1
        weight = k ** self.concept_ic(lcs)
        return 1.0 / (1 + path * weight)