Ejemplo n.º 1
0
 def __init__(self, result_limit=5000, expansion=True, show_query=False):
     """Semantic search of entities and concepts.

     :param result_limit: maximum number of retrieved entities
     :param expansion: if conduct concept expansion
     :param show_query: if SPARQL query is shown
     """
     self._expansion = expansion
     self._show_query = show_query
     # Collaborators: entity linker, phrase extractor, YAGO concept
     # similarity model, and SPARQL query builder capped at result_limit.
     self._linker = NameSPARQL()
     self._extracter = Extraction()
     self._yago = YagoTypeSimilarity()
     self._query_graph = QueryGraph(result_limit)
    def __init__(self, result_limit=5000, expansion=False, show_query=False):
        """ semantic search of entities and concepts

        :param result_limit: maximum number of retrieved entities
        :param expansion: if conduct concept expansion
        :param show_query: if SPARQL query is shown
        """
        self._expansion = expansion
        self._show_query = show_query
        # Collaborators: entity linker, phrase extractor, YAGO concept
        # similarity model, and SPARQL query builder capped at result_limit.
        self._linker = NameSPARQL()
        self._extracter = Extraction()
        self._yago = YagoTypeSimilarity()
        self._query_graph = QueryGraph(result_limit)
Ejemplo n.º 3
0
 def __init__(self, result_limit=5000, expansion=True, show_query=False):
     """Semantic search of entities and concepts.

     :param result_limit: maximum number of retrieved entities
     :param expansion: if conduct concept expansion
     :param show_query: if SPARQL query is shown
     """
     self._expansion = expansion
     self._show_query = show_query
     # Collaborators: entity linker, phrase extractor, YAGO concept
     # similarity model, and SPARQL query builder capped at result_limit.
     self._linker = NameSPARQL()
     self._extracter = Extraction()
     self._yago = YagoTypeSimilarity()
     self._query_graph = QueryGraph(result_limit)
Ejemplo n.º 4
0
    simfile.write('\n\n')

# Column 2 holds the human rating; dividing by 4.0 suggests a 0-4 scale
# rescaled to [0, 1] -- TODO confirm against the dataset format.
sim_ref = np.array(contents)[:, 2].astype(float) / 4.0
corr = pearson_correlation(sim_cal, sim_ref)

# Append the FastText correlation score to the shared results file.
with open('results.txt', 'a') as resfile:
    resfile.write(
        'pearson correlation in dataset [%s] for FastText embedding is %f\n' %
        ('STS-131', corr))

# part 8: repeat the evaluation on STSS-131 using YAGO concepts.
with open('datasets/stss-131.csv', newline='') as csvfile:
    contents = list(csv.reader(csvfile, delimiter=';'))

from sematch.semantic.similarity import YagoTypeSimilarity
sim = YagoTypeSimilarity()
# Flatten the per-pair similarity scores into a 1-D array.
sim_cal = np.array(sentence_similarity_dataset_yago(contents,
                                                    sim)).reshape(-1, )

# Dump per-pair scores (scaled back by *4) next to the human ratings.
with open('sentence_similarity.txt', 'a') as simfile:
    simfile.write('Using Yago concepts\n')
    simfile.write('s1; s2; human_sim; method_sim\n\n')
    for i, pair in enumerate(contents):
        simfile.write('%s;%s;%s;%f\n' %
                      (pair[0], pair[1], pair[2], sim_cal[i] * 4))
    simfile.write('\n\n')

# Recompute the rescaled human reference and the correlation for YAGO.
sim_ref = np.array(contents)[:, 2].astype(float) / 4.0
corr = pearson_correlation(sim_cal, sim_ref)
with open('results.txt', 'a') as resfile:
Ejemplo n.º 5
0
def test_yagotype_similarity():
    """Smoke-test word->YAGO mapping and YAGO-based similarity scoring."""
    from sematch.semantic.similarity import YagoTypeSimilarity
    yagosim = YagoTypeSimilarity()
    # Map three occupation words to their YAGO concept URIs.
    dancer, actor, singer = (yagosim.word2yago(w)
                             for w in ('dancer', 'actor', 'singer'))
    assert yagosim.yago2synset(actor[0]) is not None
    for other in (dancer, singer):
        assert yagosim.yago_similarity(other[0], actor[0], 'wpath') is not None
    assert yagosim.word2yago('university') is not None
    # Known YAGO class URIs must all resolve to a synset.
    for uri in ('http://dbpedia.org/class/yago/EducationalInstitution108276342',
                'http://dbpedia.org/class/yago/Organization108008335',
                'http://dbpedia.org/class/yago/Institution108053576',
                'http://dbpedia.org/class/yago/Organization108008335'):
        assert yagosim.yago2synset(uri) is not None
    # 'wpath' uses corpus-based IC (Brown corpus); 'wpath_graph' uses
    # graph-based IC from DBpedia.
    for metric in ('wpath', 'wpath_graph'):
        assert yagosim.word_similarity('dancer', 'actor', metric) is not None
Ejemplo n.º 6
0
 def __init__(self):
     """Build the matcher with its default collaborators."""
     # Entity linker, phrase extractor, YAGO concept similarity model,
     # and a QueryGraph with no explicit result limit passed.
     self._linker = NameSPARQL()
     self._extracter = Extraction()
     self._yago = YagoTypeSimilarity()
     self._query_graph = QueryGraph()
class Matcher:

    """This class is used for concept based entity match in DBpedia"""

    def __init__(self, result_limit=5000, expansion=False, show_query=False):
        """ semantic search of entities and concepts

        :param result_limit: maximum number of retrieved entities
        :param expansion: if conduct concept expansion
        :param show_query: if SPARQL query is shown
        """
        self._expansion = expansion
        self._show_query = show_query
        self._linker = NameSPARQL()
        self._extracter = Extraction()
        self._yago = YagoTypeSimilarity()
        self._query_graph = QueryGraph(result_limit)

    def type_links(self, word, lang='eng'):
        """Link a word to WordNet synsets and their YAGO/DBpedia URIs.

        :param word: word to look up
        :param lang: WordNet language code (default 'eng')
        :return: list of dicts with 'name', 'gloss', 'lemma' and 'lod' keys;
                 synsets with no LOD link at all are dropped.
        """
        synsets = self._yago.multilingual2synset(word, lang=lang)
        if self._expansion:
            synsets = list(set(itertools.chain.from_iterable(
                [self._yago.synset_expand(s) for s in synsets])))
        links = []
        for s in synsets:
            link_dic = {}
            link_dic['name'] = s.name()
            link_dic['gloss'] = s._definition
            link_dic['lemma'] = ' '.join(s._lemma_names)
            concept_link = []
            yago_link = self._yago.synset2yago(s)
            dbpedia_link = self._yago.synset2dbpedia(s)
            # Keep only resolvable links (plain `if` statements instead of
            # the original conditional-expression-as-statement idiom).
            if yago_link:
                concept_link.append(yago_link)
            if dbpedia_link:
                concept_link.append(dbpedia_link)
            link_dic['lod'] = concept_link
            if link_dic['lod']:
                links.append(link_dic)
        return links

    def query_process(self, query):
        """
        Process query into concept (common noun) and entity (proper noun). Link them
        to Knowledge Graph uri links respectively.
        :param query: short text query
        :return: tuple of concepts and entities in uris.
        """
        entities = self._extracter.extract_chunks_sent(query)
        # Words already covered by an entity chunk are excluded from concepts.
        entity_filter = set(itertools.chain.from_iterable(
            [e.lower().split() for e in entities]))
        concepts = list(set(self._extracter.extract_nouns(query)))
        concepts = [c for c in concepts if c not in entity_filter]
        concept_uris = [list(itertools.chain.from_iterable(
            [s['lod'] for s in self.type_links(c)])) for c in concepts]
        concept_uris = list(itertools.chain.from_iterable(concept_uris))
        entity_uris = list(itertools.chain.from_iterable(
            map(self._linker.name2entities, entities)))
        return list(set(concept_uris)), list(set(entity_uris))

    def match_concepts(self, concepts, lang='en'):
        """Query entities typed by the given concept URIs, deduplicated by uri.

        Concepts are sent in batches of 5, presumably to bound the size of
        each SPARQL query.
        """
        results = []
        # `range` instead of Python 2-only `xrange` so this runs on Python 3;
        # the semantics of this bounded loop are identical.
        for i in range(0, len(concepts), 5):
            results.extend(self._query_graph.type_query(
                concepts[i:i + 5], lang, self._show_query))
        result_dic = {}
        for res in results:
            if res['uri'] not in result_dic:
                result_dic[res['uri']] = res
        return list(result_dic.values())

    def match_type(self, query, lang='eng'):
        """Match entities by the concept types of each word in the query."""
        lang_map = {'eng':'en','spa':'es', 'cmn':'zh'}
        result_lang = lang_map[lang]
        words = query.split()
        concept_uris = []
        for w in words:
            concepts = list(itertools.chain.from_iterable(
                [s['lod'] for s in self.type_links(w, lang)]))
            concept_uris.extend(concepts)
        concept_uris = list(set(concept_uris))
        return self.match_concepts(concept_uris, result_lang)

    def match_entity_type(self, query):
        """Match entities constrained by both the concepts and the entities
        extracted from the query, deduplicated by uri."""
        results = []
        concepts, entities = self.query_process(query)
        for e in entities:
            # Same batching and Python 3 `range` fix as match_concepts.
            for i in range(0, len(concepts), 5):
                results.extend(self._query_graph.type_entity_query(
                    concepts[i:i + 5], e, self._show_query))
        result_dic = {}
        for res in results:
            if res['uri'] not in result_dic:
                result_dic[res['uri']] = res
        return list(result_dic.values())
Ejemplo n.º 8
0
def test_yago_concept_similarity():
    """Exercise YAGO concept lookup and the wpath similarity variants."""
    from sematch.semantic.similarity import YagoTypeSimilarity
    yagosim = YagoTypeSimilarity()
    dancer = yagosim.word2yago('dancer')
    actor = yagosim.word2yago('actor')
    singer = yagosim.word2yago('singer')
    assert yagosim.yago2synset(actor[0]) is not None
    # Pairwise wpath similarity against 'actor' must yield a value.
    for concept in (dancer, singer):
        assert yagosim.yago_similarity(concept[0], actor[0],
                                       'wpath') is not None
    assert yagosim.word2yago('university') is not None
    # Each of these YAGO class URIs must map back to a WordNet synset.
    institution_uris = (
        'http://dbpedia.org/class/yago/EducationalInstitution108276342',
        'http://dbpedia.org/class/yago/Organization108008335',
        'http://dbpedia.org/class/yago/Institution108053576',
        'http://dbpedia.org/class/yago/Organization108008335',
    )
    for uri in institution_uris:
        assert yagosim.yago2synset(uri) is not None
    # 'wpath': corpus-based IC (Brown corpus); 'wpath_graph': graph-based
    # IC from DBpedia.
    for method in ('wpath', 'wpath_graph'):
        assert yagosim.word_similarity('dancer', 'actor', method) is not None
Ejemplo n.º 9
0
 def __init__(self):
     # YAGO type similarity model used by this instance's methods.
     self._yago = YagoTypeSimilarity()
Ejemplo n.º 10
0
class WordSimDataset:
    """
    This class is used to prepare and separate word similarity datasets.
    """

    def __init__(self):
        # YAGO type similarity model used by the check_* predicates below.
        self._yago = YagoTypeSimilarity()

    def load_dataset(self, dataset_name):
        """
         This function loads the word similarity dataset

        :param dataset_name: the file name of word similarity dataset
        :return: word pairs and human ratings
        """
        data = FileIO.read_list_file('dataset/wordsim/%s.txt' % dataset_name)
        #print "dataset ", dataset_name, " ", len(data), " word pairs"
        # Materialize word_pairs as a list (the original returned a one-shot
        # `map` iterator that was exhausted after a single pass, while
        # `human` was already a list -- inconsistent and fragile).
        word_pairs = [(x.split()[0], x.split()[1]) for x in data]
        human = [float(x.split()[2]) for x in data]
        return word_pairs, human

    def load_result(self, sim_name, dataset_name):
        """
        This function loads the result of a similarity metric for a specific dataset

        :param sim_name: the name similarity metric
        :param dataset_name: the name of word similarity dataset
        :return: correlation score and rating scores generated by similarity metric
        """
        data = FileIO.read_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name))
        data = list(map(float, data))
        # First line is the correlation; the rest are per-pair scores.
        return data[0], data[1:]

    def save_result(self, cor, sim_values, sim_name, dataset_name):
        """
        This function save the result computed by a similarity metric
        :param cor: correlation with human rating
        :param sim_values: similarity scores for word pairs
        :param sim_name: the name of similarity metric
        :param dataset_name: the name of word similarity dataset
        :return:
        """
        # Correlation first, then one formatted score per word pair.
        data = ["%.3f" % cor]
        data += ["%.3f" % x for x in sim_values]
        FileIO.save_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name), data)

    def check_word_graph(self, w1, w2):
        """
         check if lcs word is used as type in DBpedia

        :param w1: first word
        :param w2: second word
        :return: True if the least common subsumer has graph-based IC
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        lcs = self._yago.least_common_subsumer(s1, s2)
        yago_concept = self._yago.synset2yago(lcs)
        graph_ic = self._yago._graph_ic.concept_ic(yago_concept)
        return bool(graph_ic)

    def check_word_type(self, w1, w2):
        """
        check if both words are used as type in DBpedia
        :param w1: first word
        :param w2: second word
        :return: True if both mapped concepts have graph-based IC
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        yago_concept_1 = self._yago.synset2yago(s1)
        yago_concept_2 = self._yago.synset2yago(s2)
        graph_ic_1 = self._yago._graph_ic.concept_ic(yago_concept_1)
        graph_ic_2 = self._yago._graph_ic.concept_ic(yago_concept_2)
        return bool(graph_ic_1 and graph_ic_2)

    def check_word_noun(self, w1, w2):
        """
        check if both words are in WordNet Noun Taxonomy
        :param w1: first word
        :param w2: second word
        :return: True if both words map to a noun synset
        """
        s1 = self._yago.word2synset(w1)
        s2 = self._yago.word2synset(w2)
        return bool(s1 and s2)

    def separate_dataset(self, in_file, out_file, check_function):
        """
        This function is used to separate the original word similarity dataset.

        word similarity of noun: noun_rg.txt, noun_mc.txt, noun_ws353.txt, noun_ws353-sim.txt, noun_simlex.txt

        the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt, graph_ws353.txt,
        graph_ws353-sim.txt, graph_simlex.txt

        both words are in knowledge graph:  type_rg.txt, type_mc.txt, type_ws353.txt, type_ws353-sim.txt,
        type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: the function of mapping criteria for deciding the word pairs.
        :return:
        """
        out_data = []
        word_pairs, human = self.load_dataset(in_file)
        for i, pairs in enumerate(word_pairs):
            w1, w2 = pairs
            h = human[i]
            # Keep only pairs accepted by the supplied criterion.
            if check_function(w1, w2):
                out_data.append(' '.join([w1, w2, str(h)]))
        FileIO.save_list_file('dataset/wordsim/%s.txt' % out_file, out_data)
Ejemplo n.º 11
0
 def __init__(self):
     # YAGO type similarity model used by this instance's methods.
     self._yago = YagoTypeSimilarity()
Ejemplo n.º 12
0
class WordSimDataset:
    """
    This class is used to prepare and separate word similarity datasets.
    """

    def __init__(self):
        # YAGO type similarity model used by the check_* predicates below.
        self._yago = YagoTypeSimilarity()

    def load_dataset(self, dataset_name):
        """
         This function loads the word similarity dataset

        :param dataset_name: the file name of word similarity dataset
        :return: word pairs and human ratings
        """
        data = FileIO.read_list_file('eval/word_similarity/%s.txt' % dataset_name)
        #print "dataset ", dataset_name, " ", len(data), " word pairs"
        # Materialize both sequences: on Python 3 `map` returns a one-shot
        # iterator, and separate_dataset() indexes `human[i]`, which would
        # raise TypeError on a map object.
        word_pairs = [(x.split()[0], x.split()[1]) for x in data]
        human = [float(x.split()[2]) for x in data]
        return word_pairs, human

    def load_result(self, sim_name, dataset_name):
        """
        This function loads the result of a similarity metric for a specific dataset

        :param sim_name: the name similarity metric
        :param dataset_name: the name of word similarity dataset
        :return: correlation score and rating scores generated by similarity metric
        """
        data = FileIO.read_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name))
        # list() is required: a bare map object cannot be indexed or sliced
        # (`data[0]`, `data[1:]` raised TypeError on Python 3).
        data = list(map(float, data))
        return data[0], data[1:]

    def save_result(self, cor, sim_values, sim_name, dataset_name):
        """
        This function save the result computed by a similarity metric
        :param cor: correlation with human rating
        :param sim_values: similarity scores for word pairs
        :param sim_name: the name of similarity metric
        :param dataset_name: the name of word similarity dataset
        :return:
        """
        # Correlation first, then one formatted score per word pair.
        data = ["%.3f" % cor]
        data += ["%.3f" % x for x in sim_values]
        FileIO.save_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name), data)

    def check_word_graph(self, w1, w2):
        """
         check if lcs word is used as type in DBpedia

        :param w1: first word
        :param w2: second word
        :return: True if the least common subsumer has graph-based IC
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        lcs = self._yago.least_common_subsumer(s1, s2)
        yago_concept = self._yago.synset2yago(lcs)
        graph_ic = self._yago._graph_ic.concept_ic(yago_concept)
        return bool(graph_ic)

    def check_word_type(self, w1, w2):
        """
        check if both words are used as type in DBpedia
        :param w1: first word
        :param w2: second word
        :return: True if both mapped concepts have graph-based IC
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        yago_concept_1 = self._yago.synset2yago(s1)
        yago_concept_2 = self._yago.synset2yago(s2)
        graph_ic_1 = self._yago._graph_ic.concept_ic(yago_concept_1)
        graph_ic_2 = self._yago._graph_ic.concept_ic(yago_concept_2)
        return bool(graph_ic_1 and graph_ic_2)

    def check_word_noun(self, w1, w2):
        """
        check if both words are in WordNet Noun Taxonomy
        :param w1: first word
        :param w2: second word
        :return: True if both words map to a noun synset
        """
        s1 = self._yago.word2synset(w1)
        s2 = self._yago.word2synset(w2)
        return bool(s1 and s2)

    def separate_dataset(self, in_file, out_file, check_function):
        """
        This function is used to separate the original word similarity dataset.

        word similarity of noun: noun_rg.txt, noun_mc.txt, noun_ws353.txt, noun_ws353-sim.txt, noun_simlex.txt

        the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt, graph_ws353.txt,
        graph_ws353-sim.txt, graph_simlex.txt

        both words are in knowledge graph:  type_rg.txt, type_mc.txt, type_ws353.txt, type_ws353-sim.txt,
        type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: the function of mapping criteria for deciding the word pairs.
        :return:
        """
        out_data = []
        word_pairs, human = self.load_dataset(in_file)
        for i, pairs in enumerate(word_pairs):
            w1, w2 = pairs
            h = human[i]
            # Keep only pairs accepted by the supplied criterion.
            if check_function(w1, w2):
                out_data.append(' '.join([w1, w2, str(h)]))
        FileIO.save_list_file('eval/word_similarity/%s.txt' % out_file, out_data)
Ejemplo n.º 13
0
# ----------------------------------------------------------------
'''
Description  ------------------------------------------------------------------------
Function will define YAGO concepts and calculate the similarity score between
sentence 1 and sentence 2 (very similar to the PartialSim function).

Inputs  ------------------------------------------------------------------------------
s1      sentence 1 (string)
s2      sentence 2 (string)
method  "wpath" or "wpath_graph" (string)

Outputs ----------------------------------------------------------------------------
Returns the similarity value in numeric format (between 0 and 1).
'''
# Load the YAGO type similarity model once at module level.
sim_yago = YagoTypeSimilarity()


#Function for calculating the sentence similarities using YAGO concepts
def task4Yago(s1, s2, method):
    #Format the input sentences to desired form
    s1 = s1.lower()
    s2 = s2.lower()
    #Separate sentence into words. Aka list of words.
    s1_words = word_tokenize(s1)
    s2_words = word_tokenize(s2)
    #POS tags for each word in sentence.
    pos1 = pos_tag(s1_words)
    pos2 = pos_tag(s2_words)
    #Remove stop words from the pos, tagged sentences
    pos1 = [word for word in pos1 if word[0] not in stopwords.words('english')]