Esempio n. 1
0
class WordSimDataset:
    """
    Prepare, filter and persist word similarity datasets.

    Dataset files are read from and written to ``dataset/wordsim/`` through
    ``FileIO``. A single ``YagoTypeSimilarity`` instance backs every
    ``check_word_*`` predicate.
    """

    def __init__(self):
        self._yago = YagoTypeSimilarity()

    def load_dataset(self, dataset_name):
        """
        Load a word similarity dataset.

        Every entry of ``dataset/wordsim/<dataset_name>.txt`` is split on
        whitespace: fields 0 and 1 are the two words, field 2 is the human
        rating.

        :param dataset_name: the file name of word similarity dataset (without ``.txt``)
        :return: a list of ``(word1, word2)`` tuples and a list of float
                 human ratings, aligned by index
        :raises IndexError: if an entry has fewer than three fields
        :raises ValueError: if a rating cannot be parsed as a float
        """
        data = FileIO.read_list_file('dataset/wordsim/%s.txt' % dataset_name)
        # Split every entry once and build real lists: on Python 3 a bare
        # map() is a one-shot iterator that cannot be indexed, measured with
        # len() or traversed twice.
        rows = [entry.split() for entry in data]
        word_pairs = [(fields[0], fields[1]) for fields in rows]
        human = [float(fields[2]) for fields in rows]
        return word_pairs, human

    def load_result(self, sim_name, dataset_name):
        """
        Load the stored result of a similarity metric for a specific dataset.

        The file ``dataset/wordsim/results/<dataset_name>-<sim_name>.txt``
        holds the correlation as its first entry, followed by one similarity
        score per entry.

        :param sim_name: the name of similarity metric
        :param dataset_name: the name of word similarity dataset
        :return: correlation score and the list of rating scores generated
                 by the similarity metric
        :raises IndexError: if the result file has no entries
        """
        data = FileIO.read_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name))
        scores = [float(entry) for entry in data]
        return scores[0], scores[1:]

    def save_result(self, cor, sim_values, sim_name, dataset_name):
        """
        Save the result computed by a similarity metric.

        The correlation is written first, then every similarity score, all
        formatted with three decimals.

        :param cor: correlation with human rating
        :param sim_values: similarity scores for word pairs
        :param sim_name: the name of similarity metric
        :param dataset_name: the name of word similarity dataset
        :return: None
        """
        data = ["%.3f" % cor]
        data.extend("%.3f" % value for value in sim_values)
        FileIO.save_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name), data)

    def check_word_graph(self, w1, w2):
        """
        Check if the lcs of both words is used as type in DBpedia.

        The best synset pair of the two words is resolved, its least common
        subsumer is mapped to a YAGO concept, and the result is the
        truthiness of ``concept_ic`` for that concept.

        :param w1: first word
        :param w2: second word
        :return: True if the concept's ``concept_ic`` value is truthy, else False
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        lcs = self._yago.least_common_subsumer(s1, s2)
        yago_concept = self._yago.synset2yago(lcs)
        return bool(self._yago._graph_ic.concept_ic(yago_concept))

    def check_word_type(self, w1, w2):
        """
        Check if both words are used as type in DBpedia.

        Both synsets of the best synset pair are mapped to YAGO concepts and
        ``concept_ic`` is looked up for each of them.

        :param w1: first word
        :param w2: second word
        :return: True if both ``concept_ic`` values are truthy, else False
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        yago_concept_1 = self._yago.synset2yago(s1)
        yago_concept_2 = self._yago.synset2yago(s2)
        # Both lookups are always performed, in this order.
        graph_ic_1 = self._yago._graph_ic.concept_ic(yago_concept_1)
        graph_ic_2 = self._yago._graph_ic.concept_ic(yago_concept_2)
        return bool(graph_ic_1 and graph_ic_2)

    def check_word_noun(self, w1, w2):
        """
        Check if both words are in WordNet Noun Taxonomy.

        :param w1: first word
        :param w2: second word
        :return: True if ``word2synset`` yields a truthy value for both words
        """
        s1 = self._yago.word2synset(w1)
        s2 = self._yago.word2synset(w2)
        return bool(s1 and s2)

    def separate_dataset(self, in_file, out_file, check_function):
        """
        Separate the original word similarity dataset.

        word similarity of noun: noun_rg.txt, noun_mc.txt, noun_ws353.txt, noun_ws353-sim.txt, noun_simlex.txt

        the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt, graph_ws353.txt,
        graph_ws353-sim.txt, graph_simlex.txt

        both words are in knowledge graph:  type_rg.txt, type_mc.txt, type_ws353.txt, type_ws353-sim.txt,
        type_simlex.txt

        Pairs accepted by ``check_function`` are written, in their original
        order, as ``"<w1> <w2> <rating>"`` entries.

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: callable ``(w1, w2) -> bool`` deciding whether
                               a word pair is kept
        :return: None
        """
        word_pairs, human = self.load_dataset(in_file)
        out_data = [
            ' '.join([w1, w2, str(rating)])
            for (w1, w2), rating in zip(word_pairs, human)
            if check_function(w1, w2)
        ]
        FileIO.save_list_file('dataset/wordsim/%s.txt' % out_file, out_data)
class Matcher:

    """This class is used for concept based entity match in DBpedia"""

    # Number of concept URIs passed to the query graph per call.
    _BATCH_SIZE = 5

    def __init__(self, result_limit=5000, expansion=False, show_query=False):
        """ semantic search of entities and concepts

        :param result_limit: maximum number of retrieved entities
        :param expansion: if conduct concept expansion
        :param show_query: if SPARQL query is shown
        """
        self._expansion = expansion
        self._show_query = show_query
        self._linker = NameSPARQL()
        self._extracter = Extraction()
        self._yago = YagoTypeSimilarity()
        self._query_graph = QueryGraph(result_limit)

    def _batches(self, items):
        """Split ``items`` into consecutive slices of at most ``_BATCH_SIZE`` elements."""
        size = self._BATCH_SIZE
        # range() instead of xrange(): xrange does not exist on Python 3,
        # while range() behaves the same here on both major versions.
        return [items[start:start + size] for start in range(0, len(items), size)]

    @staticmethod
    def _unique_by_uri(results):
        """Return ``results`` with only the first entry kept for every ``'uri'`` value."""
        unique = {}
        for res in results:
            unique.setdefault(res['uri'], res)
        return list(unique.values())

    def type_links(self, word, lang='eng'):
        """
        Link a word to its synsets and their knowledge graph concept URIs.

        :param word: the word to look up
        :param lang: language code handed to ``multilingual2synset``
        :return: a list of dicts with keys ``name``, ``gloss``, ``lemma`` and
                 ``lod`` (the non-empty list of YAGO / DBpedia links); synsets
                 without any link are left out
        """
        synsets = self._yago.multilingual2synset(word, lang=lang)
        if self._expansion:
            expanded = itertools.chain.from_iterable(self._yago.synset_expand(s) for s in synsets)
            synsets = list(set(expanded))
        links = []
        for s in synsets:
            # YAGO link first, DBpedia link second; falsy links are dropped.
            candidates = (self._yago.synset2yago(s), self._yago.synset2dbpedia(s))
            concept_link = [uri for uri in candidates if uri]
            if not concept_link:
                continue
            links.append({
                'name': s.name(),
                'gloss': s._definition,
                'lemma': ' '.join(s._lemma_names),
                'lod': concept_link,
            })
        return links

    def query_process(self, query):
        """
        Process query into concept (common noun) and entity (proper noun). Link them
        to Knowledge Graph uri links respectively.

        Nouns that also occur (lower-cased) as a token of an extracted entity
        chunk are not treated as concepts.

        :param query: short text query
        :return: tuple of concepts and entities in uris, each a
                 duplicate-free list in no particular order.
        """
        entities = self._extracter.extract_chunks_sent(query)
        entity_filter = set(itertools.chain.from_iterable(e.lower().split() for e in entities))
        concepts = [c for c in set(self._extracter.extract_nouns(query)) if c not in entity_filter]
        concept_uris = set()
        for c in concepts:
            for link in self.type_links(c):
                concept_uris.update(link['lod'])
        entity_uris = set(itertools.chain.from_iterable(map(self._linker.name2entities, entities)))
        return list(concept_uris), list(entity_uris)

    def match_concepts(self, concepts, lang='en'):
        """
        Retrieve entities typed by the given concepts.

        The concepts are sent to ``type_query`` in batches and the combined
        results are de-duplicated on their ``'uri'`` key (first hit wins).

        :param concepts: sequence of concept URIs
        :param lang: result language code passed to ``type_query``
        :return: list of result dicts with unique ``'uri'`` values
        """
        results = []
        for batch in self._batches(concepts):
            results.extend(self._query_graph.type_query(batch, lang, self._show_query))
        return self._unique_by_uri(results)

    def match_type(self, query, lang='eng'):
        """
        Match entities by the types named in a whitespace-separated query.

        Every word of the query is linked through ``type_links``; the unique
        concept URIs are then handed to ``match_concepts``.

        :param query: whitespace-separated words
        :param lang: one of ``'eng'``, ``'spa'``, ``'cmn'``
        :return: list of result dicts with unique ``'uri'`` values
        :raises KeyError: if ``lang`` is not one of the supported codes
        """
        lang_map = {'eng':'en','spa':'es', 'cmn':'zh'}
        result_lang = lang_map[lang]
        concept_uris = set()
        for w in query.split():
            for link in self.type_links(w, lang):
                concept_uris.update(link['lod'])
        return self.match_concepts(list(concept_uris), result_lang)

    def match_entity_type(self, query):
        """
        Match entities by the concepts and entities found in a text query.

        The query is split by ``query_process``; for every entity, the
        concepts are sent to ``type_entity_query`` in batches. Combined
        results are de-duplicated on their ``'uri'`` key (first hit wins).

        :param query: short text query
        :return: list of result dicts with unique ``'uri'`` values
        """
        results = []
        concepts, entities = self.query_process(query)
        concept_batches = self._batches(concepts)
        for e in entities:
            for batch in concept_batches:
                results.extend(self._query_graph.type_entity_query(batch, e, self._show_query))
        return self._unique_by_uri(results)
Esempio n. 3
0
class WordSimDataset:
    """
    Prepare, filter and persist word similarity datasets.

    Dataset files are read from and written to ``eval/word_similarity/``
    through ``FileIO``. A single ``YagoTypeSimilarity`` instance backs every
    ``check_word_*`` predicate.
    """

    def __init__(self):
        self._yago = YagoTypeSimilarity()

    def load_dataset(self, dataset_name):
        """
        Load a word similarity dataset.

        Every entry of ``eval/word_similarity/<dataset_name>.txt`` is split
        on whitespace: fields 0 and 1 are the two words, field 2 is the human
        rating.

        :param dataset_name: the file name of word similarity dataset (without ``.txt``)
        :return: a list of ``(word1, word2)`` tuples and a list of float
                 human ratings, aligned by index
        :raises IndexError: if an entry has fewer than three fields
        :raises ValueError: if a rating cannot be parsed as a float
        """
        data = FileIO.read_list_file('eval/word_similarity/%s.txt' % dataset_name)
        # Build real lists: on Python 3 map() yields a one-shot iterator that
        # supports neither indexing nor len(), which separate_dataset and
        # other callers rely on.
        rows = [entry.split() for entry in data]
        word_pairs = [(fields[0], fields[1]) for fields in rows]
        human = [float(fields[2]) for fields in rows]
        return word_pairs, human

    def load_result(self, sim_name, dataset_name):
        """
        Load the stored result of a similarity metric for a specific dataset.

        The file ``eval/word_similarity/results/<dataset_name>-<sim_name>.txt``
        holds the correlation as its first entry, followed by one similarity
        score per entry.

        :param sim_name: the name of similarity metric
        :param dataset_name: the name of word similarity dataset
        :return: correlation score and the list of rating scores generated
                 by the similarity metric
        :raises IndexError: if the result file has no entries
        """
        data = FileIO.read_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name))
        # A list is required here: a Python 3 map object cannot be subscripted.
        scores = [float(entry) for entry in data]
        return scores[0], scores[1:]

    def save_result(self, cor, sim_values, sim_name, dataset_name):
        """
        Save the result computed by a similarity metric.

        The correlation is written first, then every similarity score, all
        formatted with three decimals.

        :param cor: correlation with human rating
        :param sim_values: similarity scores for word pairs
        :param sim_name: the name of similarity metric
        :param dataset_name: the name of word similarity dataset
        :return: None
        """
        data = ["%.3f" % cor]
        data.extend("%.3f" % value for value in sim_values)
        FileIO.save_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name), data)

    def check_word_graph(self, w1, w2):
        """
        Check if the lcs of both words is used as type in DBpedia.

        The best synset pair of the two words is resolved, its least common
        subsumer is mapped to a YAGO concept, and the result is the
        truthiness of ``concept_ic`` for that concept.

        :param w1: first word
        :param w2: second word
        :return: True if the concept's ``concept_ic`` value is truthy, else False
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        lcs = self._yago.least_common_subsumer(s1, s2)
        yago_concept = self._yago.synset2yago(lcs)
        return bool(self._yago._graph_ic.concept_ic(yago_concept))

    def check_word_type(self, w1, w2):
        """
        Check if both words are used as type in DBpedia.

        Both synsets of the best synset pair are mapped to YAGO concepts and
        ``concept_ic`` is looked up for each of them.

        :param w1: first word
        :param w2: second word
        :return: True if both ``concept_ic`` values are truthy, else False
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        yago_concept_1 = self._yago.synset2yago(s1)
        yago_concept_2 = self._yago.synset2yago(s2)
        # Both lookups are always performed, in this order.
        graph_ic_1 = self._yago._graph_ic.concept_ic(yago_concept_1)
        graph_ic_2 = self._yago._graph_ic.concept_ic(yago_concept_2)
        return bool(graph_ic_1 and graph_ic_2)

    def check_word_noun(self, w1, w2):
        """
        Check if both words are in WordNet Noun Taxonomy.

        :param w1: first word
        :param w2: second word
        :return: True if ``word2synset`` yields a truthy value for both words
        """
        s1 = self._yago.word2synset(w1)
        s2 = self._yago.word2synset(w2)
        return bool(s1 and s2)

    def separate_dataset(self, in_file, out_file, check_function):
        """
        Separate the original word similarity dataset.

        word similarity of noun: noun_rg.txt, noun_mc.txt, noun_ws353.txt, noun_ws353-sim.txt, noun_simlex.txt

        the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt, graph_ws353.txt,
        graph_ws353-sim.txt, graph_simlex.txt

        both words are in knowledge graph:  type_rg.txt, type_mc.txt, type_ws353.txt, type_ws353-sim.txt,
        type_simlex.txt

        Pairs accepted by ``check_function`` are written, in their original
        order, as ``"<w1> <w2> <rating>"`` entries.

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: callable ``(w1, w2) -> bool`` deciding whether
                               a word pair is kept
        :return: None
        """
        word_pairs, human = self.load_dataset(in_file)
        out_data = [
            ' '.join([w1, w2, str(rating)])
            for (w1, w2), rating in zip(word_pairs, human)
            if check_function(w1, w2)
        ]
        FileIO.save_list_file('eval/word_similarity/%s.txt' % out_file, out_data)