class WordSimDataset:
    """This class is used to prepare and separate word similarity datasets."""

    def __init__(self):
        # Similarity backend used by the check_* predicates below.
        self._yago = YagoTypeSimilarity()

    def load_dataset(self, dataset_name):
        """Load a word similarity dataset.

        Each line of the dataset file is expected to be
        "word1 word2 human_rating".

        :param dataset_name: the file name of the word similarity dataset
        :return: tuple (word_pairs, human) -- a list of (w1, w2) tuples and a
            parallel list of float human ratings
        """
        data = FileIO.read_list_file('dataset/wordsim/%s.txt' % dataset_name)
        # Materialize as lists: a bare map() in Python 3 is a one-shot
        # iterator, which would break any caller that iterates word_pairs
        # more than once or indexes into it.
        word_pairs = [(line.split()[0], line.split()[1]) for line in data]
        human = [float(line.split()[2]) for line in data]
        return word_pairs, human

    def load_result(self, sim_name, dataset_name):
        """Load the result of a similarity metric for a specific dataset.

        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: tuple (correlation, scores) -- the correlation with human
            ratings and the list of scores produced by the metric
        """
        data = FileIO.read_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name))
        data = [float(v) for v in data]
        # First value on file is the correlation; the rest are per-pair scores.
        return data[0], data[1:]

    def save_result(self, cor, sim_values, sim_name, dataset_name):
        """Save the result computed by a similarity metric.

        :param cor: correlation with human rating
        :param sim_values: similarity scores for word pairs
        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: None
        """
        data = ["%.3f" % cor] + ["%.3f" % v for v in sim_values]
        FileIO.save_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name), data)

    def check_word_graph(self, w1, w2):
        """Check if the LCS of the two words is used as a type in DBpedia.

        :param w1: first word
        :param w2: second word
        :return: True if the least common subsumer has a graph IC value
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        lcs = self._yago.least_common_subsumer(s1, s2)
        yago_concept = self._yago.synset2yago(lcs)
        graph_ic = self._yago._graph_ic.concept_ic(yago_concept)
        return bool(graph_ic)

    def check_word_type(self, w1, w2):
        """Check if both words are used as types in DBpedia.

        :param w1: first word
        :param w2: second word
        :return: True if both words' concepts have graph IC values
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        yago_concept_1 = self._yago.synset2yago(s1)
        yago_concept_2 = self._yago.synset2yago(s2)
        graph_ic_1 = self._yago._graph_ic.concept_ic(yago_concept_1)
        graph_ic_2 = self._yago._graph_ic.concept_ic(yago_concept_2)
        return bool(graph_ic_1 and graph_ic_2)

    def check_word_noun(self, w1, w2):
        """Check if both words are in the WordNet noun taxonomy.

        :param w1: first word
        :param w2: second word
        :return: True if both words map to noun synsets
        """
        s1 = self._yago.word2synset(w1)
        s2 = self._yago.word2synset(w2)
        return bool(s1 and s2)

    def separate_dataset(self, in_file, out_file, check_function):
        """Separate the original word similarity dataset.

        word similarity of noun: noun_rg.txt, noun_mc.txt, noun_ws353.txt,
        noun_ws353-sim.txt, noun_simlex.txt
        the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt,
        graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt
        both words are in knowledge graph: type_rg.txt, type_mc.txt,
        type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: the function of mapping criteria for deciding
            the word pairs
        :return: None
        """
        word_pairs, human = self.load_dataset(in_file)
        # zip keeps the word pairs aligned with their human ratings.
        out_data = [' '.join([w1, w2, str(h)])
                    for (w1, w2), h in zip(word_pairs, human)
                    if check_function(w1, w2)]
        FileIO.save_list_file('dataset/wordsim/%s.txt' % out_file, out_data)
class Matcher:
    """This class is used for concept based entity match in DBpedia."""

    def __init__(self, result_limit=5000, expansion=False, show_query=False):
        """Semantic search of entities and concepts.

        :param result_limit: maximum number of retrieved entities
        :param expansion: if concept expansion is conducted
        :param show_query: if the SPARQL query is shown
        """
        self._expansion = expansion
        self._show_query = show_query
        self._linker = NameSPARQL()
        self._extracter = Extraction()
        self._yago = YagoTypeSimilarity()
        self._query_graph = QueryGraph(result_limit)

    def type_links(self, word, lang='eng'):
        """Link a word to YAGO/DBpedia concept URIs via its WordNet synsets.

        :param word: the word to link
        :param lang: WordNet language code (e.g. 'eng', 'spa', 'cmn')
        :return: list of dicts with 'name', 'gloss', 'lemma' and 'lod' keys;
            synsets without any resolved LOD link are skipped
        """
        synsets = self._yago.multilingual2synset(word, lang=lang)
        if self._expansion:
            synsets = list(set(itertools.chain.from_iterable(
                [self._yago.synset_expand(s) for s in synsets])))
        links = []
        for s in synsets:
            # Keep only the knowledge-graph links that actually resolved.
            concept_link = []
            yago_link = self._yago.synset2yago(s)
            if yago_link:
                concept_link.append(yago_link)
            dbpedia_link = self._yago.synset2dbpedia(s)
            if dbpedia_link:
                concept_link.append(dbpedia_link)
            if not concept_link:
                continue
            links.append({
                'name': s.name(),
                'gloss': s._definition,
                'lemma': ' '.join(s._lemma_names),
                'lod': concept_link,
            })
        return links

    def query_process(self, query):
        """Process query into concept (common noun) and entity (proper noun).

        Link them to Knowledge Graph uri links respectively.

        :param query: short text query
        :return: tuple (concept_uris, entity_uris), both deduplicated lists
        """
        entities = self._extracter.extract_chunks_sent(query)
        entity_filter = set(itertools.chain.from_iterable(
            [e.lower().split() for e in entities]))
        concepts = list(set(self._extracter.extract_nouns(query)))
        # Drop nouns already covered by an extracted entity chunk.
        concepts = [c for c in concepts if c not in entity_filter]
        concept_uris = list(itertools.chain.from_iterable(
            [list(itertools.chain.from_iterable([s['lod'] for s in self.type_links(c)]))
             for c in concepts]))
        entity_uris = list(itertools.chain.from_iterable(
            map(self._linker.name2entities, entities)))
        return list(set(concept_uris)), list(set(entity_uris))

    def match_concepts(self, concepts, lang='en'):
        """Query the knowledge graph for entities typed by the given concepts.

        :param concepts: list of concept URIs
        :param lang: result language code
        :return: list of result dicts, deduplicated by 'uri' (first wins)
        """
        results = []
        # Batch concepts (5 per query) to keep each SPARQL query small.
        # NOTE: was xrange (Python 2 only); range is equivalent here and
        # works on both Python 2 and 3.
        for i in range(0, len(concepts), 5):
            results.extend(self._query_graph.type_query(concepts[i:i + 5], lang, self._show_query))
        # Deduplicate by URI, keeping the first occurrence of each.
        result_dic = {}
        for res in results:
            result_dic.setdefault(res['uri'], res)
        return list(result_dic.values())

    def match_type(self, query, lang='eng'):
        """Match entities by the concept types of each word in the query.

        :param query: whitespace-separated words
        :param lang: WordNet language code ('eng', 'spa' or 'cmn')
        :return: list of matched result dicts
        """
        lang_map = {'eng': 'en', 'spa': 'es', 'cmn': 'zh'}
        result_lang = lang_map[lang]
        concept_uris = []
        for w in query.split():
            concept_uris.extend(itertools.chain.from_iterable(
                [s['lod'] for s in self.type_links(w, lang)]))
        return self.match_concepts(list(set(concept_uris)), result_lang)

    def match_entity_type(self, query):
        """Match results constrained by both concepts and entities in the query.

        :param query: short text query
        :return: list of result dicts, deduplicated by 'uri' (first wins)
        """
        results = []
        concepts, entities = self.query_process(query)
        for e in entities:
            # Same 5-per-query batching as match_concepts; range replaces
            # the Python 2-only xrange.
            for i in range(0, len(concepts), 5):
                results.extend(self._query_graph.type_entity_query(concepts[i:i + 5], e, self._show_query))
        result_dic = {}
        for res in results:
            result_dic.setdefault(res['uri'], res)
        return list(result_dic.values())
class WordSimDataset:
    """This class is used to prepare and separate word similarity datasets."""

    def __init__(self):
        # Similarity backend used by the check_* predicates below.
        self._yago = YagoTypeSimilarity()

    def load_dataset(self, dataset_name):
        """Load a word similarity dataset.

        Each line of the dataset file is expected to be
        "word1 word2 human_rating".

        :param dataset_name: the file name of the word similarity dataset
        :return: tuple (word_pairs, human) -- a list of (w1, w2) tuples and a
            parallel list of float human ratings
        """
        data = FileIO.read_list_file('eval/word_similarity/%s.txt' % dataset_name)
        # Materialize as lists: under Python 3 a bare map() is a one-shot,
        # non-subscriptable iterator, so the original `human[i]` access in
        # separate_dataset raised TypeError.
        word_pairs = [(line.split()[0], line.split()[1]) for line in data]
        human = [float(line.split()[2]) for line in data]
        return word_pairs, human

    def load_result(self, sim_name, dataset_name):
        """Load the result of a similarity metric for a specific dataset.

        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: tuple (correlation, scores) -- the correlation with human
            ratings and the list of scores produced by the metric
        """
        data = FileIO.read_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name))
        # A list is required: indexing/slicing a Python 3 map object raises
        # TypeError, so the original `data[0], data[1:]` failed.
        data = [float(v) for v in data]
        return data[0], data[1:]

    def save_result(self, cor, sim_values, sim_name, dataset_name):
        """Save the result computed by a similarity metric.

        :param cor: correlation with human rating
        :param sim_values: similarity scores for word pairs
        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: None
        """
        data = ["%.3f" % cor] + ["%.3f" % v for v in sim_values]
        FileIO.save_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name), data)

    def check_word_graph(self, w1, w2):
        """Check if the LCS of the two words is used as a type in DBpedia.

        :param w1: first word
        :param w2: second word
        :return: True if the least common subsumer has a graph IC value
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        lcs = self._yago.least_common_subsumer(s1, s2)
        yago_concept = self._yago.synset2yago(lcs)
        graph_ic = self._yago._graph_ic.concept_ic(yago_concept)
        return bool(graph_ic)

    def check_word_type(self, w1, w2):
        """Check if both words are used as types in DBpedia.

        :param w1: first word
        :param w2: second word
        :return: True if both words' concepts have graph IC values
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        yago_concept_1 = self._yago.synset2yago(s1)
        yago_concept_2 = self._yago.synset2yago(s2)
        graph_ic_1 = self._yago._graph_ic.concept_ic(yago_concept_1)
        graph_ic_2 = self._yago._graph_ic.concept_ic(yago_concept_2)
        return bool(graph_ic_1 and graph_ic_2)

    def check_word_noun(self, w1, w2):
        """Check if both words are in the WordNet noun taxonomy.

        :param w1: first word
        :param w2: second word
        :return: True if both words map to noun synsets
        """
        s1 = self._yago.word2synset(w1)
        s2 = self._yago.word2synset(w2)
        return bool(s1 and s2)

    def separate_dataset(self, in_file, out_file, check_function):
        """Separate the original word similarity dataset.

        word similarity of noun: noun_rg.txt, noun_mc.txt, noun_ws353.txt,
        noun_ws353-sim.txt, noun_simlex.txt
        the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt,
        graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt
        both words are in knowledge graph: type_rg.txt, type_mc.txt,
        type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: the function of mapping criteria for deciding
            the word pairs
        :return: None
        """
        word_pairs, human = self.load_dataset(in_file)
        # zip keeps the word pairs aligned with their human ratings.
        out_data = [' '.join([w1, w2, str(h)])
                    for (w1, w2), h in zip(word_pairs, human)
                    if check_function(w1, w2)]
        FileIO.save_list_file('eval/word_similarity/%s.txt' % out_file, out_data)