def __init__(self, result_limit=5000, expansion=True, show_query=False):
    """Build the matcher and wire up its components.

    :param result_limit: maximum number of entities the query graph returns
    :param expansion: whether concept expansion is performed by default
    :param show_query: whether generated SPARQL queries are printed
    """
    # Behaviour flags.
    self._expansion, self._show_query = expansion, show_query
    # Collaborating components, constructed in the original order.
    self._linker = NameSPARQL()
    self._extracter = Extraction()
    self._yago = YagoTypeSimilarity()
    self._query_graph = QueryGraph(result_limit)
def __init__(self, result_limit=5000, expansion=False, show_query=False):
    """Semantic search of entities and concepts.

    :param result_limit: maximum number of retrieved entities
    :param expansion: whether concept expansion is conducted
    :param show_query: whether the SPARQL query is shown
    """
    # Flags first, then the collaborating components (original order kept).
    self._expansion, self._show_query = expansion, show_query
    self._linker = NameSPARQL()
    self._extracter = Extraction()
    self._yago = YagoTypeSimilarity()
    self._query_graph = QueryGraph(result_limit)
def __init__(self, result_limit=5000, expansion=True, show_query=False):
    """Initialise the matcher with its linking, extraction, similarity
    and query-graph components.

    :param result_limit: maximum number of entities to retrieve
    :param expansion: whether to expand concepts (on by default here)
    :param show_query: whether to print the generated SPARQL query
    """
    self._expansion, self._show_query = expansion, show_query
    # Components are built in the same order as before, since their
    # constructors may perform heavy initialisation.
    self._linker = NameSPARQL()
    self._extracter = Extraction()
    self._yago = YagoTypeSimilarity()
    self._query_graph = QueryGraph(result_limit)
# NOTE(review): this fragment is truncated at both ends — it begins inside an
# earlier `with` block and its final `with` statement has no visible body.
simfile.write('\n\n')
# Human ratings in column 2 are on a 0-4 scale; normalise to [0, 1].
sim_ref = np.array(contents)[:, 2].astype(float) / 4.0
corr = pearson_correlation(sim_cal, sim_ref)
with open('results.txt', 'a') as resfile:
    resfile.write(
        'pearson correlation in dataset [%s] for FastText embedding is %f\n' % ('STS-131', corr))
# part 8
# Reload the STSS-131 sentence-pair dataset (semicolon-separated).
with open('datasets/stss-131.csv', newline='') as csvfile:
    contents = list(csv.reader(csvfile, delimiter=';'))
from sematch.semantic.similarity import YagoTypeSimilarity
sim = YagoTypeSimilarity()
# Flatten the per-pair similarity scores into a 1-D array.
sim_cal = np.array(sentence_similarity_dataset_yago(contents, sim)).reshape(-1, )
with open('sentence_similarity.txt', 'a') as simfile:
    simfile.write('Using Yago concepts\n')
    simfile.write('s1; s2; human_sim; method_sim\n\n')
    for i, pair in enumerate(contents):
        # Scale the [0, 1] score back to the dataset's 0-4 range for output.
        simfile.write('%s;%s;%s;%f\n' % (pair[0], pair[1], pair[2], sim_cal[i] * 4))
    simfile.write('\n\n')
sim_ref = np.array(contents)[:, 2].astype(float) / 4.0
corr = pearson_correlation(sim_cal, sim_ref)
with open('results.txt', 'a') as resfile:
def test_yagotype_similarity():
    """Smoke-test YagoTypeSimilarity: word-to-YAGO mapping, YAGO-to-synset
    mapping and word/concept similarity scores.
    """
    from sematch.semantic.similarity import YagoTypeSimilarity
    yagosim = YagoTypeSimilarity()
    dancer = yagosim.word2yago('dancer')
    actor = yagosim.word2yago('actor')
    singer = yagosim.word2yago('singer')
    assert yagosim.yago2synset(actor[0]) is not None
    assert yagosim.yago_similarity(dancer[0], actor[0], 'wpath') is not None
    assert yagosim.yago_similarity(singer[0], actor[0], 'wpath') is not None
    assert yagosim.word2yago('university') is not None
    assert yagosim.yago2synset('http://dbpedia.org/class/yago/EducationalInstitution108276342') is not None
    # FIX: the Organization URI was asserted twice; one assertion suffices.
    assert yagosim.yago2synset('http://dbpedia.org/class/yago/Organization108008335') is not None
    assert yagosim.yago2synset('http://dbpedia.org/class/yago/Institution108053576') is not None
    # using corpus-based IC from the Brown corpus
    assert yagosim.word_similarity('dancer', 'actor', 'wpath') is not None
    # using graph-based IC from DBpedia
    assert yagosim.word_similarity('dancer', 'actor', 'wpath_graph') is not None
def __init__(self):
    # Wire up the default components: entity-name linker, noun/chunk
    # extractor, YAGO-based similarity backend and SPARQL query builder.
    # Construction order is preserved; these constructors may do heavy setup.
    self._linker = NameSPARQL()
    self._extracter = Extraction()
    self._yago = YagoTypeSimilarity()
    self._query_graph = QueryGraph()
class Matcher:
    """Concept-based entity matching in DBpedia.

    A short text query is split into concepts (common nouns) and entities
    (proper nouns); both are linked to knowledge-graph URIs and used to
    build SPARQL queries against DBpedia.
    """

    def __init__(self, result_limit=5000, expansion=False, show_query=False):
        """
        semantic search of entities and concepts
        :param result_limit: maximum number of retrieved entities
        :param expansion: if concept expansion is conducted
        :param show_query: if SPARQL query is shown
        """
        self._expansion = expansion
        self._show_query = show_query
        self._linker = NameSPARQL()
        self._extracter = Extraction()
        self._yago = YagoTypeSimilarity()
        self._query_graph = QueryGraph(result_limit)

    def type_links(self, word, lang='eng'):
        """Map a word to its YAGO/DBpedia concept links.

        :param word: the word to link
        :param lang: WordNet language code ('eng', 'spa', 'cmn', ...)
        :return: list of dicts with 'name', 'gloss', 'lemma' and 'lod' keys
        """
        synsets = self._yago.multilingual2synset(word, lang=lang)
        if self._expansion:
            # Expand every synset and deduplicate the union.
            synsets = list(set(itertools.chain.from_iterable(
                [self._yago.synset_expand(s) for s in synsets])))
        links = []
        for s in synsets:
            link_dic = {}
            link_dic['name'] = s.name()
            link_dic['gloss'] = s._definition
            link_dic['lemma'] = ' '.join(s._lemma_names)
            concept_link = []
            yago_link = self._yago.synset2yago(s)
            dbpedia_link = self._yago.synset2dbpedia(s)
            # FIX: plain `if` statements replace the former
            # `x.append(y) if y else None` expression-statement anti-idiom.
            if yago_link:
                concept_link.append(yago_link)
            if dbpedia_link:
                concept_link.append(dbpedia_link)
            link_dic['lod'] = concept_link
            # Keep only synsets that resolved to at least one LOD URI.
            if link_dic['lod']:
                links.append(link_dic)
        return links

    def query_process(self, query):
        """
        Process query into concept (common noun) and entity (proper noun).
        Link them to Knowledge Graph uri links respectively.
        :param query: short text query
        :return: tuple of concepts and entities in uris.
        """
        entities = self._extracter.extract_chunks_sent(query)
        entity_filter = list(itertools.chain.from_iterable([e.lower().split() for e in entities]))
        entity_filter = set(entity_filter)
        concepts = list(set(self._extracter.extract_nouns(query)))
        # Drop nouns that are already part of a detected entity chunk.
        concepts = [c for c in concepts if c not in entity_filter]
        concept_uris = [list(itertools.chain.from_iterable([s['lod'] for s in self.type_links(c)]))
                        for c in concepts]
        concept_uris = list(itertools.chain.from_iterable(concept_uris))
        entity_uris = list(itertools.chain.from_iterable(map(self._linker.name2entities, entities)))
        return list(set(concept_uris)), list(set(entity_uris))

    def match_concepts(self, concepts, lang='en'):
        """Query DBpedia for entities typed by the given concept URIs.

        Concepts are sent in batches of five to keep each SPARQL query small.
        :return: deduplicated result dicts (unique by 'uri').
        """
        results = []
        # FIX: `xrange` is Python-2-only; `range` behaves identically here
        # and keeps the module importable on Python 3.
        for i in range(0, len(concepts), 5):
            results.extend(self._query_graph.type_query(concepts[i:i + 5], lang, self._show_query))
        # Deduplicate by entity URI, keeping the first occurrence.
        result_dic = {}
        for res in results:
            if res['uri'] not in result_dic:
                result_dic[res['uri']] = res
        return list(result_dic.values())

    def match_type(self, query, lang='eng'):
        """Match entities whose types correspond to the query's words.

        :param lang: WordNet language code, mapped to a DBpedia result
            language ('eng'->'en', 'spa'->'es', 'cmn'->'zh').
        """
        lang_map = {'eng': 'en', 'spa': 'es', 'cmn': 'zh'}
        result_lang = lang_map[lang]
        words = query.split()
        concept_uris = []
        for w in words:
            concepts = list(itertools.chain.from_iterable([s['lod'] for s in self.type_links(w, lang)]))
            concept_uris.extend(concepts)
        concept_uris = list(set(concept_uris))
        return self.match_concepts(concept_uris, result_lang)

    def match_entity_type(self, query):
        """Match entities constrained by both the query's concepts and
        its entities, deduplicated by URI."""
        results = []
        concepts, entities = self.query_process(query)
        for e in entities:
            # Same batching and py3-safe `range` as in match_concepts.
            for i in range(0, len(concepts), 5):
                results.extend(self._query_graph.type_entity_query(concepts[i:i + 5], e, self._show_query))
        result_dic = {}
        for res in results:
            if res['uri'] not in result_dic:
                result_dic[res['uri']] = res
        return list(result_dic.values())
def test_yago_concept_similarity():
    """Exercise YagoTypeSimilarity on word-to-YAGO links, YAGO-to-synset
    resolution and both IC flavours of word similarity.
    """
    from sematch.semantic.similarity import YagoTypeSimilarity
    sim = YagoTypeSimilarity()
    dancer = sim.word2yago('dancer')
    actor = sim.word2yago('actor')
    singer = sim.word2yago('singer')
    assert sim.yago2synset(actor[0]) is not None
    # wpath similarity between each of dancer/singer and actor.
    for other in (dancer, singer):
        assert sim.yago_similarity(other[0], actor[0], 'wpath') is not None
    assert sim.word2yago('university') is not None
    # Resolve a set of YAGO class URIs back to synsets (same URIs,
    # same order, as the original assertions).
    yago_uris = (
        'http://dbpedia.org/class/yago/EducationalInstitution108276342',
        'http://dbpedia.org/class/yago/Organization108008335',
        'http://dbpedia.org/class/yago/Institution108053576',
        'http://dbpedia.org/class/yago/Organization108008335',
    )
    for uri in yago_uris:
        assert sim.yago2synset(uri) is not None
    # corpus-based IC (Brown corpus), then graph-based IC (DBpedia).
    for metric in ('wpath', 'wpath_graph'):
        assert sim.word_similarity('dancer', 'actor', metric) is not None
def __init__(self):
    # YAGO-based similarity backend used by this object's methods.
    self._yago = YagoTypeSimilarity()
class WordSimDataset:
    """
    This class is used to prepare and separate word similarity datasets.
    """

    def __init__(self):
        # YAGO-based similarity backend used by the check_* predicates.
        self._yago = YagoTypeSimilarity()

    def load_dataset(self, dataset_name):
        """
        Load a word similarity dataset.

        :param dataset_name: the file name of the word similarity dataset
        :return: (word_pairs, human) — a list of (word1, word2) tuples and
            a list of float human ratings
        """
        data = FileIO.read_list_file('dataset/wordsim/%s.txt' % dataset_name)
        #print "dataset ", dataset_name, " ", len(data), " word pairs"
        # FIX: wrap in list — a bare map object is a single-use iterator and
        # silently yields nothing on a second pass (human was already a list).
        word_pairs = list(map(lambda x: (x.split()[0], x.split()[1]), data))
        human = list(map(float, map(lambda x: x.split()[2], data)))
        return word_pairs, human

    def load_result(self, sim_name, dataset_name):
        """
        Load the result of a similarity metric for a specific dataset.

        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: (correlation, scores) — correlation with human ratings,
            then the per-pair similarity scores
        """
        data = FileIO.read_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name))
        data = list(map(float, data))
        # First value is the correlation; the rest are per-pair scores.
        return data[0], data[1:]

    def save_result(self, cor, sim_values, sim_name, dataset_name):
        """
        Save the result computed by a similarity metric.

        :param cor: correlation with human rating
        :param sim_values: similarity scores for word pairs
        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: None
        """
        data = ["%.3f" % cor]
        data += ["%.3f" % x for x in sim_values]
        FileIO.save_list_file('dataset/wordsim/results/%s-%s.txt' % (dataset_name, sim_name), data)

    def check_word_graph(self, w1, w2):
        """
        Check if the least common subsumer of the two words is used as a
        type in DBpedia (i.e. has a graph-based IC value).
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        lcs = self._yago.least_common_subsumer(s1, s2)
        yago_concept = self._yago.synset2yago(lcs)
        graph_ic = self._yago._graph_ic.concept_ic(yago_concept)
        return bool(graph_ic)

    def check_word_type(self, w1, w2):
        """
        Check if both words are used as types in DBpedia.
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        yago_concept_1 = self._yago.synset2yago(s1)
        yago_concept_2 = self._yago.synset2yago(s2)
        graph_ic_1 = self._yago._graph_ic.concept_ic(yago_concept_1)
        graph_ic_2 = self._yago._graph_ic.concept_ic(yago_concept_2)
        return bool(graph_ic_1 and graph_ic_2)

    def check_word_noun(self, w1, w2):
        """
        Check if both words are in the WordNet noun taxonomy.
        """
        s1 = self._yago.word2synset(w1)
        s2 = self._yago.word2synset(w2)
        return bool(s1 and s2)

    def separate_dataset(self, in_file, out_file, check_function):
        """
        Separate the original word similarity dataset.

        word similarity of noun: noun_rg.txt, noun_mc.txt, noun_ws353.txt,
        noun_ws353-sim.txt, noun_simlex.txt
        the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt,
        graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt
        both words are in knowledge graph: type_rg.txt, type_mc.txt,
        type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: mapping criterion deciding which word pairs
            to keep
        :return: None
        """
        out_data = []
        word_pairs, human = self.load_dataset(in_file)
        for i, pairs in enumerate(word_pairs):
            w1, w2 = pairs
            h = human[i]
            if check_function(w1, w2):
                out_data.append(' '.join([w1, w2, str(h)]))
        FileIO.save_list_file('dataset/wordsim/%s.txt' % out_file, out_data)
def __init__(self):
    # YAGO-based similarity backend shared by this object's methods.
    self._yago = YagoTypeSimilarity()
class WordSimDataset:
    """
    This class is used to prepare and separate word similarity datasets.
    """

    def __init__(self):
        # YAGO-based similarity backend used by the check_* predicates.
        self._yago = YagoTypeSimilarity()

    def load_dataset(self, dataset_name):
        """
        Load a word similarity dataset.

        :param dataset_name: the file name of the word similarity dataset
        :return: (word_pairs, human) — a list of (word1, word2) tuples and
            a list of float human ratings
        """
        data = FileIO.read_list_file('eval/word_similarity/%s.txt' % dataset_name)
        #print "dataset ", dataset_name, " ", len(data), " word pairs"
        # FIX: on Python 3 `map` returns an iterator; `human[i]` in
        # separate_dataset would raise TypeError. Materialise both as lists.
        word_pairs = list(map(lambda x: (x.split()[0], x.split()[1]), data))
        human = list(map(float, map(lambda x: x.split()[2], data)))
        return word_pairs, human

    def load_result(self, sim_name, dataset_name):
        """
        Load the result of a similarity metric for a specific dataset.

        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: (correlation, scores) — correlation with human ratings,
            then the per-pair similarity scores
        """
        data = FileIO.read_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name))
        # FIX: a map object is not subscriptable on Python 3 — `data[0]`
        # below would raise TypeError. Materialise as a list first.
        data = list(map(float, data))
        return data[0], data[1:]

    def save_result(self, cor, sim_values, sim_name, dataset_name):
        """
        Save the result computed by a similarity metric.

        :param cor: correlation with human rating
        :param sim_values: similarity scores for word pairs
        :param sim_name: the name of the similarity metric
        :param dataset_name: the name of the word similarity dataset
        :return: None
        """
        data = ["%.3f" % cor]
        data += ["%.3f" % x for x in sim_values]
        FileIO.save_list_file('eval/word_similarity/results/%s-%s.txt' % (dataset_name, sim_name), data)

    def check_word_graph(self, w1, w2):
        """
        Check if the least common subsumer of the two words is used as a
        type in DBpedia (i.e. has a graph-based IC value).
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        lcs = self._yago.least_common_subsumer(s1, s2)
        yago_concept = self._yago.synset2yago(lcs)
        graph_ic = self._yago._graph_ic.concept_ic(yago_concept)
        return bool(graph_ic)

    def check_word_type(self, w1, w2):
        """
        Check if both words are used as types in DBpedia.
        """
        s1, s2 = self._yago.best_synset_pair(w1, w2)
        yago_concept_1 = self._yago.synset2yago(s1)
        yago_concept_2 = self._yago.synset2yago(s2)
        graph_ic_1 = self._yago._graph_ic.concept_ic(yago_concept_1)
        graph_ic_2 = self._yago._graph_ic.concept_ic(yago_concept_2)
        return bool(graph_ic_1 and graph_ic_2)

    def check_word_noun(self, w1, w2):
        """
        Check if both words are in the WordNet noun taxonomy.
        """
        s1 = self._yago.word2synset(w1)
        s2 = self._yago.word2synset(w2)
        return bool(s1 and s2)

    def separate_dataset(self, in_file, out_file, check_function):
        """
        Separate the original word similarity dataset.

        word similarity of noun: noun_rg.txt, noun_mc.txt, noun_ws353.txt,
        noun_ws353-sim.txt, noun_simlex.txt
        the lcs is in knowledge graph: graph_rg.txt, graph_mc.txt,
        graph_ws353.txt, graph_ws353-sim.txt, graph_simlex.txt
        both words are in knowledge graph: type_rg.txt, type_mc.txt,
        type_ws353.txt, type_ws353-sim.txt, type_simlex.txt

        :param in_file: source dataset file
        :param out_file: target dataset file
        :param check_function: mapping criterion deciding which word pairs
            to keep
        :return: None
        """
        out_data = []
        word_pairs, human = self.load_dataset(in_file)
        for i, pairs in enumerate(word_pairs):
            w1, w2 = pairs
            h = human[i]
            if check_function(w1, w2):
                out_data.append(' '.join([w1, w2, str(h)]))
        FileIO.save_list_file('eval/word_similarity/%s.txt' % out_file, out_data)
# ---------------------------------------------------------------- ''' Description ------------------------------------------------------------------------ Function will define YAGO concepts and calculate the similarity score between sentence 1 and sentence 2 (very similar to PartialSim-function). Inputs ------------------------------------------------------------------------------ s1 sentence 1 (string) s2 sentence 2 (string) method "wpath" or "wpath_graph" (string) Outputs ---------------------------------------------------------------------------- Returns the similarity value in numeric format (between 0 and 1). ''' #Load YAGO sim_yago = YagoTypeSimilarity() #Function for calculating the sentence similarities using YAGO concepts def task4Yago(s1, s2, method): #Format the input sentences to desired form s1 = s1.lower() s2 = s2.lower() #Separate sentence into words. Aka list of words. s1_words = word_tokenize(s1) s2_words = word_tokenize(s2) #POS tags for each word in sentence. pos1 = pos_tag(s1_words) pos2 = pos_tag(s2_words) #Remove stop words from the pos, tagged sentences pos1 = [word for word in pos1 if word[0] not in stopwords.words('english')]