class EntitySearch:

    """Concept-based entity search in DBpedia.

    Splits a short text query into concepts (common nouns) and entities
    (proper nouns), links both to Knowledge Graph URIs, and retrieves
    matching entities through SPARQL type queries.
    """

    def __init__(self):
        self._linker = NameSPARQL()
        self._extracter = Extraction()
        self._yago = YagoTypeSimilarity()
        self._query_graph = QueryGraph()

    def query_process(self, query):
        """
        Process query into concept (common noun) and entity (proper noun).
        Link them to Knowledge Graph uri links respectively.

        :param query: short text query
        :return: tuple of concepts and entities in uris, each deduplicated.
        """
        concepts = self._extracter.extract_words_sent(query)
        entities = self._extracter.extract_chunks_sent(query)
        concept_uris = list(itertools.chain.from_iterable(
            map(self._yago.word2yago, concepts)))
        entity_uris = list(itertools.chain.from_iterable(
            map(self._linker.name2entities, entities)))
        return list(set(concept_uris)), list(set(entity_uris))

    def search(self, query):
        """
        Retrieve entities matching the query's concepts and entities.

        Concepts are sent five at a time per SPARQL query to keep each
        query small.

        :param query: short text query
        :return: deduplicated list of query results
        """
        results = []
        concepts, entities = self.query_process(query)
        for e in entities:
            # range() instead of Python-2-only xrange() for py2/py3 compat.
            for i in range(0, len(concepts), 5):
                results.extend(
                    self._query_graph.type_entity_query(concepts[i:i + 5], e))
        return list(set(results))
def __init__(self, result_limit=5000, expansion=True, show_query=False):
    """Set up linking, extraction, similarity and query components.

    :param result_limit: maximum number of entities to retrieve
    :param expansion: whether concept expansion is performed
    :param show_query: whether the SPARQL query is shown
    """
    self._linker = NameSPARQL()
    self._extracter = Extraction()
    self._yago = YagoTypeSimilarity()
    self._query_graph = QueryGraph(result_limit)
    self._expansion = expansion
    self._show_query = show_query
def test_extraction():
    from sematch.nlp import Extraction
    from sematch.sparql import EntityFeatures
    features = EntityFeatures().features('http://dbpedia.org/resource/Yin_and_yang')
    assert features is not None
    extractor = Extraction()
    abstract = features['abstract']
    assert 'Chinese' in extractor.extract_chunks_doc(abstract)
    assert 'philosophy' in extractor.extract_words_doc(abstract)
def test_extraction():
    from sematch.nlp import Extraction
    from sematch.sparql import EntityFeatures
    entity = EntityFeatures().features('http://dbpedia.org/resource/Yin_and_yang')
    assert entity is not None
    abstract = entity['abstract']
    nlp = Extraction()
    assert 'Chinese' in nlp.extract_chunks_doc(abstract)
    assert 'philosophy' in nlp.extract_words_doc(abstract)
def __init__(self, result_limit=5000, expansion=False, show_query=False):
    """Semantic search of entities and concepts.

    :param result_limit: maximum number of retrieved entities
    :param expansion: if concept expansion is conducted
    :param show_query: if the SPARQL query is shown
    """
    self._linker = NameSPARQL()
    self._extracter = Extraction()
    self._yago = YagoTypeSimilarity()
    self._query_graph = QueryGraph(result_limit)
    self._expansion = expansion
    self._show_query = show_query
def test_extraction():
    from sematch.nlp import Extraction
    from sematch.semantic.sparql import EntityFeatures
    upm = EntityFeatures().features('http://dbpedia.org/resource/Technical_University_of_Madrid')
    extractor = Extraction()
    abstract = upm['abstract']
    assert extractor.extract_nouns(abstract) is not None
    assert extractor.extract_verbs(abstract) is not None
    assert extractor.extract_chunks_doc(abstract) is not None
    categories = extractor.category_features(upm['category'])
    assert extractor.category2words(categories) is not None
def test_sim_graph():
    from sematch.semantic.graph import SimGraph
    from sematch.semantic.similarity import WordNetSimilarity
    from sematch.nlp import Extraction, lemmatization
    from sematch.sparql import EntityFeatures
    from collections import Counter
    entity = EntityFeatures().features('http://dbpedia.org/resource/Tom_Cruise')
    tokens = list(set(lemmatization(Extraction().extract_words_sent(entity['abstract']))))
    graph = SimGraph(tokens, WordNetSimilarity().word_similarity)
    ranked = graph.page_rank()
    top_words, _scores = zip(*Counter(ranked).most_common(10))
    assert top_words is not None
def __init__(self):
    """Initialize entity linking, extraction, similarity and query helpers."""
    self._query_graph = QueryGraph()
    self._yago = YagoTypeSimilarity()
    self._extracter = Extraction()
    self._linker = NameSPARQL()
class Matcher:

    """Concept-based entity match in DBpedia.

    Splits a query into concepts (common nouns) and entities (proper
    nouns), links them to Knowledge Graph URIs, and matches entities
    through SPARQL type queries.
    """

    def __init__(self, result_limit=5000, expansion=False, show_query=False):
        """
        semantic search of entities and concepts

        :param result_limit: maximum number of retrieved entities
        :param expansion: if conduct concept expansion
        :param show_query: if SPARQL query is shown
        """
        self._expansion = expansion
        self._show_query = show_query
        self._linker = NameSPARQL()
        self._extracter = Extraction()
        self._yago = YagoTypeSimilarity()
        self._query_graph = QueryGraph(result_limit)

    def type_links(self, word, lang='eng'):
        """Map a word to WordNet synsets and their YAGO/DBpedia links.

        :param word: word to link
        :param lang: WordNet language code of the word
        :return: list of dicts with name/gloss/lemma/lod keys; synsets
            without any resolved YAGO or DBpedia link are dropped.
        """
        synsets = self._yago.multilingual2synset(word, lang=lang)
        if self._expansion:
            synsets = list(set(itertools.chain.from_iterable(
                [self._yago.synset_expand(s) for s in synsets])))
        links = []
        for s in synsets:
            # Collect only links that actually resolved (was the
            # `append(x) if x else None` expression-statement anti-idiom).
            concept_link = []
            yago_link = self._yago.synset2yago(s)
            dbpedia_link = self._yago.synset2dbpedia(s)
            if yago_link:
                concept_link.append(yago_link)
            if dbpedia_link:
                concept_link.append(dbpedia_link)
            if concept_link:
                links.append({
                    'name': s.name(),
                    'gloss': s._definition,
                    'lemma': ' '.join(s._lemma_names),
                    'lod': concept_link,
                })
        return links

    def query_process(self, query):
        """
        Process query into concept (common noun) and entity (proper noun).
        Link them to Knowledge Graph uri links respectively.

        :param query: short text query
        :return: tuple of concepts and entities in uris.
        """
        entities = self._extracter.extract_chunks_sent(query)
        # Words already covered by an entity chunk must not also be
        # treated as concepts.
        entity_filter = set(itertools.chain.from_iterable(
            [e.lower().split() for e in entities]))
        concepts = [c for c in set(self._extracter.extract_nouns(query))
                    if c not in entity_filter]
        concept_uris = list(itertools.chain.from_iterable(
            [s['lod'] for c in concepts for s in self.type_links(c)]))
        entity_uris = list(itertools.chain.from_iterable(
            map(self._linker.name2entities, entities)))
        return list(set(concept_uris)), list(set(entity_uris))

    @staticmethod
    def _dedupe_by_uri(results):
        """Keep the first result for each distinct 'uri' value."""
        result_dic = {}
        for res in results:
            if res['uri'] not in result_dic:
                result_dic[res['uri']] = res
        return list(result_dic.values())

    def match_concepts(self, concepts, lang='en'):
        """Match entities of the given concept URIs.

        Concepts are sent five at a time per SPARQL query.

        :param concepts: list of concept uris
        :param lang: language of the result labels
        :return: list of result dicts, deduplicated by 'uri'
        """
        results = []
        # range() instead of Python-2-only xrange() for py2/py3 compat.
        for i in range(0, len(concepts), 5):
            results.extend(self._query_graph.type_query(
                concepts[i:i + 5], lang, self._show_query))
        return self._dedupe_by_uri(results)

    def match_type(self, query, lang='eng'):
        """Match entities whose types correspond to the query words.

        :param query: whitespace-separated words
        :param lang: WordNet language code of the query ('eng'/'spa'/'cmn')
        :return: list of result dicts, deduplicated by 'uri'
        """
        lang_map = {'eng': 'en', 'spa': 'es', 'cmn': 'zh'}
        result_lang = lang_map[lang]
        concept_uris = []
        for w in query.split():
            concept_uris.extend(itertools.chain.from_iterable(
                [s['lod'] for s in self.type_links(w, lang)]))
        return self.match_concepts(list(set(concept_uris)), result_lang)

    def match_entity_type(self, query):
        """Match entities constrained by both the query's concepts and entities.

        :param query: short text query
        :return: list of result dicts, deduplicated by 'uri'
        """
        results = []
        concepts, entities = self.query_process(query)
        for e in entities:
            for i in range(0, len(concepts), 5):
                results.extend(self._query_graph.type_entity_query(
                    concepts[i:i + 5], e, self._show_query))
        return self._dedupe_by_uri(results)