def link_mention(entity_tokens: List, entity_tags: Iterable = None) -> List:
    """
    Link the given list of tokens to an entity in a knowledge base. The list is sorted by candidate entity frequency
    estimated on Wikipedia.

    Punctuation tokens are dropped first. If every remaining token is an English stop word (or nothing remains),
    no lookup is performed and an empty list is returned. If the label lookup yields nothing and all supplied
    tags start with "NN", the lookup is retried once with the lemmatized tokens, provided the lemmas differ
    from the tokens when compared case-insensitively.

    :param entity_tokens: a list of tokens
    :param entity_tags: optional, part of speech tags for the entity tokens
    :return: the linkings after post-processing by ``_post_process_entity_linkings``.
             NOTE(review): earlier documentation described dictionaries with a "kbID" field, while the
             examples below show lists of (kbID, label) tuples -- confirm the actual shape.

    >>> link_mention(['Martin', 'Luther', 'King', 'Junior'])
    [[('Q8027', 'Martin Luther King, Jr.'), ('Q6776048', 'Martin Luther King, Jr.')]]
    >>> link_mention(['movie'])
    [[('Q11424', 'film'), ('Q1179487', 'Movies'), ('Q6926907', 'Movies')]]
    >>> link_mention(['lord', 'of', 'the', 'rings'])
    [[('Q15228', 'The Lord of the Rings'), ('Q127367', 'The Lord of the Rings: The Fellowship of the Ring'), ('Q131074', 'The Lord of the Rings')]]
    >>> link_mention(['doin', 'me', ','])
    []
    >>> link_mention(['#justinbieber'])
    []
    """
    entity_tokens = [token for token in entity_tokens if token not in punctuation]

    # Nothing to look up when no token carries content (this also covers an empty token list).
    has_content_token = any(token.lower() not in utils.stop_words_en for token in entity_tokens)
    if not has_content_token:
        return []

    label_query = queries.query_get_entity_by_label(entity_tokens)
    linkings = wdaccess.query_wikidata(label_query)

    # Fallback: retry with lemmas, but only for all-noun mentions that produced no hit.
    retry_with_lemmas = (not linkings
                         and entity_tags
                         and all(tag.startswith("NN") for tag in entity_tags))
    if retry_with_lemmas:
        entity_lemmas = utils.lemmatize_tokens(entity_tokens)
        lowered_lemmas = [lemma.lower() for lemma in entity_lemmas]
        lowered_tokens = [token.lower() for token in entity_tokens]
        # Skip the second query when lemmatization did not change anything.
        if lowered_lemmas != lowered_tokens:
            lemma_query = queries.query_get_entity_by_label(entity_lemmas)
            linkings += wdaccess.query_wikidata(lemma_query)

    return _post_process_entity_linkings(linkings, entity_tokens)
def map_wikipedia_id(wikipedia_article_id):
    """
    Map the given Wikipedia article URL (id) to a Wikidata id.

    Resolution is attempted in three steps, returning as soon as one succeeds:

    1. Look the id up directly in Wikidata.
    2. Ask the English Wikipedia API (with redirects enabled) for the page's canonical URL and look
       that up in Wikidata.
    3. Title-case every "_"-separated token of the id and, if that changes the id, retry from step 1.

    :param wikipedia_article_id: Wikipedia id as a string
    :return: Wikidata kbID, or None if the article could not be mapped

    >>> map_wikipedia_id("PIAS_Entertainment_Group")
    'Q7119302'
    >>> map_wikipedia_id("Swimming_(sport)")
    'Q31920'
    >>> map_wikipedia_id("José_Reyes_(shortstop)")
    'Q220096'
    >>> map_wikipedia_id("The_Twilight_Saga:_New_Moon")
    'Q116928'
    >>> map_wikipedia_id("betty_ford_center")
    'Q850360'
    >>> map_wikipedia_id("1976_democratic_national_convention")
    'Q16152917'
    """
    results = wdaccess.query_wikidata(query_map_wikipedia_id(wikipedia_article_id))
    if results and 'e2' in results[0]:
        return results[0]['e2']

    api_url = ("https://en.wikipedia.org/w/api.php?action=query&redirects=1&format=json&prop=info&inprop=url&titles="
               + urllib.parse.quote(wikipedia_article_id))
    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(api_url) as response:
        encoding = response.info().get_content_charset("utf-8")
        json_response = json.loads(response.read().decode(encoding))

    if 'query' in json_response and 'pages' in json_response['query']:
        pages = list(json_response['query']['pages'].items())
        if pages:
            page_id, page_info = pages[0]
            # JSON object keys are strings, so the id has to be compared against "-1"
            # (a comparison with the integer -1 can never be equal).
            if str(page_id) != "-1" and 'canonicalurl' in page_info:
                canonical_article_url = urllib.parse.unquote(page_info['canonicalurl'])
                results = wdaccess.query_wikidata(query_map_wikipedia_id(canonical_article_url))
                if results and 'e2' in results[0]:
                    return results[0]['e2']

    # Last resort: retry once with every token title-cased (e.g. "betty_ford_center").
    capitalized = "_".join(token.title() for token in wikipedia_article_id.split("_"))
    if capitalized != wikipedia_article_id:
        return map_wikipedia_id(capitalized)
    return None
def get_main_entity_label(entity):
    """
    Retrieve the main label of the given entity. None is returned if no label could be found.

    Only the first row of the query result is inspected.

    :param entity: entity KB ID
    :return: entity label as a string, or None
    >>> get_main_entity_label("Q12143")
    'time zone'
    """
    rows = wdaccess.query_wikidata(query_get_main_entity_label(entity))
    if not rows:
        return None
    first_row = rows[0]
    if 'label' not in first_row:
        return None
    return first_row['label']
def get_mapped_entity_type(entity_id):
    """
    Map the given entity to one of the types defined in ``mapped_types``.

    The classes of the entity are queried (with ``only_direct_type=False``) and their labels are
    matched against ``mapped_types_sorted`` in order; the first label found decides the result.

    :param entity_id: entity KB ID
    :return: ``mapped_types[label]`` for the first label of ``mapped_types_sorted`` that occurs among
             the entity's class labels, or "other" if none of them occurs
    """
    entity_types = wdaccess.query_wikidata(
        query_get_entity_classes(entity_id, only_direct_type=False), prefix=None)
    # Build the label set once instead of once per candidate type.
    entity_type_labels = {t['label'] for t in entity_types}
    for candidate_type in mapped_types_sorted:
        if candidate_type in entity_type_labels:
            return mapped_types[candidate_type]
    return "other"
def map_f_id(f_id):
    """
    Map the given Freebase id to a Wikidata id.

    Before the lookup the id is normalized: every "." is replaced with "/" and a leading "/" is
    added if missing. Only the first row of the query result is inspected.

    :param f_id: Freebase id as a string
    :return: Wikidata kbID, or None if no mapping was found
    """
    normalized_id = f_id.replace(".", "/")
    if not normalized_id.startswith("/"):
        normalized_id = "/" + normalized_id

    rows = wdaccess.query_wikidata(query_map_freebase_id(normalized_id))
    if not rows:
        return None
    first_row = rows[0]
    if 'e2' not in first_row:
        return None
    return first_row['e2']
def get_semantic_signature(entity):
    """
    Extract the semantic signature (all related entities and relations) of the given entity.

    The query results are passed through ``wdaccess.filter_relations`` (``b='p'``,
    ``freq_threshold=10``) before the signature is assembled. The last character of each
    result's 'p' value is dropped to obtain the property id.

    :param entity: Wikidata id as a string
    :return: a tuple of two sets: related entities as (label, e1, property id) triples and
             relations as (property label, property id) pairs; the property label is None when
             the property has no entry in ``wdscheme.property2label``
    >>> len(get_semantic_signature("Q76")[0])
    518
    >>> get_semantic_signature("Q179641") is not None
    True
    >>> get_semantic_signature("Q1963799")
    (set(), set())
    >>> ('instance of', 'P31') in get_semantic_signature("Q15862")[1]
    True
    """
    rows = wdaccess.query_wikidata(query_semantic_signature(entity))
    rows = wdaccess.filter_relations(rows, b='p', freq_threshold=10)

    related_entities = set()
    relations = set()
    for row in rows:
        property_id = row['p'][:-1]
        related_entities.add((row['label'], row['e1'], property_id))
        property_label = wdscheme.property2label.get(property_id, {}).get("label")
        relations.add((property_label, property_id))
    return related_entities, relations