Example #1
0
def link_mention(entity_tokens: List, entity_tags: Iterable = None) -> List:
    """
    Look up knowledge base entities whose label matches the given tokens.

    Punctuation tokens are dropped first. If every remaining token is an
    English stop word (or nothing remains), no query is issued and an empty
    list is returned. When the label query yields nothing and all supplied
    tags start with "NN", the lookup is retried once with the lemmatized
    tokens, provided lemmatization actually changed them (case-insensitively).
    The collected linkings are finally handed to
    ``_post_process_entity_linkings`` together with the filtered tokens.

    :param entity_tokens: a list of tokens
    :param entity_tags: optional, part of speech tags for the entity tokens
    :return: the post-processed linkings; the examples below show the shape
    >>> link_mention(['Martin', 'Luther', 'King', 'Junior'])
    [[('Q8027', 'Martin Luther King, Jr.'), ('Q6776048', 'Martin Luther King, Jr.')]]
    >>> link_mention(['movie'])
    [[('Q11424', 'film'), ('Q1179487', 'Movies'), ('Q6926907', 'Movies')]]
    >>> link_mention(['lord', 'of', 'the', 'rings'])
    [[('Q15228', 'The Lord of the Rings'), ('Q127367', 'The Lord of the Rings: The Fellowship of the Ring'), ('Q131074', 'The Lord of the Rings')]]
    >>> link_mention(['doin', 'me', ','])
    []
    >>> link_mention(['#justinbieber'])
    []
    """
    entity_tokens = [
        token for token in entity_tokens if token not in punctuation
    ]
    only_stop_words = all(
        token.lower() in utils.stop_words_en for token in entity_tokens)
    if only_stop_words:
        return []

    label_query = queries.query_get_entity_by_label(entity_tokens)
    linkings = wdaccess.query_wikidata(label_query)

    # Fallback: retry with lemmas, but only for all-noun mentions that
    # produced no hit. The tags are inspected only when that is the case.
    if not linkings and entity_tags:
        if all(tag.startswith("NN") for tag in entity_tags):
            lemmas = utils.lemmatize_tokens(entity_tokens)
            lowered_lemmas = [lemma.lower() for lemma in lemmas]
            lowered_tokens = [token.lower() for token in entity_tokens]
            if lowered_lemmas != lowered_tokens:
                lemma_query = queries.query_get_entity_by_label(lemmas)
                linkings += wdaccess.query_wikidata(lemma_query)

    return _post_process_entity_linkings(linkings, entity_tokens)
def map_wikipedia_id(wikipedia_article_id):
    """
    Map the given Wikipedia article URL (id) to a Wikidata id.

    The lookup proceeds in three stages and stops at the first hit:

    1. Query Wikidata with the id exactly as given.
    2. Ask the English Wikipedia API (with redirect resolution) for the
       canonical URL of the article and query Wikidata with that URL.
    3. Title-case every "_"-separated token of the id and, if that changes
       the id, repeat the whole procedure with the new id.

    Network errors raised by ``urllib.request.urlopen`` are not caught here.

    :param wikipedia_article_id: Wikipedia id as a string
    :return: Wikidata kbID, or None if no mapping could be found
    >>> map_wikipedia_id("PIAS_Entertainment_Group")
    'Q7119302'
    >>> map_wikipedia_id("Swimming_(sport)")
    'Q31920'
    >>> map_wikipedia_id("José_Reyes_(shortstop)")
    'Q220096'
    >>> map_wikipedia_id("The_Twilight_Saga:_New_Moon")
    'Q116928'
    >>> map_wikipedia_id("betty_ford_center")
    'Q850360'
    >>> map_wikipedia_id("1976_democratic_national_convention")
    'Q16152917'
    """
    results = wdaccess.query_wikidata(
        query_map_wikipedia_id(wikipedia_article_id))
    if results and 'e2' in results[0]:
        return results[0]['e2']

    api_url = (
        "https://en.wikipedia.org/w/api.php?action=query&redirects=1&format=json&prop=info&inprop=url&titles="
        + urllib.parse.quote(wikipedia_article_id))
    # The context manager guarantees the HTTP response is closed even if
    # decoding or JSON parsing fails.
    with urllib.request.urlopen(api_url) as response:
        encoding = response.info().get_content_charset("utf-8")
        json_response = json.loads(response.read().decode(encoding))

    if 'query' in json_response and 'pages' in json_response['query']:
        pages = list(json_response['query']['pages'].items())
        # An empty "pages" object would otherwise raise IndexError.
        if pages:
            page_id, page = pages[0]
            # JSON object keys are always strings, so a missing page is
            # reported under the key "-1" (never the integer -1).
            if page_id != "-1" and 'canonicalurl' in page:
                canonical_article_url = urllib.parse.unquote(
                    page['canonicalurl'])
                results = wdaccess.query_wikidata(
                    query_map_wikipedia_id(canonical_article_url))
                if results and 'e2' in results[0]:
                    return results[0]['e2']

    # Last resort: retry with title-cased tokens. str.title() is idempotent,
    # so the recursion goes at most one level deep.
    capitalized = "_".join(
        token.title() for token in wikipedia_article_id.split("_"))
    if capitalized != wikipedia_article_id:
        return map_wikipedia_id(capitalized)
    return None
def get_main_entity_label(entity):
    """
    Look up the main label of an entity.

    Only the first row returned by the label query is consulted.

    :param entity: entity KB ID
    :return: the label as a string, or None when the query returns no rows
        or the first row carries no 'label' field
    >>> get_main_entity_label("Q12143")
    'time zone'
    """
    label_query = query_get_main_entity_label(entity)
    rows = wdaccess.query_wikidata(label_query)
    if not rows:
        return None
    first_row = rows[0]
    if 'label' not in first_row:
        return None
    return first_row['label']
def get_mapped_entity_type(entity_id):
    """
    Determine the mapped type of the given entity.

    The classes of the entity are queried (not restricted to direct types)
    and their labels are matched against ``mapped_types_sorted`` in order.
    The first entry that occurs among the class labels wins and its value in
    ``mapped_types`` is returned.

    :param entity_id: entity KB ID
    :return: the mapped type for the first matching class label, or "other"
        if none of the entries in ``mapped_types_sorted`` matches
    """
    entity_types = wdaccess.query_wikidata(query_get_entity_classes(
        entity_id, only_direct_type=False),
                                           prefix=None)
    # Build the label set once instead of on every loop iteration.
    type_labels = {t['label'] for t in entity_types}
    for candidate in mapped_types_sorted:
        if candidate in type_labels:
            return mapped_types[candidate]
    return "other"
def map_f_id(f_id):
    """
    Translate a Freebase id into the corresponding Wikidata id.

    Before the lookup the id is normalized: every "." is replaced by "/" and
    a leading "/" is added if it is missing. Only the first result row is
    consulted.

    :param f_id: Freebase id as a string
    :return: Wikidata kbID, or None when the query returns no rows or the
        first row carries no 'e2' field
    """
    slashed = f_id.replace(".", "/")
    freebase_path = slashed if slashed.startswith("/") else "/" + slashed
    matches = wdaccess.query_wikidata(query_map_freebase_id(freebase_path))
    if not matches or 'e2' not in matches[0]:
        return None
    return matches[0]['e2']
def get_semantic_signature(entity):
    """
    Collect the semantic signature (related entities and relations) of an entity.

    The signature query results are passed through
    ``wdaccess.filter_relations`` (on field 'p', frequency threshold 10)
    before the two sets are assembled. The property id used in both sets is
    the row's 'p' value with its last character removed.

    :param entity: Wikidata id as a string
    :return: a tuple ``(related_entities, relations)`` where
        ``related_entities`` is a set of ``(label, e1, property id)`` tuples
        and ``relations`` is a set of ``(property label, property id)``
        tuples; the property label is None when the property is not found
        in ``wdscheme.property2label``
    >>> len(get_semantic_signature("Q76")[0])
    518
    >>> get_semantic_signature("Q179641") is not None
    True
    >>> get_semantic_signature("Q1963799")
    (set(), set())
    >>> ('instance of', 'P31') in get_semantic_signature("Q15862")[1]
    True
    """
    rows = wdaccess.query_wikidata(query_semantic_signature(entity))
    rows = wdaccess.filter_relations(rows, b='p', freq_threshold=10)

    related_entities = set()
    relations = set()
    for row in rows:
        property_id = row['p'][:-1]
        related_entities.add((row['label'], row['e1'], property_id))
        property_info = wdscheme.property2label.get(property_id, {})
        relations.add((property_info.get("label"), property_id))
    return related_entities, relations