Example #1
0
def get_author(plain_entity: str, return_plain=False, mode="book") -> Optional[str]:
    """
    Get the author for a plain book entity (or, for other modes, the P57 object).

    Args:
        plain_entity: Wikidata id of the work; it is upper-cased before querying.
        return_plain: when True, return the Wikidata id of the author instead of its label.
        mode: "book" queries P50 (forward) and P800 (backward) with ``book_query_dict``;
            any other value queries P57 (forward) with an empty query dict.

    Returns:
        The author's label (or Wikidata id when ``return_plain`` is set), or None when
        no usable candidate was returned or the chosen candidate is not a wikidata entity.
    """
    logger.info(f"Calling get_author for {plain_entity}")
    entity_id = plain_entity.upper()
    logger.debug(f"Search author with entity {entity_id}")
    if mode == "book":
        author_list = request_triples_wikidata(
            "find_object",
            [
                (entity_id, "P50", "forw"),
                (entity_id, "P800", "backw"),
            ],
            query_dict=book_query_dict,
        )
    else:
        author_list = request_triples_wikidata("find_object", [(entity_id, "P57", "forw")], query_dict={})
    logger.info(f"Author list received {author_list}")

    # Unify representations first (keep everything from the first "Q" on) and only then
    # deduplicate, so the same id in two representations is counted once.
    # Values without a well-formed Q-id are skipped instead of crashing int() below.
    candidates = set()
    for raw_value in itertools.chain.from_iterable(author_list):
        if not isinstance(raw_value, str):
            continue
        q_position = raw_value.find("Q")
        if q_position == -1:
            continue
        candidate = raw_value[q_position:]
        if candidate[1:].isdecimal():
            candidates.add(candidate)
    if not candidates:
        return None

    # The candidate with the smallest numeric id wins.
    # NOTE(review): presumably a popularity heuristic (older ids ~ better-known entities) - confirm.
    author_entity = min(candidates, key=lambda x: int(x[1:]))
    if return_plain:
        logger.info(f"Answer {author_entity}")
        return author_entity
    if is_wikidata_entity(author_entity):
        author_name = entity_to_label(author_entity)
        logger.info(f"Answer for get_author {author_name}")
        return author_name
    logger.warning(f"Wrong entity {author_entity}")
    return None
Example #2
0
def what_is_book_about(book: Optional[str] = None) -> Optional[str]:
    """
    Fetch facts for a book

    Args:
        book: either a wikidata entity id or a free-text book name; a free-text name
            is resolved through entity linking restricted to ``BOOK_WIKI_TYPES``.

    Returns:
        A sentence (or several, space-separated) built from the first P921, P840 and,
        when either of those is missing, P674 object of the book; None when ``book``
        is empty, cannot be resolved, or no facts are found.
    """
    if not book:
        return None
    logger.info(f"Requesting for {book}")
    if is_wikidata_entity(book):
        plain_books = [book]
    else:
        plain_books, _ = request_entities_entitylinking(book, types=BOOK_WIKI_TYPES)
        logger.info(f"After request {plain_books}")

    # Sentences are collected in a list and joined at the end, so a missing subject
    # no longer leaks the literal text "None" into the returned fact.
    sentences = []
    if plain_books:
        plain_book = plain_books[0]

        def find_objects(wiki_property):
            # An empty answer is treated as "no objects" rather than raising IndexError.
            answer = request_triples_wikidata("find_object", [(plain_book, wiki_property, "forw")], query_dict={})
            return answer[0] if answer else []

        subjects = find_objects("P921")
        if subjects:
            sentences.append(f"The main subject of this book is {entity_to_label(subjects[0])}.")
        locations = find_objects("P840")
        if len(locations) > 1:
            sentences.append("Apart from other locations,")
        if locations:
            sentences.append(f"The action of this book takes place in {entity_to_label(locations[0])}.")
        if not subjects or not locations:
            characters = find_objects("P674")
            if characters:
                sentences.append(f"One of the main characters of this book is {entity_to_label(characters[0])}.")
    fact = " ".join(sentences) if sentences else None
    logger.info(f"Final fact {fact}")
    return fact
Example #3
0
def get_published_year(book_entity: str) -> Optional[str]:
    """
    Extract the publication date

    Queries P577 for ``book_entity`` and takes the first run of 3-4 digits from the
    answer; that run is passed through ``get_n_years`` and the result is returned.

    Returns:
        The ``get_n_years`` result; the raw answer string when it contains no 3-4 digit
        run; None when the answer has an unexpected shape, or when ``get_n_years``
        raises or returns a falsy value.
    """
    assert is_wikidata_entity(book_entity)
    wiki_answer = request_triples_wikidata(
        "find_object", [(book_entity.strip(), "P577", "forw")], query_dict=book_query_dict
    )
    logger.info(f"Received {wiki_answer}")

    if isinstance(wiki_answer, str):
        raw_date = wiki_answer
    else:
        # Support different formats of wikiparser output: peel nested lists
        # until the first element is no longer a list.
        while wiki_answer and type(wiki_answer[0]) is list:
            wiki_answer = wiki_answer[0]
        if not wiki_answer or type(wiki_answer[0]) is not str:
            return None
        raw_date = wiki_answer[0]

    answer = raw_date
    digit_runs = re.findall(r"[\d]{3,4}", raw_date)
    if digit_runs:
        try:
            answer = get_n_years(digit_runs[0])
            assert answer
        except Exception:
            logger.exception(f"Could not obtain published year from {wiki_answer}")
            return None
    logger.info(f"Answer for get_published_year {answer}")
    return answer
Example #4
0
def get_plain_genres(plain_bookname: str) -> list:
    """
    Request the P136 objects of ``plain_bookname`` and unwrap nested list layers.

    While the first element of the answer is itself a list, the answer is replaced
    by that element; at most five layers are removed.
    """
    max_depth = 5
    genres = request_triples_wikidata(
        "find_object", [(plain_bookname, "P136", "forw")], query_dict=book_query_dict
    )
    depth = 0
    while depth < max_depth and genres and isinstance(genres[0], list):
        genres = genres[0]
        depth += 1
    logger.debug(f"Plain_genres {genres}")
    return genres
Example #5
0
def get_booklist(plain_author_name: str) -> list:
    """
    Collect the Wikidata ids of the works linked to an author.

    Queries P800 (forward) and P50 (backward) for ``plain_author_name`` with
    ``book_query_dict``.

    Returns:
        A list of unique ids of the form ``Q<digits>``, sorted by ascending numeric id.
    """
    book_list = request_triples_wikidata(
        "find_object",
        [(plain_author_name, "P800", "forw"), (plain_author_name, "P50", "backw")],
        query_dict=book_query_dict,
    )
    # Unify representations first (keep everything from the first "Q" on) and only then
    # deduplicate, so the same id in two representations is listed once.
    # Empty values and values without a well-formed Q-id are skipped instead of crashing int().
    unique_books = set()
    for raw_value in itertools.chain.from_iterable(book_list):
        if not raw_value or not isinstance(raw_value, str):
            continue
        q_position = raw_value.find("Q")
        if q_position == -1:
            continue
        candidate = raw_value[q_position:]
        if candidate[1:].isdecimal():
            unique_books.add(candidate)
    return sorted(unique_books, key=lambda x: int(x[1:]))
Example #6
0
def get_top_people_from_wiki_for_cobot_topic(cobot_topic, top_people):
    """
    Return the names of the top people for the occupations mapped to ``cobot_topic``.

    The second item of every occupation entry for the topic is sent in a single
    "find_top_people" request. ``top_people`` is accepted but not used here.

    Returns:
        The second item of every non-empty person entry in the answer,
        or an empty list when the answer is empty.
    """
    occupations = tuple(
        occupation[1] for occupation in common_gossip.COBOT_TOPICS_TO_WIKI_OCCUPATIONS[cobot_topic]
    )
    answer = custom_requests.request_triples_wikidata("find_top_people", [occupations])
    if not answer:
        return []
    answer = answer[0]
    if not answer:
        return []

    names = []
    # each person is expected to be a ['Wikidata_ID', 'Display_Name'] pair
    for person in answer[0][0]:
        if person:
            names.append(person[1])
    return names
Example #7
0
def entity_to_label(entity):
    """
    Resolve a Wikidata entity id to its label.

    Args:
        entity: Wikidata entity for which we need to receive the label.
            It should be a string whose first letter is Q and whose other
            characters are digits from 0 to 9, like Q5321.

    Returns:
        label: label of this entity.
        If the entity is in a wrong format it is assumed to already be a label and is
        returned unchanged, after the problem is reported to sentry and the log.
        An empty string is returned when the label cannot be extracted from the answer.

    """
    logger.debug(f"Calling entity_to_label for {entity}")
    # The type is checked first so that non-string values are never indexed.
    is_entity_id = (
        isinstance(entity, str)
        and entity.startswith("Q")
        and all(char in "0123456789" for char in entity[1:])
    )
    if not is_entity_id:
        warning_text = f"Wrong entity format. We assume {entity} to be label but check the code"
        sentry_sdk.capture_exception(Exception(warning_text))
        # logger.exception is only meaningful inside an exception handler;
        # logger.error keeps the same level without a bogus traceback.
        logger.error(warning_text)
        return entity
    label = ""
    labels = request_triples_wikidata("find_label", [(entity, "")])
    try:
        sep = '"'
        first_label = labels[0]
        # A quoted answer carries the label between the first pair of quotes.
        label = first_label.split(sep)[1] if sep in first_label else first_label
        logger.debug(f"Answer {label}")
    except Exception as e:
        sentry_sdk.capture_exception(e)
        logger.exception(f"Exception in conversion of labels {labels}")
    return label
Example #8
0
def get_name(
    annotated_phrase: dict,
    mode="author",
) -> Optional[Tuple[str, str, str]]:
    """
    Extract wiki entities of the specified type

    Args:
        annotated_phrase: dict with an "annotations" entry; the "wiki_parser" annotation
            ("topic_skill_entities_info" and "entities_info") is inspected. It is not modified.
        mode: "author", "book" or "movie"; selects the wiki types to look for.
            Any other value is reported as an error and yields None.

    Returns:
        None when no raw entity names are found in the annotations or when any error
        occurs. Otherwise a tuple ``(entity, plain_entity, attribute)`` for the first
        entity of a matching type; all items are None when nothing matched.
        ``attribute`` is the film director for mode "movie", the ``get_n_years`` result
        for mode "book", and None for mode "author".
    """
    plain_entity, found_entity, n_years_ago, attribute, film_director = (
        None,
        None,
        None,
        None,
        None,
    )
    try:
        all_found_entities = get_raw_entity_names_from_annotations(annotated_phrase.get("annotations", {}))
        if not all_found_entities:
            return None
        logger.info(f"Found entities in annotations {all_found_entities}")
        if mode == "author":
            types = AUTHOR_WIKI_TYPES
        elif mode == "book":
            types = BOOK_WIKI_TYPES
        elif mode == "movie":
            types = MOVIE_WIKI_TYPES
        else:
            raise Exception(f"Wrong mode: {mode}")
        wp_annotations = annotated_phrase.get("annotations", {}).get("wiki_parser", {})
        if isinstance(wp_annotations, list):
            wp_annotations = wp_annotations[0]
        # Work on a copy: entries from "entities_info" are merged in below and the
        # caller's annotations must not be modified as a side effect.
        toiterate_dict = dict(wp_annotations.get("topic_skill_entities_info", {}))
        for key, info in wp_annotations.get("entities_info", {}).items():
            if key not in toiterate_dict:
                toiterate_dict[key] = info
        #  To discern homonyms (e.g. the old Serbian king Stephen and Stephen King)
        #  we sort by the length of the wikidata dict -
        #  the more popular the person is, the more info we have and the sooner we reach it
        keys = sorted(toiterate_dict, key=lambda x: -len(str(toiterate_dict[x])))
        toiterate_dict = {key: toiterate_dict[key] for key in keys}
        for entity in toiterate_dict:
            entity_info = toiterate_dict[entity]
            found_types = []
            logger.debug(f"Examine {entity}")
            logger.debug(found_types)
            if "types_2hop" in entity_info:
                found_types.extend([j[0] for j in entity_info["types_2hop"] if j[0] not in found_types])
            logger.debug(found_types)
            if "instance of" in entity_info:
                found_types.extend([j[0] for j in entity_info["instance of"] if j[0] not in found_types])
            logger.debug(found_types)
            if not any(j in types for j in found_types):
                logger.warning(f"Querying wikidata for {entity}")
                found_types = []
                for type_ in types:
                    # NOTE(review): the query does not mention type_, so every type gets the
                    # same answer - confirm the expected "check_triplet" arguments.
                    request_answer = request_triples_wikidata(
                        "check_triplet",
                        [(entity, "P31", "forw")],
                        query_dict=book_query_dict,
                    )
                    # An empty list means "no match" instead of raising IndexError.
                    if isinstance(request_answer, list) and request_answer and request_answer[0]:
                        found_types.append(type_)
            logger.debug(f"Found types {found_types}")
            logger.debug(f"Interception {[k for k in types if k in found_types]}")
            if any(j in types for j in found_types):
                logger.debug(f"{mode} found")
                found_entity = entity
                if "plain_entity" not in entity_info:
                    logger.warning(f"No plain_entity found in annotation for {entity}")
                    plain_entities, _ = request_entities_entitylinking(entity, types=types, confidence_threshold=0.05)
                    plain_entity = plain_entities[0]
                else:
                    plain_entity = entity_info["plain_entity"]
                if mode == "book":
                    if "publication date" in entity_info:
                        publication_year = entity_info["publication date"][0][0]
                    else:
                        logger.warning(f"No publication date found in annotation for {entity}")
                        publication_year = get_published_year(plain_entity)
                    n_years_ago = get_n_years(publication_year)
                elif mode == "movie":
                    if "film director" in entity_info:
                        # Read the same key that was just checked.
                        film_director = entity_info["film director"][0][0]
                    else:
                        film_director = get_author(plain_entity, mode="movie")
                elif mode == "author":  # to get rid of abbreviations such as J R R Tolkien
                    found_entity = " ".join([k for k in found_entity.split(" ") if len(k) > 1])
                    if "notable work" in entity_info:
                        # NOTE(review): this value is overwritten after the loop, so callers
                        # never receive it for mode "author" - confirm whether that is intended.
                        attribute = random.choice(entity_info["notable work"])[1]
                break
            else:
                logger.info(f"No interception with {types}")
    except Exception as e:
        sentry_sdk.capture_exception(e)
        logger.exception(e)
        return None
    entity = found_entity
    attribute = film_director if mode == "movie" else n_years_ago
    logger.info(f"Answer for get_name {entity} {attribute}")
    return entity, plain_entity, attribute
Example #9
0
def author_genres(plain_author_name: str) -> list:
    """
    Return the labels of the P136 objects of an author.

    The answer of ``request_triples_wikidata`` is unwrapped while its first element is
    itself a list (at most five layers), so both nested and flat answers are handled;
    every remaining item is converted with ``entity_to_label``.
    """
    plain_genres = request_triples_wikidata(
        "find_object", [(plain_author_name, "P136", "forw")], query_dict=book_query_dict
    )
    max_depth = 5
    for _ in range(max_depth):
        if not (plain_genres and isinstance(plain_genres[0], list)):
            break
        plain_genres = plain_genres[0]
    # An empty or missing answer yields an empty list of labels.
    return [entity_to_label(genre) for genre in plain_genres or []]