def get_author(plain_entity: str, return_plain=False, mode="book") -> Optional[str]:
    """
    Get the author (or, outside "book" mode, the P57 object) for a plain entity.

    Args:
        plain_entity: Wikidata id of the work; it is upper-cased before querying.
        return_plain: when true, return the chosen Wikidata id instead of its label.
        mode: "book" queries P50 forward and P800 backward with ``book_query_dict``;
            any other value queries P57 forward with an empty query dict.

    Returns:
        The label (or the id when ``return_plain`` is set) of the candidate with the
        smallest numeric id, or None when no usable candidate was returned or the
        chosen id is rejected by ``is_wikidata_entity``.
    """
    logger.info(f"Calling get_author for {plain_entity}")
    entity_id = plain_entity.upper()
    logger.debug(f"Search author with entity {entity_id}")
    if mode == "book":
        author_list = request_triples_wikidata(
            "find_object",
            [
                (entity_id, "P50", "forw"),
                (entity_id, "P800", "backw"),
            ],
            query_dict=book_query_dict,
        )
    else:
        author_list = request_triples_wikidata("find_object", [(entity_id, "P57", "forw")], query_dict={})
    logger.info(f"Author list received {author_list}")

    # Normalise every answer to its bare "Q<digits>" form *before* de-duplicating,
    # so that two spellings of the same entity collapse into one candidate.
    author_ids = set()
    for raw_author in itertools.chain.from_iterable(author_list):
        if not raw_author or not isinstance(raw_author, str):
            continue
        q_position = raw_author.find("Q")
        candidate = raw_author[q_position:] if q_position >= 0 else ""
        if candidate[1:].isdecimal():
            author_ids.add(candidate)
        else:
            # Previously such entries crashed int() in the sort key below.
            logger.warning(f"Skipping malformed author entity {raw_author}")

    # Smallest numeric id first.
    # NOTE(review): the old comment called this "by frequency"; presumably a low id
    # is used as a proxy for a well-known entity - confirm.
    sorted_author_list = sorted(author_ids, key=lambda x: int(x[1:]))
    if not sorted_author_list:
        return None
    author_entity = sorted_author_list[0]
    if return_plain:
        logger.info(f"Answer {author_entity}")
        return author_entity
    if is_wikidata_entity(author_entity):
        author_name = entity_to_label(author_entity)
        logger.info(f"Answer for get_author {author_name}")
        return author_name
    logger.warning(f"Wrong entity {author_entity}")
    return None
def what_is_book_about(book: Optional[str] = None) -> Optional[str]:
    """
    Fetch facts for a book

    Args:
        book: either a Wikidata id (as judged by ``is_wikidata_entity``) or a free-text
            title that is resolved through entity linking with ``BOOK_WIKI_TYPES``.

    Returns:
        A sentence (or several, space-separated) built from the first P921 (main subject),
        P840 (location) and - when either of those is missing - P674 (character) objects
        of the first linked book, or None when ``book`` is empty or nothing was found.
    """
    if not book:
        return None
    logger.info(f"Requesting for {book}")
    if is_wikidata_entity(book):
        plain_books = [book]
    else:
        plain_books, _ = request_entities_entitylinking(book, types=BOOK_WIKI_TYPES)
        logger.info(f"After request {plain_books}")

    # Sentences are collected and joined at the end. The previous version interpolated
    # the initial ``None`` into the f-strings, producing "None The action of ...".
    sentences = []
    if plain_books:
        plain_book = plain_books[0]

        def find_objects(property_id):
            # Objects of the first (and only) requested triple; [] on an empty response.
            response = request_triples_wikidata(
                "find_object", [(plain_book, property_id, "forw")], query_dict={}
            )
            return response[0] if response else []

        subjects = find_objects("P921")
        if subjects:
            sentences.append(f"The main subject of this book is {entity_to_label(subjects[0])}.")
        locations = find_objects("P840")
        if len(locations) > 1:
            sentences.append("Apart from other locations,")
        if locations:
            sentences.append(f"The action of this book takes place in {entity_to_label(locations[0])}.")
        if not subjects or not locations:
            # Characters are only looked up as a fallback when a fact is missing.
            characters = find_objects("P674")
            if characters:
                sentences.append(
                    f"One of the main characters of this book is {entity_to_label(characters[0])}."
                )
    fact = " ".join(sentences) if sentences else None
    logger.info(f"Final fact {fact}")
    return fact
def get_published_year(book_entity: str) -> Optional[str]:
    """
    Extract the publication date

    Queries the P577 objects of ``book_entity`` (which must satisfy ``is_wikidata_entity``),
    takes the first string found in the possibly nested response and, when it contains a
    run of 3-4 digits, passes the first such run through ``get_n_years``.

    Returns:
        The ``get_n_years`` result, the raw string when it holds no 3-4 digit run, or None
        when no string is found or ``get_n_years`` fails / returns a falsy value.
        NOTE(review): ``get_n_years`` is defined elsewhere; presumably it converts a year
        into "years ago" - confirm.
    """
    assert is_wikidata_entity(book_entity)
    book_entity = book_entity.strip()
    response = request_triples_wikidata(
        "find_object", [(book_entity, "P577", "forw")], query_dict=book_query_dict
    )
    logger.info(f"Received {response}")

    if isinstance(response, str):
        published_year = response
    else:
        # Support different formats of wikiparser output: peel nested lists
        # until the first element is no longer a list.
        while response and type(response[0]) is list:
            response = response[0]
        if not (response and type(response[0]) is str):
            return None
        published_year = response[0]

    year_candidates = re.findall(r"[\d]{3,4}", published_year)
    if year_candidates:
        try:
            published_year = get_n_years(year_candidates[0])  # Changed to return a string
            assert published_year
        except Exception:
            logger.exception(f"Could not obtain published year from {response}")
            return None
    logger.info(f"Answer for get_published_year {published_year}")
    return published_year
def get_plain_genres(plain_bookname: str) -> list:
    """
    Request the P136 objects of ``plain_bookname`` and unwrap the response.

    While the first element of the response is itself a list, the response is replaced
    by that first element - at most ``MAX_DEPTH`` times. The result is logged at debug
    level and returned.
    """
    MAX_DEPTH = 5
    plain_genres = request_triples_wikidata(
        "find_object", [(plain_bookname, "P136", "forw")], query_dict=book_query_dict
    )
    depth = 0
    while depth < MAX_DEPTH and plain_genres and isinstance(plain_genres[0], list):
        plain_genres = plain_genres[0]
        depth += 1
    logger.debug(f"Plain_genres {plain_genres}")
    return plain_genres
def get_booklist(plain_author_name: str) -> list:
    """
    Collect the Wikidata ids of the works linked to an author entity.

    Queries the P800 objects (forward) and P50 subjects (backward) of
    ``plain_author_name`` with ``book_query_dict``.

    Returns:
        A list of unique "Q<digits>" ids sorted ascending by their numeric part.
        Empty, non-string and malformed answers are skipped.
    """
    book_list = request_triples_wikidata(
        "find_object",
        [(plain_author_name, "P800", "forw"), (plain_author_name, "P50", "backw")],
        query_dict=book_query_dict,
    )
    # Normalise to the bare id first and de-duplicate afterwards, so that different
    # representations of one entity do not end up as duplicates in the result.
    book_ids = set()
    for raw_book in itertools.chain.from_iterable(book_list):
        if not raw_book or not isinstance(raw_book, str):
            continue
        q_position = raw_book.find("Q")
        candidate = raw_book[q_position:] if q_position >= 0 else ""
        # Anything that is not Q followed by digits would crash int() in the sort key.
        if candidate[1:].isdecimal():
            book_ids.add(candidate)
    return sorted(book_ids, key=lambda x: int(x[1:]))
def get_top_people_from_wiki_for_cobot_topic(cobot_topic, top_people):
    """
    Request the "find_top_people" result for the wiki occupations of a cobot topic.

    Args:
        cobot_topic: key into ``common_gossip.COBOT_TOPICS_TO_WIKI_OCCUPATIONS``.
        top_people: not used by this function.

    Returns:
        The second item of every truthy entry found under ``[0][0][0]`` of the
        response, or an empty list when the response (or its first element) is empty.
    """
    occupations = tuple(
        occupation[1] for occupation in common_gossip.COBOT_TOPICS_TO_WIKI_OCCUPATIONS[cobot_topic]
    )
    response = custom_requests.request_triples_wikidata("find_top_people", [occupations])
    if response:
        response = response[0]
    if not response:
        return []
    # if person is actually a ['Wikidata_ID', 'Display_Name']
    return [person[1] for person in response[0][0] if person]
def entity_to_label(entity):
    """
    Convert a Wikidata entity id into its label.

    Args:
        entity: Wikidata entity for which we need to receive the label.
            It should be a string with first letter Q followed by digits 0-9, like Q5321

    Returns:
        label: label of this entity ("" when the label lookup fails).
        If entity is in a wrong format we assume that it is already a label and return
        it unchanged, but report the problem to sentry and the error log.
    """
    logger.debug(f"Calling entity_to_label for {entity}")
    # The type check comes first so that indexing is never attempted on a non-string
    # (the previous check indexed truthy non-strings and could raise instead of
    # falling back). A bare "Q" without digits is not a valid id either.
    is_valid_entity = (
        isinstance(entity, str)
        and len(entity) > 1
        and entity[0] == "Q"
        and all(char in "0123456789" for char in entity[1:])
    )
    if not is_valid_entity:
        warning_text = f"Wrong entity format. We assume {entity} to be label but check the code"
        sentry_sdk.capture_exception(Exception(warning_text))
        # No exception is being handled here, so there is no traceback to attach.
        logger.error(warning_text)
        return entity
    label = ""
    labels = request_triples_wikidata("find_label", [(entity, "")])
    try:
        first_label = labels[0]
        # A quoted answer such as '"Some label"@en' keeps only the quoted part.
        label = first_label.split('"')[1] if '"' in first_label else first_label
        logger.debug(f"Answer {label}")
    except Exception as e:
        sentry_sdk.capture_exception(e)
        logger.exception(f"Exception in conversion of labels {labels}")
    return label
def get_name(
    annotated_phrase: dict,
    mode="author",
) -> Optional[Tuple[str, str, str]]:
    """
    Extract wiki entities of the specified type

    Args:
        annotated_phrase: dict whose "annotations" entry is inspected, in particular the
            "wiki_parser" annotation ("topic_skill_entities_info" and "entities_info").
        mode: "author", "book" or "movie"; selects AUTHOR_/BOOK_/MOVIE_WIKI_TYPES.

    Returns:
        None when the annotations contain no entities or when any exception occurs
        (the exception is sent to sentry and logged). Otherwise the tuple
        ``(entity, plain_entity, attribute)`` for the first entity with a matching
        type, where ``attribute`` is the film director in "movie" mode and
        ``n_years_ago`` in every other mode. ``entity`` and ``plain_entity`` are None
        when no entity matched.
    """
    plain_entity, found_entity, n_years_ago, attribute, film_director = None, None, None, None, None
    try:
        annotations = annotated_phrase.get("annotations", {})
        all_found_entities = get_raw_entity_names_from_annotations(annotations)
        if not all_found_entities:
            return None
        logger.info(f"Found entities in annotations {all_found_entities}")
        if mode == "author":
            types = AUTHOR_WIKI_TYPES
        elif mode == "book":
            types = BOOK_WIKI_TYPES
        elif mode == "movie":
            types = MOVIE_WIKI_TYPES
        else:
            raise Exception(f"Wrong mode: {mode}")

        wp_annotations = annotations.get("wiki_parser", {})
        if isinstance(wp_annotations, list):
            wp_annotations = wp_annotations[0]
        # Work on a copy: merging entities_info must not write into the caller's
        # "topic_skill_entities_info" annotation.
        toiterate_dict = dict(wp_annotations.get("topic_skill_entities_info", {}))
        for key, value in wp_annotations.get("entities_info", {}).items():
            toiterate_dict.setdefault(key, value)
        # To discern homonyms (e.g. serbian old king Stephen and Stephen King)
        # we sort by the length of the wikidata dict -
        # the more popular the person is, the more info we have and the sooner we get it
        keys = sorted(toiterate_dict, key=lambda x: -len(str(toiterate_dict[x])))
        toiterate_dict = {key: toiterate_dict[key] for key in keys}

        for entity, entity_info in toiterate_dict.items():
            found_types = []
            logger.debug(f"Examine {entity}")
            logger.debug(found_types)
            if "types_2hop" in entity_info:
                found_types.extend([j[0] for j in entity_info["types_2hop"] if j[0] not in found_types])
                logger.debug(found_types)
            if "instance of" in entity_info:
                found_types.extend([j[0] for j in entity_info["instance of"] if j[0] not in found_types])
                logger.debug(found_types)
            if not any(j in types for j in found_types):
                logger.warning(f"Querying wikidata for {entity}")
                found_types = []
                # NOTE(review): the request below does not depend on type_, so every type
                # gets the same answer - presumably type_ was meant to be part of the
                # query; confirm against the check_triplet API before changing.
                for type_ in types:
                    request_answer = request_triples_wikidata(
                        "check_triplet",
                        [(entity, "P31", "forw")],
                        query_dict=book_query_dict,
                    )
                    if isinstance(request_answer, list) and request_answer[0]:
                        found_types.append(type_)
            logger.debug(f"Found types {found_types}")
            logger.debug(f"Interception {[k for k in types if k in found_types]}")
            if any(j in types for j in found_types):
                logger.debug(f"{mode} found")
                found_entity = entity
                if "plain_entity" not in entity_info:
                    logger.warning(f"No plain_entity found in annotation for {entity}")
                    plain_entities, _ = request_entities_entitylinking(
                        entity, types=types, confidence_threshold=0.05
                    )
                    plain_entity = plain_entities[0]
                else:
                    plain_entity = entity_info["plain_entity"]
                if mode == "book":
                    if "publication date" in entity_info:
                        publication_year = entity_info["publication date"][0][0]
                    else:
                        # Fixed: the message lacked the f prefix and logged "{entity}" literally.
                        logger.warning(f"No publication date found in annotation for {entity}")
                        publication_year = get_published_year(plain_entity)
                    n_years_ago = get_n_years(publication_year)
                elif mode == "movie":
                    if "film director" in entity_info:
                        # Fixed: the key that was checked ("film director") is now the key
                        # that is read; previously "film producer" was read instead.
                        film_director = entity_info["film director"][0][0]
                    else:
                        film_director = get_author(plain_entity, mode="movie")
                elif mode == "author":
                    # to get rid of abbreviations such as J R R Tolkien
                    found_entity = " ".join([k for k in found_entity.split(" ") if len(k) > 1])
                    if "notable work" in entity_info:
                        # NOTE(review): this value is overwritten after the try block,
                        # so author mode always returns attribute=None - confirm intent.
                        attribute = random.choice(entity_info["notable work"])[1]
                break
            else:
                logger.info(f"No interception with {types}")
    except Exception as e:
        sentry_sdk.capture_exception(e)
        logger.exception(e)
        return None
    entity = found_entity
    attribute = film_director if mode == "movie" else n_years_ago
    logger.info(f"Answer for get_name {entity} {attribute}")
    return entity, plain_entity, attribute
def author_genres(plain_author_name: str) -> list:
    """
    Return the labels of the P136 objects (genres) of an author entity.

    Args:
        plain_author_name: Wikidata id of the author.

    Returns:
        A list with ``entity_to_label`` applied to every genre id; an empty list when
        the response is empty.
    """
    plain_genres = request_triples_wikidata(
        "find_object", [(plain_author_name, "P136", "forw")], query_dict=book_query_dict
    )
    # Other consumers of "find_object" in this file (e.g. get_plain_genres) treat the
    # response as nested lists, so unwrap it the same way before converting ids to
    # labels; mapping over the outer list would hand whole lists to entity_to_label.
    max_depth = 5
    depth = 0
    while depth < max_depth and plain_genres and isinstance(plain_genres[0], list):
        plain_genres = plain_genres[0]
        depth += 1
    if not plain_genres:
        return []
    return [entity_to_label(genre) for genre in plain_genres]