def get_unit_symbols(qids: List[str]) -> List[Dict]:
    """Function to get the unit symbols from the unit entities

    Args:
        qids: List of qids

    Returns:
        List of dicts containing the unit id and its unit symbol in English
    """
    print(datetime.datetime.now(), f"Starting with unit symbols")
    print(f"Total unit symbols to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    id_chunks = chunks(list(qids), chunk_size)
    for chunk in id_chunks:
        query_result = wikidata_entity_request(chunk,
                                               props=[CLAIMS],
                                               timeout=10)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue

            unit_symbol = map_wd_attribute.try_get_unit_symbol(
                result, PROPERTY_NAME_TO_PROPERTY_ID[UNIT_SYMBOL], UNIT_SYMBOL)

            subject_dict = {ID: qid, UNIT_SYMBOL: unit_symbol}
            extract_dicts.append(subject_dict)

        item_count += len(chunk)
        print(f"Status of unit symbols: {item_count}/{len(qids)}",
              end="\r",
              flush=True)

    print(datetime.datetime.now(), f"Finished with unit symbols")
    return extract_dicts
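# The chunked requests above rely on a chunks helper that is not part of this
# listing. A minimal sketch, assuming it simply yields fixed-size slices of the
# input list (the real helper used here may differ):
def chunks_sketch(items: List, chunk_size: int):
    """Yield successive chunk_size-sized slices from items (hypothetical helper)."""
    for start in range(0, len(items), chunk_size):
        yield items[start:start + chunk_size]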
def add_wikipedia_extracts(language_keys: Optional[List[str]] = lang_keys) -> None:
    """Add the wikipedia extracts to the already existing files

    Args:
        language_keys: Language keys to extract wikipedia abstracts for. Defaults to languageconfig.csv
    """
    for filename in [
        ARTWORK[PLURAL],
        MOTIF[PLURAL],
        GENRE[PLURAL],
        MATERIAL[PLURAL],
        MOVEMENT[PLURAL],
        ARTIST[PLURAL],
        LOCATION[PLURAL],
        CLASS[PLURAL],
    ]:
        print(
            datetime.datetime.now(),
            "Starting extracting wikipedia extracts with",
            filename,
        )
        try:
            with open(
                    (create_new_path(filename)).with_suffix(f".{JSON}"), encoding="utf-8"
            ) as file:
                if RECOVER_MODE and check_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename):
                    continue
                items = json.load(file)
                for key in language_keys:
                    item_indices_with_wiki_link_for_lang = [
                        items.index(item)
                        for item in items
                        if item[f"{WIKIPEDIA_LINK}_{key}"] != ""
                    ]
                    print(
                        f"There are {len(item_indices_with_wiki_link_for_lang)} {key}.wikipedia links within the {len(items)} {filename} items"
                    )

                    # retry the operation until it's done
                    done = False
                    # ToDo: The limit for extracts seems to be 20, there is an excontinue parameter which
                    # could be used to increase the performance and load more at once (50 is allowed by the API) if needed
                    # The request method has to be adjusted for this
                    # Further information https://stackoverflow.com/questions/9846795/prop-extracts-not-returning-all-extracts-in-the-wikimedia-api
                    chunk_size = 20

                    while not done:
                        try:
                            item_indices_chunks = chunks(
                                item_indices_with_wiki_link_for_lang, chunk_size
                            )
                            extracted_count = 0
                            # Fill JSON objects that have no wikipedia link with an empty abstract key-value pair (could be removed once the frontend is adjusted)
                            for j in range(len(items)):
                                if j not in item_indices_with_wiki_link_for_lang:
                                    items[j][f"{ABSTRACT}_{key}"] = ""

                            for chunk in item_indices_chunks:
                                # Get PageIds from URL https://en.wikipedia.org/w/api.php?action=query&titles=Jean_Wauquelin_presenting_his_'Chroniques_de_Hainaut'_to_Philip_the_Good
                                page_id_indices_dictionary = get_wikipedia_page_ids(
                                    items, chunk, key
                                )
                                # Get Extracts from PageId https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
                                raw_response = get_wikipedia_extracts(
                                    items, page_id_indices_dictionary, key
                                )
                                # add extracted abstracts to json objects
                                for i in chunk:
                                    items[i][f"{ABSTRACT}_{key}"] = raw_response[i]

                                extracted_count += len(chunk)
                                print(
                                    f"Extracts for {filename} and language {key} status: {extracted_count}/{len(item_indices_with_wiki_link_for_lang)}",
                                    end="\r",
                                    flush=True,
                                )

                                # If a chunk is finished and the chunk size is < 20 (e.g. the previous chunk failed but the current one succeeded): increase the chunk size
                                chunk_size = chunk_size + 5 if chunk_size < 20 else chunk_size


                            # set done to true after all items have been processed
                            done = True
                        except Exception as error:
                            logger.error(f"Fetching wikipedia extracs for {filename}, lang:{key} and chunk size:{chunk_size} failed!")
                            logger.error(error)

                            # lower chunk size and try again in while loop
                            chunk_size -= 5
                            if chunk_size > 0:
                                logger.info(f"Trying the wikipedia extracts again with chunk size:{chunk_size}")
                                continue
                            else:
                                logger.exception(error)
                                raise error

            # overwrite file
            with open(
                    (create_new_path(filename)).with_suffix(f".{JSON}"),
                    "w",
                    newline="",
                    encoding="utf-8",
            ) as file:
                json.dump(items, file, ensure_ascii=False)
            write_state(ETL_STATES.GET_WIKIPEDIA_EXTRACTS.EXTRACT_ABSTRACTS + filename)

        except Exception as error:
            print(
                f"Error when opening following file: {filename}. Error: {error}. Skipping file now."
            )
            continue
        print(
            datetime.datetime.now(),
            "Finished extracting wikipedia extracts with",
            filename,
        )
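# The helpers get_wikipedia_page_ids and get_wikipedia_extracts used above are
# not part of this listing. Below is a minimal sketch of the extracts request
# against the documented prop=extracts endpoint (see the URL in the comments
# above); the function name and the return shape are assumptions, and the
# mapping of page ids back to item indices is omitted.
import requests


def fetch_wikipedia_extracts_sketch(page_ids: List[int], key: str = "en") -> Dict[int, str]:
    """Hypothetical sketch: fetch plain-text intro abstracts for a list of page ids."""
    params = {
        "format": "json",
        "action": "query",
        "prop": "extracts",
        "exintro": "1",
        "explaintext": "1",
        "pageids": "|".join(str(page_id) for page_id in page_ids),
    }
    response = requests.get(f"https://{key}.wikipedia.org/w/api.php", params=params, timeout=10)
    response.raise_for_status()
    pages = response.json()["query"]["pages"]
    return {int(page_id): page.get("extract", "") for page_id, page in pages.items()}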
def get_exhibition_entities(
    qids: Set[str],
    language_keys: Optional[List[str]] = lang_keys,
    type_name: str = EXHIBITION,
) -> Dict[str, Dict]:
    """Function to get the exhibition entities from wikidata

    Args:
        qids: Distinct qid set to get the entities from
        language_keys: Language keys to extract label and description from. Defaults to languageconfig.csv
        type_name: OAB type name. Defaults to EXHIBITION.

    Returns:
        A dict with the qids as key and the JSON object as value
    """
    print(datetime.datetime.now(), f"Starting with exhibition entities")
    print(f"Total exhibition entities to extract: {len(qids)}")
    item_count = 0
    extract_dicts = {}
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    id_chunks = chunks(list(qids), chunk_size)
    for chunk in id_chunks:
        query_result = wikidata_entity_request(chunk)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue
            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name)
            description = map_wd_attribute.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN, type_name)
            start_time = map_wd_attribute.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[START_TIME], type_name)
            end_time = map_wd_attribute.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[END_TIME], type_name)

            extract_dicts.update({
                qid: {
                    LABEL[SINGULAR]: label,
                    DESCRIPTION[SINGULAR]: description,
                    START_TIME: start_time,
                    END_TIME: end_time,
                    TYPE: EXHIBITION,
                }
            })

            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name)
                description_lang = map_wd_attribute.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey, type_name)
                extract_dicts[qid][f"{LABEL[SINGULAR]}_{langkey}"] = label_lang
                extract_dicts[qid][
                    f"{DESCRIPTION[SINGULAR]}_{langkey}"] = description_lang

        item_count += len(chunk)
        print(
            f"Status of exhibition entities: {item_count}/{len(qids)}",
            end="\r",
            flush=True,
        )

    print(datetime.datetime.now(), f"Finished with exhibition entities")
    return extract_dicts
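# wikidata_entity_request, used throughout this listing, is not shown either.
# A minimal sketch against the public wbgetentities endpoint (using the
# requests import from the sketch above), assuming the real helper adds
# retries, language filtering and error handling on top of this:
def wikidata_entity_request_sketch(qids: List[str],
                                   props=("labels", "descriptions", "claims"),
                                   timeout: int = 5) -> Dict:
    """Hypothetical sketch: request up to 50 entities per call from the Wikidata API."""
    params = {
        "action": "wbgetentities",
        "format": "json",
        "ids": "|".join(qids),
        "props": "|".join(props),
    }
    response = requests.get("https://www.wikidata.org/w/api.php", params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()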
def get_entity_labels(
    type_name: str,
    qids: List[str],
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Function to get the entity labels from wikidata

    Args:
        type_name: oab type e. g. movement
        qids: List of qids to extract the labels from
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv

    Returns:
        List of dicts containing the qid and the labels for each language
    """
    print(datetime.datetime.now(),
          f"Starting with {type_name} {LABEL[PLURAL]}")
    print(f"Total {type_name} {LABEL[PLURAL]} to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    id_chunks = chunks(list(qids), chunk_size)
    for chunk in id_chunks:
        query_result = wikidata_entity_request(
            chunk, props=[LABEL[PLURAL]],
            timeout=10)  # country entities take longer so timeout is increased

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue

            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name)
            subject_dict = {
                ID: qid,
                LABEL[SINGULAR]: label,
            }

            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name)
                subject_dict.update(
                    {f"{LABEL[SINGULAR]}_{langkey}": label_lang})
            extract_dicts.append(subject_dict)

        item_count += len(chunk)
        print(
            f"Status of {type_name} {LABEL[PLURAL]}: {item_count}/{len(qids)}",
            end="\r",
            flush=True,
        )

    print(datetime.datetime.now(),
          f"Finished with {type_name} {LABEL[PLURAL]}")
    return extract_dicts
def get_classes(
    type_name: str,
    qids: List[str],
    already_extracted_superclass_ids: Set[str] = set(),
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Function to extract the classes of the extracted wikidata entities (meaning the 'instance of' attribute wikidata entity qids).
    Their subclasses are also extracted recursively (also called transitive closure)

    Args:
        type_name: oab type e. g. movement
        qids: List of qids to extract the labels from
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv
        already_extracted_superclass_ids: A list of already extracted superclass ids for the recursive calls,
        this is also the anchor to stop recursion

    Returns:
        Returns a list of dicts with the classes from the oab entities and their subclasses
    """
    print(datetime.datetime.now(), f"Starting with {type_name}")
    if type_name == CLASS[PLURAL]:
        print(
            f"Total {type_name} to extract (only 'instance_of' of the provided qids): {len(qids)}"
        )
    else:
        print(
            f"Total {type_name} to extract (only 'subclass_of' of the provided qids): {len(qids)}"
        )
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    classes_id_chunks = chunks(list(qids), chunk_size)
    for chunk in classes_id_chunks:
        query_result = wikidata_entity_request(chunk)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
            except Exception as error:
                logger.error(
                    "Error on qid, skipping item. Error: {0}".format(error))
                continue
            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name)
            description = map_wd_attribute.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN, type_name)
            subclass_of = map_wd_attribute.try_get_qid_reference_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[SUBCLASS_OF], type_name)
            class_dict = {
                ID: qid,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                SUBCLASS_OF: subclass_of,
            }

            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name)
                description_lang = map_wd_attribute.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey, type_name)
                class_dict.update({
                    f"{LABEL[SINGULAR]}_{langkey}":
                    label_lang,
                    f"{DESCRIPTION[SINGULAR]}_{langkey}":
                    description_lang,
                })
            extract_dicts.append(class_dict)

        item_count += len(chunk)
        print(f"Status of {type_name}: {item_count}/{len(qids)}",
              end="\r",
              flush=True)

    return load_entities_by_attribute_with_transitive_closure(
        extract_dicts,
        SUBCLASS_OF,
        CLASS[PLURAL],
        already_extracted_superclass_ids,
        get_classes,
        [],
    )
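# load_entities_by_attribute_with_transitive_closure is also not part of this
# listing. A hedged sketch of what it might do, assuming it collects the ids
# referenced under the given attribute that were not extracted yet and loads
# them recursively via the passed extraction function (the real helper also
# takes a list of allowed entity qids as a filter, which is omitted here):
def load_transitive_closure_sketch(extract_dicts: List[Dict], attribute: str,
                                   type_name: str, already_extracted_ids: Set[str],
                                   entity_extraction_func) -> List[Dict]:
    referenced_ids = set()
    for entity in extract_dicts:
        already_extracted_ids.add(entity[ID])
        referenced_ids.update(entity.get(attribute, []))
    missing_ids = referenced_ids - already_extracted_ids
    if not missing_ids:
        # Recursion anchor: every referenced entity was already extracted
        return extract_dicts
    # The extraction function (e. g. get_classes) calls this closure again,
    # so the recursion continues until no new ids are referenced
    extract_dicts += entity_extraction_func(type_name, list(missing_ids),
                                            already_extracted_ids)
    return extract_dicts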
def get_subject(
    type_name: str,
    qids: List[str],
    already_extracted_movement_ids: Set[str] = set(),
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Extract subjects (in our definition everything except artworks e. g. movements, motifs, etc.) from wikidata

    Args:
        type_name: oab type name e. g. movements (Caution: type names are always plural here)
        qids: A list of qids extracted from the artworks
        already_extracted_movement_ids: Set of already extracted movement ids, used as the recursion anchor for the transitive closure over movements
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv

    Returns:
        A list of dicts with the subjects transformed from wikidata entities to oab entities
    """
    print(datetime.datetime.now(), f"Starting with {type_name}")
    print(f"Total {type_name} to extract: {len(qids)}")
    item_count = 0
    extract_dicts = []
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    subject_id_chunks = chunks(list(qids), chunk_size)
    for chunk in subject_id_chunks:
        query_result = wikidata_entity_request(chunk)

        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            subject_dict = map_wd_response.try_map_response_to_subject(
                result, type_name)
            if subject_dict is None:
                continue
            if type_name == MOVEMENT[PLURAL] or type_name == ARTIST[PLURAL]:
                influenced_by = map_wd_attribute.try_get_qid_reference_list(
                    result, PROPERTY_NAME_TO_PROPERTY_ID[INFLUENCED_BY],
                    type_name)
                subject_dict.update({INFLUENCED_BY: influenced_by})
            if type_name == MOVEMENT[PLURAL]:
                subject_dict.update(
                    map_wd_response.try_map_response_to_movement(result))
                already_extracted_movement_ids.add(subject_dict[ID])
            if type_name == ARTIST[PLURAL]:
                subject_dict.update(
                    map_wd_response.try_map_response_to_artist(result))
            if type_name == LOCATION[PLURAL]:
                subject_dict.update(
                    map_wd_response.try_map_response_to_location(result))
            extract_dicts.append(subject_dict)

        item_count += len(chunk)
        print(f"Status of {type_name}: {item_count}/{len(qids)}",
              end="\r",
              flush=True)

    if type_name == MOVEMENT[PLURAL]:
        extract_dicts = load_entities_by_attribute_with_transitive_closure(
            extract_dicts,
            PART_OF,
            MOVEMENT[PLURAL],
            already_extracted_movement_ids,
            get_subject,
            [ART_MOVEMENT[ID], ART_STYLE[ID]],
        )
        extract_dicts = load_entities_by_attribute_with_transitive_closure(
            extract_dicts,
            HAS_PART,
            MOVEMENT[PLURAL],
            already_extracted_movement_ids,
            get_subject,
            [ART_MOVEMENT[ID], ART_STYLE[ID]],
        )
        return extract_dicts

    print(datetime.datetime.now(), f"Finished with {type_name}")
    return extract_dicts
def extract_artworks(
    type_name: str,
    wikidata_id: str,
    already_crawled_wikidata_items: Set,
    dev_mode: bool,
    dev_chunk_limit: int,
    language_keys: Optional[List[str]] = lang_keys,
) -> List[Dict]:
    """Extracts artworks metadata from Wikidata and stores them in a dictionary.

    Args:
        type_name: Type name of an artwork e. g. 'drawings'. Important for console output
        wikidata_id: Wikidata Id of a class; all instances of this class and all subclasses with image will be loaded. See artworks_ids_query.sparql
        already_crawled_wikidata_items: Set of all already crawled artwork items. Because the types have common items it is necessary to avoid loading items multiple times
        language_keys: All language keys which should be extracted. Defaults to languageconfig.csv
        dev_mode: To reduce the number of loaded chunks set this to true
        dev_chunk_limit: Limit of chunks per category
    Returns:
        A list with all artwork entity dicts (or JSON-objects) which are transformed for the OAB

    Examples:
        extract_artworks('drawings', 'wd:Q93184', ('en', 'de'))
        extract_artworks('sculptures', 'wd:Q860861', ('en', 'de'))
        extract_artworks('paintings', 'wd:Q3305213', ('en', 'de'))
    """
    print(datetime.datetime.now(), "Starting with", type_name)

    extract_dicts = []
    chunk_count = 0
    item_count = 0
    artwork_ids = query_artwork_qids(type_name, wikidata_id)

    # Don't load items again, if they were loaded in another artwork category
    artwork_ids = [
        artwork_id for artwork_id in artwork_ids
        if artwork_id not in already_crawled_wikidata_items
    ]

    print(
        f"{len(artwork_ids)} {type_name} entries are not loaded yet, starting now. Already crawled item count is {len(already_crawled_wikidata_items)}"
    )
    chunk_size = 50  # The chunksize 50 is allowed by the wikidata api, bigger numbers need special permissions
    artwork_id_chunks = chunks(artwork_ids, chunk_size)
    for chunk in artwork_id_chunks:
        if dev_mode and chunk_count == dev_chunk_limit:
            logger.info(
                f"DEV_CHUNK_LIMIT of {type_name} reached. End extraction for {type_name}"
            )
            break

        query_result = wikidata_entity_request(chunk)
        if ENTITIES not in query_result:
            logger.error("Skipping chunk")
            continue

        for result in query_result[ENTITIES].values():
            try:
                qid = result[ID]
                image = map_wd_attribute.get_image_url_by_name(
                    result[CLAIMS][PROPERTY_NAME_TO_PROPERTY_ID[IMAGE]][0]
                    [MAINSNAK][DATAVALUE][VALUE])
            except Exception as error:
                logger.error(
                    "Error on qid or image, skipping item. Qid: {0}, Error: {1}"
                    .format(result.get(ID, "unknown"), error))
                continue

            label = map_wd_attribute.try_get_label_or_description(
                result, LABEL[PLURAL], EN, type_name)
            description = map_wd_attribute.try_get_label_or_description(
                result, DESCRIPTION[PLURAL], EN, type_name)

            (
                classes,
                artists,
                locations,
                genres,
                movements,
                materials,
                motifs,
                main_subjects,
                exhibition_history,
            ) = map_wd_attribute.get_attribute_values_with_try_get_func(
                result,
                [
                    CLASS[SINGULAR],
                    ARTIST[SINGULAR],
                    LOCATION[SINGULAR],
                    GENRE[SINGULAR],
                    MOVEMENT[SINGULAR],
                    MATERIAL[SINGULAR],
                    MOTIF[SINGULAR],
                    MAIN_SUBJECT[SINGULAR],
                    EXHIBITION_HISTORY,
                ],
                type_name,
                map_wd_attribute.try_get_qid_reference_list,
            )

            iconclasses = map_wd_attribute.try_get_value_list(
                result, PROPERTY_NAME_TO_PROPERTY_ID[ICONCLASS[SINGULAR]],
                type_name)
            inception = map_wd_attribute.try_get_year_from_property_timestamp(
                result, PROPERTY_NAME_TO_PROPERTY_ID[INCEPTION], type_name)
            country = map_wd_attribute.try_get_first_qid(
                result, PROPERTY_NAME_TO_PROPERTY_ID[COUNTRY], type_name)

            # Resolve dimensions
            # The units are qids which have to be resolved later
            (
                height,
                width,
                length,
                diameter,
            ) = map_wd_attribute.get_attribute_values_with_try_get_func(
                result,
                [HEIGHT, WIDTH, LENGTH, DIAMETER],
                type_name,
                map_wd_attribute.try_get_dimension_value,
            )
            (
                height_unit,
                width_unit,
                length_unit,
                diameter_unit,
            ) = map_wd_attribute.get_attribute_values_with_try_get_func(
                result,
                [HEIGHT, WIDTH, LENGTH, DIAMETER],
                type_name,
                map_wd_attribute.try_get_dimension_unit,
            )

            significant_events = map_wd_attribute.try_get_significant_events(
                result)

            artwork_dictionary = {
                ID: qid,
                CLASS[PLURAL]: classes,
                LABEL[SINGULAR]: label,
                DESCRIPTION[SINGULAR]: description,
                IMAGE: image,
                ARTIST[PLURAL]: artists,
                LOCATION[PLURAL]: locations,
                GENRE[PLURAL]: genres,
                MOVEMENT[PLURAL]: movements,
                INCEPTION: inception,
                MATERIAL[PLURAL]: materials,
                MOTIF[PLURAL]: motifs,
                COUNTRY: country,
                HEIGHT: height,
                HEIGHT_UNIT: height_unit,
                WIDTH: width,
                WIDTH_UNIT: width_unit,
                LENGTH: length,
                LENGTH_UNIT: length_unit,
                DIAMETER: diameter,
                DIAMETER_UNIT: diameter_unit,
                ICONCLASS[PLURAL]: iconclasses,
                MAIN_SUBJECT[PLURAL]: main_subjects,
                EXHIBITION_HISTORY: exhibition_history,
                SIGNIFICANT_EVENT: significant_events,
                TYPE: ARTWORK[SINGULAR],
            }

            # Apply blocklist to artwork dictionary
            for t in [
                    CLASS[PLURAL], ARTIST[PLURAL], LOCATION[PLURAL],
                    GENRE[PLURAL], MOVEMENT[PLURAL], MATERIAL[PLURAL],
                    MOTIF[PLURAL], ICONCLASS[PLURAL], MAIN_SUBJECT[PLURAL],
                    EXHIBITION_HISTORY
            ]:
                try:
                    artwork_dictionary[t] = list(
                        set(artwork_dictionary[t]) - set(BLOCKLIST))
                except Exception as e:
                    logger.exception(e)
                    continue

            for langkey in language_keys:
                label_lang = map_wd_attribute.try_get_label_or_description(
                    result, LABEL[PLURAL], langkey, type_name)
                description_lang = map_wd_attribute.try_get_label_or_description(
                    result, DESCRIPTION[PLURAL], langkey, type_name)
                wikipedia_link_lang = map_wd_attribute.try_get_wikipedia_link(
                    result, langkey, type_name)
                artwork_dictionary.update({
                    f"{LABEL[SINGULAR]}_{langkey}":
                    label_lang,
                    f"{DESCRIPTION[SINGULAR]}_{langkey}":
                    description_lang,
                    f"{WIKIPEDIA_LINK}_{langkey}":
                    wikipedia_link_lang,
                })
            extract_dicts.append(artwork_dictionary)
            already_crawled_wikidata_items.add(qid)

        item_count += len(chunk)
        print(
            f"Status of {type_name}: {item_count}/{len(artwork_ids)}",
            end="\r",
            flush=True,
        )

        chunk_count += 1

    print(datetime.datetime.now(), "Finished with", type_name)
    return extract_dicts
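# Hypothetical usage sketch based on the docstring examples above: the
# already-crawled set is shared across the categories so items belonging to
# several categories are only loaded once. The dev limits are illustrative.
if __name__ == "__main__":
    crawled_items: Set[str] = set()
    all_artworks: List[Dict] = []
    for category_name, class_qid in [
        ("drawings", "wd:Q93184"),
        ("sculptures", "wd:Q860861"),
        ("paintings", "wd:Q3305213"),
    ]:
        all_artworks += extract_artworks(category_name, class_qid, crawled_items,
                                         dev_mode=True, dev_chunk_limit=2)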
def add_wikipedia_extracts(
    language_keys: Optional[List[str]] = lang_keys, ) -> None:
    """Add the wikipedia extracts to the already existing files

    Args:
        language_keys: Language keys to extract wikipedia abstracts for. Defaults to languageconfig.csv
    """
    for filename in [
            ARTWORK[PLURAL],
            MOTIF[PLURAL],
            GENRE[PLURAL],
            MATERIAL[PLURAL],
            MOVEMENT[PLURAL],
            ARTIST[PLURAL],
            LOCATION[PLURAL],
    ]:
        print(
            datetime.datetime.now(),
            "Starting extracting wikipedia extracts with",
            filename,
        )
        try:
            with open((create_new_path(filename)).with_suffix(f".{JSON}"),
                      encoding="utf-8") as file:
                items = json.load(file)
                for key in language_keys:
                    item_indices_with_wiki_link_for_lang = [
                        items.index(item) for item in items
                        if item[f"{WIKIPEDIA_LINK}_{key}"] != ""
                    ]
                    print(
                        f"There are {len(item_indices_with_wiki_link_for_lang)} {key}.wikipedia links within the {len(items)} {filename} items"
                    )
                    # ToDo: The limit for extracts seems to be 20, there is an excontinue parameter which
                    # could be used to increase the performance and load more at once (50 is allowed by the API) if needed
                    # The request method has to be adjusted for this
                    # Further information https://stackoverflow.com/questions/9846795/prop-extracts-not-returning-all-extracts-in-the-wikimedia-api
                    chunk_size = 20
                    item_indices_chunks = chunks(
                        item_indices_with_wiki_link_for_lang, chunk_size)
                    extracted_count = 0
                    # Fill JSON objects that have no wikipedia link with an empty abstract key-value pair (could be removed once the frontend is adjusted)
                    for j in range(len(items)):
                        if j not in item_indices_with_wiki_link_for_lang:
                            items[j][f"{ABSTRACT}_{key}"] = ""

                    for chunk in item_indices_chunks:
                        # Get PageIds from URL https://en.wikipedia.org/w/api.php?action=query&titles=Jean_Wauquelin_presenting_his_'Chroniques_de_Hainaut'_to_Philip_the_Good
                        page_id_indices_dictionary = get_wikipedia_page_ids(
                            items, chunk, key)
                        # Get Extracts from PageId https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&pageids=70889|1115370
                        raw_response = get_wikipedia_extracts(
                            items, page_id_indices_dictionary, key)
                        # add extracted abstracts to json objects
                        for i in chunk:
                            items[i][f"{ABSTRACT}_{key}"] = raw_response[i]

                        extracted_count += len(chunk)
                        print(
                            f"Extracts for {filename} and language {key} status: {extracted_count}/{len(item_indices_with_wiki_link_for_lang)}",
                            end="\r",
                            flush=True,
                        )

            # overwrite file
            with open(
                (create_new_path(filename)).with_suffix(f".{JSON}"),
                    "w",
                    newline="",
                    encoding="utf-8",
            ) as file:
                file.write(json.dumps(items, ensure_ascii=False))
        except Exception as error:
            print(
                f"Error when opening following file: {filename}. Error: {error}. Skipping file now."
            )
            continue
        print(
            datetime.datetime.now(),
            "Finished extracting wikipedia extracts with",
            filename,
        )