def get_wikidata_fields(
    pids: list,
    qids: list = [],
    id_qid_mapping: dict = {},
    pids_nolabel: list = [],
    replace_values_with_labels: bool = False,
) -> pd.DataFrame:
    """
    Get information for Wikidata items specified by a set of Wikidata QIDs. Return columns specified by a set of Wikidata PIDs.
        Optionally provide an internal ID-QID mapping to get the results grouped by internal ID.

    Args:
        pids (list): list of Wikidata PIDs
        qids (list, optional): list of Wikidata QIDs
        id_qid_mapping (dict, optional): {internal_id: [qids], ...}. An "id" column is added to the returned DataFrame to
            retain this mapping. Only one of `qids` and `id_qid_mapping` should be provided.
        pids_nolabel (list, optional): PIDs for which the value should be returned instead of the label. Any PIDs not included
            in `pids` will be added to the final result.
        replace_values_with_labels (bool, optional): whether to replace QIDs with labels for the fields for which labels are 
            retrieved. If False, labelled columns will be of the form "PxyLabel" and the original "Pxy" columns will be kept.
            Defaults to False.

    Returns:
        pd.DataFrame: table of Wikidata results
    """

    all_pids = list(set(pids + pids_nolabel))
    pids_label = list(set(all_pids) - set(pids_nolabel))

    if qids and id_qid_mapping:
        raise ValueError(
            "Only one of qids and id_qid_mapping should be provided.")
    elif id_qid_mapping:
        qids = list(set(flatten_list_of_lists(id_qid_mapping.values())))

    ent = wbentities()
    ent.get_properties(qids, all_pids, pids_label, replace_values_with_labels)
    res_df = (
        ent.get_results()
        .rename(columns={"id": "qid", "labels": "label", "descriptions": "description"})
        .rename(columns=lambda c: c.replace("claims.", "") if c.startswith("claims.") else c)
    )

    # check that every requested QID appears in the resulting dataframe
    assert len(set(qids)) == len(set(res_df["qid"].tolist())), "Some requested QIDs are missing from the Wikidata results"

    if id_qid_mapping:
        # group results by internal ID, adding an "id" column to retain the mapping
        grouped_dfs = []
        for item_id, item_qids in id_qid_mapping.items():
            tempdf = res_df.loc[res_df["qid"].isin(item_qids)].copy()
            if len(tempdf) > 0:
                tempdf["id"] = item_id
            grouped_dfs.append(tempdf)

        return pd.concat(grouped_dfs, ignore_index=True)

    else:
        return res_df
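
A minimal usage sketch for get_wikidata_fields, assuming network access to the Wikidata API; the PIDs, QIDs and internal record IDs below are purely illustrative:

# Hypothetical inputs: P31 (instance of) and P19 (place of birth) for two internal records.
# Because id_qid_mapping is passed, the returned DataFrame gains an "id" column retaining the
# internal-ID grouping, alongside "qid", "label", "description", one column per PID and
# "PxyLabel" columns for the labelled PIDs.
fields_df = get_wikidata_fields(
    pids=["P31", "P19"],
    id_qid_mapping={"record_1": ["Q937"], "record_2": ["Q1339", "Q254"]},
)
print(fields_df[["id", "qid", "label", "P31", "P31Label"]])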
Example #2
    def get_properties(
        self,
        qids: list,
        pids: list,
        pids_to_label: Union[list, str] = None,
        replace_values_with_labels: bool = False,
        page_size: int = 50,
    ) -> pd.DataFrame:
        """
        Get Wikidata properties specified by `pids` and `pids_to_label` for entities specified by `qids`.

        Args:
            qids (list): list of Wikidata entities
            pids (list): list of Wikidata properties
            pids_to_label (Union[list, str], optional): list of Wikidata properties to get labels for (if their values are entities).
                Use "all" to get labels for all PIDs; None to get labels for no PIDs; or a list of PIDs to get labels for a subset.
                If any PIDs in `pids_to_label` aren't in `pids`, values will still be returned for them. Defaults to None.
            replace_values_with_labels (bool, optional): whether to replace entity values with their labels, rather than keeping
                the original "Pxy" columns alongside added "PxyLabel" columns. Defaults to False.
            page_size (int, optional): page size for API calls. Defaults to 50.

        Returns:
            pd.DataFrame: table of specified property values for entities, with null values specified by empty strings.
        """
        res_generator = self.ge.result_generator(
            qids, page_limit=page_size, timeout=self.timeout
        )

        if pids_to_label is None:
            pids_all = list(set(pids))
        elif pids_to_label == "all":
            pids_all = list(set(pids))
            pids_to_label = pids_all
        elif isinstance(pids_to_label, list):
            pids_all = list(set(pids + pids_to_label))
        else:
            raise ValueError("pids_to_label must be a list of PIDs, 'all' or None")

        docs = flatten_list_of_lists(
            [
                simplify_wbgetentities_result(
                    doc, lang="en", properties=pids_all, use_redirected_qid=False
                )
                for doc in res_generator
            ]
        )
        doc_df = pd.json_normalize(docs)

        # add columns with empty string values for any that are missing
        proposed_cols = self._pids_to_df_cols(pids_all)
        actual_cols = [col for col in doc_df.columns if col.startswith("claims")]
        extra_cols = list(set(proposed_cols) - set(actual_cols))

        for c in extra_cols:
            doc_df[c] = ""

        self.doc_df = doc_df

        if pids_to_label is not None:
            self.get_labels_for_properties(
                pids_to_label, replace_qids=replace_values_with_labels
            )
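
For reference, a hedged sketch of driving get_properties directly through wbentities, as get_wikidata_fields does above; the entity and property IDs are placeholders:

# Fetch P31 (instance of) and P569 (date of birth) for two entities, resolving labels for all PIDs.
# Results are read back with get_results(), as in get_wikidata_fields above.
ent = wbentities()
ent.get_properties(
    qids=["Q937", "Q1339"],
    pids=["P31", "P569"],
    pids_to_label="all",
    replace_values_with_labels=False,
)
res_df = ent.get_results()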
Example #3
def load_adlib_orgs_data(adlib_people_data_path):
    # identifier in field_mapping
    table_name = "ORGANISATION"

    org_df = pd.read_csv(adlib_people_data_path,
                         low_memory=False,
                         nrows=max_records)

    # PREPROCESS
    org_df = org_df[org_df["type.type"] == "institution"]
    org_df = org_df.rename(columns={"admin.uid": "ID"})
    org_df = org_df.rename(columns={"name.0.value": "PREFERRED_NAME"})
    org_df = org_df.rename(columns={"use.0.summary_title": "SUMMARY_TITLE"})
    org_df = org_df.rename(
        columns={"lifecycle.birth.0.date.0.value": "BIRTH_DATE"})
    org_df = org_df.rename(
        columns={"lifecycle.death.0.date.0.value": "DEATH_DATE"})
    org_df = org_df.rename(columns={"nationality.0": "NATIONALITY"})
    org_df = org_df.rename(columns={"description.0.value": "BIOGRAPHY"})

    org_df["PREFIX"] = people_prefix

    org_df["URI"] = org_df["ID"].apply(lambda i: adlib_people_prefix + str(i))
    # if SUMMARY_TITLE exists, prefer it over PREFERRED_NAME as the label, and keep PREFERRED_NAME as an alias
    org_df["LABEL"] = org_df.apply(
        lambda row: row["SUMMARY_TITLE"]
        if not pd.isnull(row["SUMMARY_TITLE"]) else row["PREFERRED_NAME"],
        axis=1,
    )
    org_df["ALIAS"] = org_df.apply(
        lambda row: row["PREFERRED_NAME"]
        if not pd.isnull(row["SUMMARY_TITLE"]) else "",
        axis=1,
    )

    # remove newlines and tab chars
    org_df.loc[:, "BIOGRAPHY"] = org_df.loc[:, "BIOGRAPHY"].apply(
        datastore_helpers.process_text)
    org_df["NATIONALITY"] = org_df["NATIONALITY"].apply(
        datastore_helpers.split_list_string)

    # placeholder values so that create_org_disambiguating_description works
    org_df["OCCUPATION"] = [["nan"] for _ in range(len(org_df))]
    org_df["BIRTH_PLACE"] = ["nan" for _ in range(len(org_df))]
    org_df["DEATH_PLACE"] = ["nan" for _ in range(len(org_df))]

    org_df.loc[:, "DISAMBIGUATING_DESCRIPTION"] = org_df.apply(
        create_org_disambiguating_description, axis=1)

    org_df["BIRTH_DATE"] = org_df["BIRTH_DATE"].apply(get_year_from_date_value)
    org_df["DEATH_DATE"] = org_df["DEATH_DATE"].apply(get_year_from_date_value)
    org_df["NATIONALITY"] = org_df["NATIONALITY"].apply(
        lambda x: flatten_list_of_lists(
            [datastore_helpers.get_country_from_nationality(i) for i in x]))

    org_df["DATABASE"] = "adlib"

    logger.info("loading adlib orgs data")
    record_loader.add_records(table_name, org_df, add_type=WD.Q43229)
Example #4
def load_orgs_data(people_data_path):
    # identifier in field_mapping
    table_name = "ORGANISATION"

    org_df = pd.read_csv(people_data_path, low_memory=False, nrows=max_records)
    # TODO: use isIndividual flag here
    org_df = org_df[org_df["GENDER"] == "N"]

    # PREPROCESS
    org_df["URI"] = people_prefix + org_df["LINK_ID"].astype(str)

    org_df[["DESCRIPTION", "NOTE"]] = org_df[["DESCRIPTION",
                                              "NOTE"]].fillna("")
    org_df[["DESCRIPTION",
            "NOTE"]] = org_df[["DESCRIPTION", "NOTE"
                               ]].applymap(datastore_helpers.process_text)
    org_df[["OCCUPATION", "NATIONALITY"
            ]] = org_df[["OCCUPATION", "NATIONALITY"
                         ]].applymap(datastore_helpers.split_list_string)

    newline = " \n "  # can't insert into fstring below
    org_df.loc[:,
               "BIOGRAPHY"] = org_df[["DESCRIPTION", "NOTE"
                                      ]].apply(lambda x: f"{newline.join(x)}"
                                               if any(x) else "",
                                               axis=1)

    org_df["DISAMBIGUATING_DESCRIPTION"] = org_df.apply(
        create_org_disambiguating_description, axis=1)

    org_df["NATIONALITY"] = org_df["NATIONALITY"].apply(
        lambda x: flatten_list_of_lists(
            [datastore_helpers.get_country_from_nationality(i) for i in x]))

    org_df["BIRTH_DATE"] = org_df["BIRTH_DATE"].apply(get_year_from_date_value)
    org_df["DEATH_DATE"] = org_df["DEATH_DATE"].apply(get_year_from_date_value)
    org_df["DATABASE"] = "mimsy"

    logger.info("loading orgs data")
    record_loader.add_records(table_name, org_df)

    # also add type organization (Q43229)
    org_df["type_org"] = qid_to_url("Q43229")
    record_loader.add_triples(org_df,
                              RDF.type,
                              subject_col="URI",
                              object_col="type_org")

    return
Example #5
def load_blog_data(blog_data_path):
    blog_df = pd.read_json(blog_data_path)
    # blog_df = blog_df.head(100)  # for debugging
    blog_df["links"] = (
        blog_df["links"].apply(lambda i: flatten_list_of_lists(i.values(
        ))).apply(lambda url_list:
                  [normalise_collection_url(url) for url in url_list]))
    blog_df = blog_df.rename(columns={"url": "URI"})
    blog_df["text_by_paragraph"] = blog_df["text_by_paragraph"].apply(
        "\n".join)
    blog_df[["caption", "text_by_paragraph"
             ]] = blog_df[["caption", "text_by_paragraph"
                           ]].applymap(lambda i: process_text(i) if i else i)
    blog_df[["categories", "tags"
             ]] = blog_df[["categories", "tags"
                           ]].applymap(lambda lst: [i.lower() for i in lst])

    logger.info("loading blog data")
    record_loaders["blog"].add_records("BLOG_POST", blog_df)
Example #6
    def build_training_data(
        self,
        train: bool,
        page_size: int = 100,
        limit: int = None,
        search_limit=20,
    ) -> tuple:
        """
        Get training arrays X, y from all the records in the Heritage Connector index with an existing sameAs
        link to Wikidata.

        Args:
            train (bool): whether to build training data (True) or data for inference (False). If True a y vector
                is returned, otherwise it isn't.
            page_size (int, optional): the number of records to fetch from Wikidata per iteration. Larger numbers
                will speed up the process but may cause the SPARQL query to time out. Defaults to 100.
                (TODO: set better default)
            limit (int, optional): set a limit on the number of records to use for training (useful for testing).
                Defaults to None.
            search_limit (int, optional): number of search results to retrieve from the Wikidata dump per record.
                Defaults to 20.

        Returns:
            tuple: (X, y, X_columns, id_pair_list) if `train` is True, otherwise (X, X_columns, id_pair_list)
        """

        predicates = self._get_predicates()
        predicate_pid_mapping = get_wikidata_equivalents_for_properties(
            predicates)
        pids_ignore = (config.PIDS_IGNORE).split(" ")
        pids_categorical = (config.PIDS_CATEGORICAL).split(" ")

        # remove instanceof (P31) and add to end, as the type distance calculations are appended to X last
        predicate_pid_mapping = {
            k: url_to_pid(v)
            for k, v in predicate_pid_mapping.items()
            if v is not None and url_to_pid(v) not in pids_ignore + ["P31"]
        }
        # TODO: add P279 into here then combine P31 with P279 to form item_instanceof
        pids = list(predicate_pid_mapping.values()) + ["P31", "P279"]
        predicate_pid_mapping.update({RDFS.label: "label"})

        pids_geographical = self._get_geographic_properties(pids)

        X_list = []
        if train:
            y_list = []
        ent_similarity_list = []
        id_pair_list = []

        # get records to process from Elasticsearch
        search = es_text_search(index=config.ELASTIC_SEARCH_WIKI_INDEX)

        if train:
            search_res = self._get_labelled_records_from_sparql_store(limit)
        else:
            search_res = self._get_unlabelled_records_from_sparql_store(limit)

        search_res_paginated = paginate_generator(search_res, page_size)

        total = None if limit is None else math.ceil(limit / page_size)

        # for each record, get Wikidata results and create X: feature matrix and y: boolean vector (correct/incorrect match)
        for item_list in tqdm(search_res_paginated, total=total):
            id_qid_mapping = dict()
            qid_instanceof_mapping = dict()
            batch_instanceof_comparisons = []

            logger.debug("Running search")
            start = time.time()
            for item in item_list:
                # text search for Wikidata matches
                qids, qid_instanceof_temp = search.run_search(
                    item["label"],
                    limit=search_limit,
                    include_aliases=True,
                    return_instanceof=True,
                )
                id_qid_mapping[item["id"]] = qids
                qid_instanceof_mapping.update(qid_instanceof_temp)

            end = time.time()
            logger.debug(f"...search complete in {end-start}s")

            # get Wikidata property values for the batch
            logger.debug("Getting wikidata fields")
            start = time.time()
            wikidata_results_df = get_wikidata_fields(
                pids=pids, id_qid_mapping=id_qid_mapping)
            end = time.time()
            logger.debug(f"...retrieved in {end-start}s")

            wikidata_results_df = self._process_wikidata_results(
                wikidata_results_df)

            logger.debug("Calculating field similarities for batch..")
            # create X array for each record
            for item in item_list:
                # we get all the triples for the item here (rather than each triple in the for loop below)
                # to reduce the load on the SPARQL DB
                try:
                    item_triples = list(
                        self._get_triples_from_store(
                            (URIRef(item["id"]), None, None)))

                except:  # noqa: E722
                    # sparql store has crashed
                    sleep_time = 120
                    logger.debug(
                        f"get_triples query failed for item {item['id']}. Retrying in {sleep_time} seconds"
                    )
                    time.sleep(sleep_time)
                    self._open_sparql_store()
                    item_triples = list(
                        self._get_triples_from_store(
                            (URIRef(item["id"]), None, None)))

                X_temp = []
                qids_wikidata = wikidata_results_df.loc[
                    wikidata_results_df["id"] == item["id"], "qid"]

                if train:
                    item_qid = url_to_qid([
                        i for i in item_triples if i[0][1] == OWL.sameAs
                    ][0][0][-1])
                    y_item = [item_qid == qid for qid in qids_wikidata]

                id_pairs = [[item["id"], qid] for qid in qids_wikidata]

                # calculate instanceof distances
                try:
                    item_instanceof = [
                        url_to_qid(i[0][-1]) for i in item_triples
                        if i[0][1] == RDF.type
                    ]
                    wikidata_instanceof = wikidata_results_df.loc[
                        wikidata_results_df["id"] == item["id"],
                        "P31_and_P279"].tolist()

                    batch_instanceof_comparisons += [(
                        self._to_tuple(item_instanceof),
                        self._to_tuple(url_to_qid(q, raise_invalid=False)),
                    ) for q in wikidata_instanceof]
                except:  # noqa: E722
                    # TODO: better error handling here. Why does this fail?
                    logger.warning("Getting types for comparison failed.")

                    batch_instanceof_comparisons += [
                        (None, None) for q in range(
                            len(wikidata_results_df.loc[
                                wikidata_results_df["id"] == item["id"], :]))
                    ]

                for predicate, pid in predicate_pid_mapping.items():
                    item_values = [
                        i for i in item_triples if i[0][1] == URIRef(predicate)
                    ]

                    # RDFS.label is a special case that has no associated PID. We just want to compare it
                    # to the 'label' column which is the labels + aliases for each Wikidata item.
                    if predicate == RDFS.label:
                        item_labels = [
                            str(triple[0][-1]) for triple in item_values
                        ]
                        wikidata_labels = wikidata_results_df.loc[
                            wikidata_results_df["id"] == item["id"],
                            "label"].tolist()
                        sim_list = [
                            similarity_string(item_labels, label_list)
                            for label_list in wikidata_labels
                        ]

                    elif pid in pids_geographical:
                        item_values = self._to_tuple(
                            url_to_qid(
                                [triple[0][-1] for triple in item_values],
                                raise_invalid=False,
                            ))

                        wikidata_values = wikidata_results_df.loc[
                            wikidata_results_df["id"] == item["id"],
                            pid].tolist()

                        if len(item_values) == 0:
                            sim_list = [1] * len(wikidata_values)
                        else:
                            sim_list = [
                                get_distance_between_entities_multiple(
                                    {self._to_tuple(wiki_val), item_values},
                                    vertex_pid="P131",
                                    reciprocal=True,
                                ) for wiki_val in wikidata_values
                            ]

                    else:
                        wikidata_values = wikidata_results_df.loc[
                            wikidata_results_df["id"] == item["id"],
                            pid].tolist()
                        wikidata_labels = wikidata_results_df.loc[
                            wikidata_results_df["id"] == item["id"],
                            pid + "Label"].tolist()

                        if len(item_values) == 0:
                            # if the internal item has no values for the PID return zero similarity
                            # for this PID with each of the candidate QIDs
                            sim_list = [0] * len(wikidata_values)

                        else:
                            item_values = [
                                triple[0][-1] for triple in item_values
                            ]
                            item_values = flatten_list_of_lists([
                                self._replace_internal_id_with_sameas_or_label(
                                    val) if is_internal_uri(val) else val
                                for val in item_values
                            ])

                            if all([not bool(i) for i in item_values]):
                                sim_list = [0] * len(wikidata_values)

                            else:
                                if pid in pids_categorical:
                                    sim_list = [
                                        similarity_categorical(
                                            [str(i) for i in item_values],
                                            label,
                                            raise_on_diff_types=False,
                                        ) for label in wikidata_labels
                                    ]
                                else:
                                    sim_list = [
                                        compare(
                                            item_values,
                                            wikidata_values[i],
                                            wikidata_labels[i],
                                        ) for i in range(len(wikidata_values))
                                    ]

                    X_temp.append(sim_list)

                X_item = np.asarray(X_temp, dtype=np.float32).transpose()

                # TODO (checkpoint): here we would want to save X_list, y_list, id_pair_list, self.entity_distance_cache to disk
                X_list.append(X_item)

                if train:
                    y_list += y_item

                id_pair_list += id_pairs

            self._add_instanceof_distances_to_inmemory_cache(
                batch_instanceof_comparisons)

            for ent_1, ent_2 in batch_instanceof_comparisons:
                ent_similarity_list.append(self.entity_distance_cache[hash(
                    (ent_1, ent_2))])

        X = np.column_stack([np.vstack(X_list), ent_similarity_list])
        X_columns = list(predicate_pid_mapping.values()) + ["P31"]

        if train:
            y = np.asarray(y_list, dtype=bool)
            return X, y, X_columns, id_pair_list

        return X, X_columns, id_pair_list
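
A rough usage sketch for build_training_data; `linker` is an assumed instance of the class this method belongs to, and the limit is illustrative:

# Training mode returns the feature matrix, a boolean label vector, the feature (column) names
# and an (internal ID, QID) pair per row; inference mode omits y.
X, y, X_columns, id_pair_list = linker.build_training_data(train=True, limit=1000)
X_unlabelled, X_columns, id_pair_list_unlabelled = linker.build_training_data(train=False, limit=1000)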
Example #7
def load_people_data(people_data_path):
    """Load data from CSV files """

    # identifier in field_mapping
    table_name = "PERSON"

    people_df = pd.read_csv(people_data_path,
                            low_memory=False,
                            nrows=max_records)
    # TODO: use isIndividual flag here
    people_df = people_df[people_df["GENDER"].isin(["M", "F"])]

    # PREPROCESS
    people_df["URI"] = people_prefix + people_df["LINK_ID"].astype(str)
    # remove punctuation and capitalise first letter
    people_df["TITLE_NAME"] = people_df["TITLE_NAME"].apply(lambda i: str(
        i).capitalize().translate(str.maketrans("", "", string.punctuation)))
    people_df["PREFERRED_NAME"] = people_df["PREFERRED_NAME"].apply(
        reverse_person_preferred_name_and_strip_brackets)
    people_df["OCCUPATION"] = people_df["OCCUPATION"].apply(
        datastore_helpers.split_list_string)
    people_df["NATIONALITY"] = people_df["NATIONALITY"].apply(
        datastore_helpers.split_list_string)
    people_df[["DESCRIPTION", "NOTE"]] = people_df[["DESCRIPTION",
                                                    "NOTE"]].fillna("")

    # remove newlines and tab chars
    people_df.loc[:, ["DESCRIPTION", "NOTE"
                      ]] = people_df.loc[:, ["DESCRIPTION", "NOTE"]].applymap(
                          datastore_helpers.process_text)

    # create combined text fields
    newline = " \n "  # can't insert into fstring below
    people_df.loc[:, "BIOGRAPHY"] = people_df[["DESCRIPTION", "NOTE"]].apply(
        lambda x: f"{newline.join(x)}" if any(x) else "", axis=1)

    people_df["DISAMBIGUATING_DESCRIPTION"] = people_df.apply(
        create_people_disambiguating_description, axis=1)

    # all of these must happen after creating DISAMBIGUATING_DESCRIPTION, as they modify the text values
    # of fields it uses
    people_df["NATIONALITY"] = people_df["NATIONALITY"].apply(
        lambda x: flatten_list_of_lists(
            [datastore_helpers.get_country_from_nationality(i) for i in x]))

    people_df["BIRTH_PLACE"] = people_df["BIRTH_PLACE"].apply(
        lambda i: get_wikidata_uri_from_placename(i, False,
                                                  placename_qid_mapping))
    people_df["DEATH_PLACE"] = people_df["DEATH_PLACE"].apply(
        lambda i: get_wikidata_uri_from_placename(i, False,
                                                  placename_qid_mapping))
    people_df["BIRTH_DATE"] = people_df["BIRTH_DATE"].apply(
        get_year_from_date_value)
    people_df["DEATH_DATE"] = people_df["DEATH_DATE"].apply(
        get_year_from_date_value)
    people_df.loc[:, "GENDER"] = people_df.loc[:, "GENDER"].replace({
        "F":
        WD.Q6581072,
        "M":
        WD.Q6581097
    })

    people_df["DATABASE"] = "mimsy"

    logger.info("loading people data")
    record_loader.add_records(table_name, people_df, add_type=WD.Q5)
Example #8
def load_adlib_people_data(adlib_people_data_path):
    table_name = "PERSON"

    people_df = pd.read_csv(adlib_people_data_path,
                            low_memory=False,
                            nrows=max_records)

    # PREPROCESS
    people_df = people_df[people_df["type.type"] == "person"]
    people_df = people_df.rename(columns={"admin.uid": "ID"})
    people_df = people_df.rename(columns={"name.0.title_prefix": "TITLE_NAME"})
    people_df = people_df.rename(
        columns={"name.0.first_name": "FIRSTMID_NAME"})
    people_df = people_df.rename(columns={"name.0.last_name": "LASTSUFF_NAME"})
    people_df = people_df.rename(columns={"name.0.value": "PREFERRED_NAME"})
    people_df = people_df.rename(
        columns={"lifecycle.birth.0.date.0.value": "BIRTH_DATE"})
    people_df = people_df.rename(
        columns={"lifecycle.death.0.date.0.value": "DEATH_DATE"})
    people_df = people_df.rename(
        columns={"lifecycle.birth.0.place.0.summary_title": "BIRTH_PLACE"})
    people_df = people_df.rename(
        columns={"lifecycle.death.0.place.0.summary_title": "DEATH_PLACE"})
    people_df = people_df.rename(columns={"nationality.0": "NATIONALITY"})
    people_df = people_df.rename(columns={"description.0.value": "BIOGRAPHY"})
    people_df = people_df.rename(columns={"gender": "GENDER"})

    people_df["URI"] = adlib_people_prefix + people_df["ID"].astype(str)
    people_df["NATIONALITY"] = people_df["NATIONALITY"].apply(
        datastore_helpers.split_list_string)

    people_df["PREFERRED_NAME"] = people_df["PREFERRED_NAME"].apply(
        reverse_person_preferred_name_and_strip_brackets)

    # remove newlines and tab chars
    people_df.loc[:, "BIOGRAPHY"] = people_df.loc[:, "BIOGRAPHY"].apply(
        datastore_helpers.process_text)

    people_df.loc[:, "GENDER"] = people_df.loc[:, "GENDER"].replace({
        "female":
        WD.Q6581072,
        "male":
        WD.Q6581097
    })

    people_df["OCCUPATION"] = [["nan"] for _ in range(len(people_df))]
    people_df["CAUSE_OF_DEATH"] = ["nan" for _ in range(len(people_df))]

    people_df["DISAMBIGUATING_DESCRIPTION"] = people_df.apply(
        create_people_disambiguating_description, axis=1)

    people_df["NATIONALITY"] = people_df["NATIONALITY"].apply(
        lambda x: flatten_list_of_lists(
            [datastore_helpers.get_country_from_nationality(i) for i in x]))
    people_df["BIRTH_DATE"] = people_df["BIRTH_DATE"].apply(
        get_year_from_date_value)
    people_df["DEATH_DATE"] = people_df["DEATH_DATE"].apply(
        get_year_from_date_value)
    people_df["BIRTH_PLACE"] = people_df["BIRTH_PLACE"].apply(
        lambda i: get_wikidata_uri_from_placename(i, False,
                                                  adlib_placename_qid_mapping))
    people_df["DEATH_PLACE"] = people_df["DEATH_PLACE"].apply(
        lambda i: get_wikidata_uri_from_placename(i, False,
                                                  adlib_placename_qid_mapping))

    people_df["DATABASE"] = "adlib"

    logger.info("loading adlib people data")
    record_loader.add_records(table_name, people_df, add_type=WD.Q5)