Example #1
    def test2_timeout_nocache(self):

        dbpedia = RemoteEndpoint("http://dbpedia.org/sparql/",
                                 timeout=1,
                                 retries=0)
        query = "SELECT ?label ?uri WHERE { ?uri rdfs:label ?label . filter (str(?label) =\"test\")}"

        capturedOutput = io.StringIO()
        sys.stdout = capturedOutput
        endpoint_wrapper(query, dbpedia, caching=False)
        sys.stdout = sys.__stdout__

        assert capturedOutput.getvalue() == 'timed out\n'
Example #2
    def test6_prefix_lookup_true(self):

        query = "SELECT DISTINCT ?name WHERE {<http://dbpedia.org/resource/Bavaria> dbp:name ?name }"

        expected_result = pd.DataFrame({"name": ["Free State of Bavaria"]})

        result = endpoint_wrapper(query, DBpedia, prefix_lookup=True)

        pd.testing.assert_frame_equal(result, expected_result, check_like=True)
Example #3
    def test4_initial_offset_nocache(self):

        dbpedia = RemoteEndpoint("http://dbpedia.org/sparql/", page_size=1)

        query = "SELECT DISTINCT ?uri WHERE { ?uri rdfs:label ?label . filter(?label =\"Bayern\"@en)} LIMIT 1 OFFSET 2"

        expected_result = pd.DataFrame(
            {"uri": ["http://www.wikidata.org/entity/Q4874432"]})

        result = endpoint_wrapper(query, dbpedia, caching=False)

        pd.testing.assert_frame_equal(result, expected_result, check_like=True)
Example #4
    def test7_prefix_lookup_json(self):

        query = "SELECT DISTINCT ?homepage WHERE {<http://dbpedia.org/resource/Michael_Wendler> der-wendler:homepage ?homepage }"

        expected_result = pd.DataFrame(
            {"homepage": ["http://www.michaelwendler.de/"]})

        result = endpoint_wrapper(
            query,
            DBpedia,
            prefix_lookup="test/data/sparql_helper/prefixes_test7.json")

        pd.testing.assert_frame_equal(result, expected_result, check_like=True)
Example #5
    def test8_prefix_lookup_dict(self):

        prefix_dict = {"someprefix": "http://www.w3.org/2000/01/rdf-schema#"}
        query = "SELECT DISTINCT ?we_need WHERE {<http://dbpedia.org/resource/Beer> someprefix:label ?we_need}"

        expected_result = pd.DataFrame({
            "we_need": [
                'Bier', 'ビール', 'Beer', 'جعة', 'Cerveza', 'Bière', 'Birra',
                'Bier', 'Piwo', 'Cerveja', 'Пиво', '啤酒'
            ]
        })

        result = endpoint_wrapper(query, DBpedia, prefix_lookup=prefix_dict)

        pd.testing.assert_frame_equal(result, expected_result, check_like=True)
Example #6
    def test1_pagesize(self):

        dbpedia = RemoteEndpoint("http://dbpedia.org/sparql/", page_size=1)
        query = "SELECT DISTINCT ?uri WHERE { ?uri rdfs:label ?label . filter(?label =\"Bayern\"@en)}"

        expected_result = pd.DataFrame({
            "uri": [
                "http://dbpedia.org/resource/Bayern",
                "http://www.wikidata.org/entity/Q255654",
                "http://www.wikidata.org/entity/Q4874432",
                "http://www.wikidata.org/entity/Q18148056"
            ],
        })

        result = endpoint_wrapper(query, dbpedia)

        pd.testing.assert_frame_equal(result, expected_result, check_like=True)
Example #7
def relational_matching(df,
                        endpoints=[DBpedia, WikiData],
                        uri_data_model=False,
                        match_score=1,
                        progress=True,
                        caching=True):
    """Creates a mapping of matching attributes in the schema by checking for
    owl:sameAs, owl:equivalentClass, owl:Equivalent and wdt:P1628 links between 
    them.

    Args:
        df (pd.DataFrame): Dataframe where matching attributes are supposed to 
            be found.
        endpoints (list, optional): SPARQL endpoints to be queried. Defaults to
            [DBpedia, WikiData].
        uri_data_model (bool, optional): If enabled, the URI is directly queried
            instead of a SPARQL endpoint. Defaults to False.
        match_score (int, optional): Score of the match: 0 < match_score <= 1. 
            Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column with
        the score, which is match_score (1 by default) for matched attribute
        pairs and 0 otherwise.
        
    """

    matches = pd.DataFrame(columns=["uri_1", "uri_2", "value"])

    # determine attribute columns
    cat_cols = [col for col in df.columns if re.findall("http:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    if not cat_cols:
        return matches
    # transform attributes to sparql values list form
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # formulate query
        query = "PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
        query += "SELECT ?value ?object WHERE {VALUES (?value) { (<**URI**>)}"
        query += " ?value\
             (owl:equivalentProperty|owl:equivalentClass|owl:sameAs|wdt:P1628)\
                  ?object. }"

        temp_df = pd.DataFrame(cat_cols_stripped, columns=["values"])
        same_cats = uri_querier(temp_df,
                                "values",
                                query,
                                caching=caching,
                                progress=progress)

        if same_cats.empty:
            return matches
        else:
            same_cats = same_cats.drop(
                same_cats[same_cats["value"] == same_cats["object"]].index)

    else:
        if not isinstance(endpoints, list):
            endpoints = [endpoints]

        same_cats = pd.DataFrame(columns=["value", "object"])

        for endpoint in endpoints:

            # formulate query
            query = "PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
            query += "SELECT  ?value ?object WHERE {VALUES (?value) {"
            query += values
            query += "} ?value\
                 (owl:equivalentProperty|owl:equivalentClass|owl:sameAs|wdt:P1628)\
                      ?object. }"

            # query the equivalent classes/properties
            query_result = endpoint_wrapper(query, endpoint, caching=caching)
            if not query_result.empty:
                query_result = query_result.drop_duplicates().\
                    reset_index(drop=True)

            # group equivalent classes/properties for each original attribute
            same_cats = pd.concat([same_cats, query_result],
                                  ignore_index=True)

    if same_cats.empty:
        return matches

    combinations = list(itertools.combinations(cat_cols_stripped, 2))
    combinations_sorted = pd.DataFrame([sorted(x) for x in combinations],
                                       columns=["uri_1", "uri_2"])

    # detect matches in the attributes
    for _, row in same_cats.iterrows():
        if row["object"] in cat_cols_stripped:
            # if there is a match insert it in alphabetical order into the
            # output matches dataframe
            new_match = {
                "uri_1": min(row["value"], row["object"]),
                "uri_2": max(row["value"], row["object"]),
                "value": match_score
            }
            matches = pd.concat([matches, pd.DataFrame([new_match])],
                                ignore_index=True)

    matches = matches.drop_duplicates()
    full_matches = combinations_sorted.merge(matches,
                                             on=["uri_1", "uri_2"],
                                             how="outer")
    full_matches["value"] = np.where(full_matches["value"].isna(), 0,
                                     full_matches["value"])

    return full_matches
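A minimal usage sketch for relational_matching (hypothetical data; assumes the function and the DBpedia/WikiData endpoint objects defined in the host package are in scope):

import pandas as pd

# Hypothetical input: generated columns whose names embed the full attribute
# URIs, which is what relational_matching scans for ("http:" in the name).
df = pd.DataFrame({
    "uri_http://dbpedia.org/ontology/populationTotal": [83166711],
    "uri_http://www.wikidata.org/prop/direct/P1082": [83166711],
})

# One row per attribute pair; "value" is match_score for linked pairs, else 0.
matches = relational_matching(df, endpoints=[DBpedia, WikiData])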
Example #8
def custom_sparql_generator(df,
                            link_attribute,
                            query,
                            endpoint=DBpedia,
                            progress=True,
                            attribute_generation_strategy="first",
                            prefix_lookup=False,
                            caching=True):
    """This generator issues a custom SPARQL query and creates additional 
    attributes from the query results.

    Args:
        df (pd.DataFrame): Dataframe to which links are added
        link_attribute (str): Name of column containing the link to the 
            knowledge graph.
        query (str): Custom SPARQL query which returns attributes to be 
            appended.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried. Defaults
            to DBpedia.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults to
            True.
        attribute_generation_strategy (str, optional): Strategy for generating
            attributes from the query results; the current implementation keeps
            only the first result row per entity ("first"). Defaults to
            "first".
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the query results.
    """

    variable = re.search(r"\*.*\*", query).group().replace("*", "")

    var_index = df.columns.get_loc(variable)

    df_result = pd.DataFrame()

    if progress:
        iterator = tqdm(df.iterrows(), total=df.shape[0], desc="Row")
    else:
        iterator = df.iterrows()

    for row in iterator:

        query_temp = re.sub(r"\*.*\*", "<" + str(row[1].iloc[var_index]) + ">",
                            query)

        df_temp = pd.DataFrame([row[1].iloc[var_index]],
                               columns=["link_attribute"])

        df_temp = pd.concat([
            df_temp,
            endpoint_wrapper(query_temp, endpoint, caching=caching).head(1)
        ],
                            axis=1)

        df_result = pd.concat([df_result, df_temp],
                              ignore_index=True,
                              sort=True)

    df = pd.merge(df,
                  df_result.drop_duplicates(),
                  left_on=link_attribute,
                  right_on="link_attribute",
                  how="left")
    df.drop("link_attribute", axis=1, inplace=True)

    return df
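A minimal usage sketch for custom_sparql_generator (hypothetical data and query; the placeholder between asterisks must name the dataframe column holding the URI, as required by the substitution above):

import pandas as pd

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Berlin"]})

# "*uri*" is replaced row by row with "<http://...>" before the query is
# issued; only the first result row per URI is kept.
query = ("SELECT ?population WHERE { *uri* "
         "<http://dbpedia.org/ontology/populationTotal> ?population }")

df_extended = custom_sparql_generator(df, link_attribute="uri", query=query)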
Example #9
def specific_relation_generator(
        df,
        columns,
        endpoint=DBpedia,
        uri_data_model=False,
        progress=True,
        direct_relation="http://purl.org/dc/terms/subject",
        hierarchy_relation=None,
        max_hierarchy_depth=1,
        prefix_lookup=False,
        caching=True):
    """Creates attributes from a specific direct relation. Additionally, it is
    possible to append a hierarchy with a user-defined hierarchy relation.

    Args:
        df (pd.DataFrame): The dataframe to extend.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly queried
            instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults 
            to True.
        direct_relation (str, optional): Direct relation used to create
            features. Defaults to "http://purl.org/dc/terms/subject".
        hierarchy_relation (str, optional): Hierarchy relation used to connect 
            categories, e.g. http://www.w3.org/2004/02/skos/core#broader. 
            Defaults to None.
        max_hierarchy_depth (int, optional): Maximal number of hierarchy steps
            taken. Defaults to 1.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: The dataframe with additional features.
    """

    df = df.copy()

    if hierarchy_relation:
        hierarchy_relation = re.sub(r"^.*?https://", "http://",
                                    hierarchy_relation)
        hierarchy = nx.DiGraph()

    direct_relation = re.sub(r"^.*?https://", "http://", direct_relation)

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    if df[columns].isna().all().item():
        return df

    #  iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:
            # Create Sparql Query
            values = "(<" + df[col].str.cat(sep=">) (<") + ">) "
            query = "SELECT  ?value ?object "
            query += " WHERE {VALUES (?value) {" + values
            query += "} ?value (<" + direct_relation + ">) ?object. }"

            # Retrieve query results from endpoint
            query_result = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup, caching=caching).\
                    drop_duplicates().reset_index(drop=True)
        else:
            # Create URI Query
            query = "SELECT ?value ?object WHERE {VALUES (?value) {(<**URI**>)}"
            query += " ?value (<" + direct_relation + ">) ?object. }"

            query_result = uri_querier(df,
                                       col,
                                       query,
                                       prefix_lookup=prefix_lookup,
                                       progress=progress,
                                       caching=caching)

        # delete empty columns (for example when hierarchy relation returns
        # nothing)
        query_result = query_result.dropna(how="all", axis=1)

        # check if there are valid results; if not, skip this column
        if query_result.empty:
            continue

        # extract hierarchy
        if hierarchy_relation:
            hierarchy_col = hierarchy_graph_generator(
                query_result["object"],
                hierarchy_relation=hierarchy_relation,
                max_hierarchy_depth=max_hierarchy_depth,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)
            hierarchy = nx.compose(hierarchy, hierarchy_col)

        query_grouped = query_result.groupby("value")["object"].apply(list)

        # bundle the unique new features
        new_cols = pd.Series(query_grouped.values.sum()).unique()
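        # note: summing a Series of lists concatenates them, so this yields
        # the de-duplicated set of all returned objects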

        # create shape of result dataframe to fill
        df_to_append = pd.DataFrame(columns=new_cols)
        df_to_append["value"] = query_grouped.index

        # check for each URI if it belongs to the category and tick True/False
        for row, new_col in itertools.product(df_to_append.index, new_cols):
            df_to_append.loc[row, new_col] = np.where(
                new_col in query_grouped[df_to_append.loc[row, "value"]], True,
                False).item()

        # merge the new column with the original dataframe
        df_to_append.rename({"value": col}, axis=1, inplace=True)
        df = pd.merge(df, df_to_append, how="left", on=col)

        # rename columns
        if new_cols.any():
            df.columns = [
                col + "_in_boolean_" + name if name in new_cols else name
                for name in df.columns
            ]

    # append hierarchy to df as attribute, this will generate a warning but
    # works
    if hierarchy_relation:
        df.attrs = {"hierarchy": hierarchy}

    return df
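A minimal usage sketch for specific_relation_generator (hypothetical data; assumes the function and the DBpedia endpoint object are in scope):

import pandas as pd

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Beer"]})

# Creates one boolean column per dcterms:subject category, named
# "uri_in_boolean_<category URI>"; the category hierarchy built via
# skos:broader is attached as df_extended.attrs["hierarchy"].
df_extended = specific_relation_generator(
    df,
    columns="uri",
    direct_relation="http://purl.org/dc/terms/subject",
    hierarchy_relation="http://www.w3.org/2004/02/skos/core#broader",
    max_hierarchy_depth=1)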
Example #10
def qualified_relation_generator(df,
                                 columns,
                                 endpoint=DBpedia,
                                 uri_data_model=False,
                                 progress=True,
                                 prefix="Link",
                                 direction="Out",
                                 properties_regex_filter=None,
                                 types_regex_filter=None,
                                 result_type="boolean",
                                 hierarchy=False,
                                 prefix_lookup=False,
                                 caching=True):
    """Qualified relation generator considers not only relations, but also the 
    related types, adding boolean, counts, relative counts or tfidf-values 
    features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults to
            "Link".
        direction (str, optional): Direction of the properties to consider:
            incoming ("In") or outgoing ("Out"). Defaults to "Out".
        properties_regex_filter (str, optional): Regular expression for
            filtering properties. Defaults to None.
        types_regex_filter (str, optional): Regular expression for filtering
            types. Defaults to None.
        result_type (str, optional): States whether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        hierarchy (bool, optional): If True, a hierarchy of all superclasses of 
            the returned types is attached to the resulting dataframe. Defaults 
            to False.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of
        properties to the knowledge graph.
    """

    df = df.copy()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    #convert columns to list to enable iteration
    if not isinstance(columns, list):

        columns = [columns]

    #iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:

            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            if direction == "Out":

                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {" + values + "} ?value ?p ?o. ?o rdf:type ?type. "

            elif direction == "In":

                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {" + values + "} ?s ?p ?value. ?s rdf:type ?type. "

            if properties_regex_filter != None:

                regex_string = regex_string_generator("?p",
                                                      properties_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            if types_regex_filter != None:

                regex_string = regex_string_generator("?type",
                                                      types_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            if direction == "Out":

                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?o. ?o rdf:type ?type. "

            elif direction == "In":

                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {(<**URI**>)} ?s ?p ?value. ?s rdf:type ?type. "

            if properties_regex_filter != None:

                regex_string = regex_string_generator("str(?p)",
                                                      properties_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            if types_regex_filter != None:

                regex_string = regex_string_generator("str(?type)",
                                                      types_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = uri_querier(df,
                                    col,
                                    query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress,
                                    caching=caching)

    # proceed only if the (last) query returned a non-empty dataframe
    if not result_df.empty:
        if hierarchy:

            hierarchy_col = hierarchy_graph_generator(
                result_df["type"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)

            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        result_df[
            "link_with_type"] = result_df["p"] + "_type_" + result_df["type"]

        result_df = result_df[["value", "link_with_type"]]

        result_df_dummies = result_df.join(
            result_df["link_with_type"].str.get_dummies()).drop(
                "link_with_type", axis=1)

        result_df = get_result_df(
            result_df_dummies, result_type,
            prefix + "_" + direction + "_" + result_type + "_", df, columns)

    if hierarchy:
        # append hierarchy to df as attribute, this will generate a warning but works
        result_df.attrs = {"hierarchy": hierarchyGraph}

    return result_df
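A minimal usage sketch for qualified_relation_generator (hypothetical data; result columns follow the "<prefix>_<direction>_<result_type>_<property>_type_<type>" pattern built above):

import pandas as pd

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Berlin"]})

result = qualified_relation_generator(df,
                                      columns="uri",
                                      direction="Out",
                                      result_type="counts")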
Example #11
def unqualified_relation_generator(df,
                                   columns,
                                   endpoint=DBpedia,
                                   uri_data_model=False,
                                   progress=True,
                                   prefix="Link",
                                   direction="Out",
                                   regex_filter=None,
                                   result_type="boolean",
                                   prefix_lookup=False,
                                   caching=True):
    """Unqualified relation generator creates attributes from the existence of 
    relations and adds boolean, counts, relative counts or tfidf-values features
    for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults to
            "Link".
        direction (str, optional): Direction of the properties to consider:
            incoming ("In") or outgoing ("Out"). Defaults to "Out".
        regex_filter (str, optional): Regular expression for filtering
            properties. Defaults to None.
        result_type (str, optional): States whether the results should be
            boolean ("boolean"), counts ("counts"), relative counts
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of 
        properties to the knowledge graph
    """

    df = df.copy()

    #convert columns to list to enable iteration
    if not isinstance(columns, list):

        columns = [columns]

    #iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:

            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            if direction == "Out":

                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) {" + values + "} ?value ?p ?o "

            elif direction == "In":

                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) {" + values + "} ?s ?p ?value "

            if regex_filter != None:

                regex_string = regex_string_generator("?p", regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            if direction == "Out":

                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) { (<**URI**>)} ?value ?p ?o "

            elif direction == "In":

                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) { (<**URI**>)} ?s ?p ?value "

            if regex_filter != None:

                regex_string = regex_string_generator("str(?p)", regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = uri_querier(df,
                                    col,
                                    query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress,
                                    caching=caching)

    # proceed only if the (last) query returned a non-empty dataframe
    if not result_df.empty:

        result_df_dummies = result_df.join(
            result_df["p"].str.get_dummies()).drop("p", axis=1)

        result_df = get_result_df(
            result_df_dummies, result_type,
            prefix + "_" + direction + "_" + result_type + "_", df, columns)

    return result_df
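A minimal usage sketch for unqualified_relation_generator (hypothetical data; result columns follow the "<prefix>_<direction>_<result_type>_<property>" pattern):

import pandas as pd

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Berlin"]})

result = unqualified_relation_generator(df,
                                        columns="uri",
                                        direction="In",
                                        result_type="boolean")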
Example #12
def sameas_linker(
    df, column, new_attribute_name="new_link", progress=True, endpoint=DBpedia, 
    result_filter=None, uri_data_model=False, bundled_mode=True, 
    prefix_lookup=False, caching=True):
    """Function that takes URIs from a column of a DataFrame and queries a
    given SPARQL endpoint for ressources which are connected to these URIs via
    owl:sameAs. Found ressources are added as new columns to the dataframe and
    the dataframe is returned.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column for whose entities links should be
            found.
        new_attribute_name (str, optional): Name / prefix of the column(s)  
            containing the found links. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as strings) 
            to filter the results. Defaults to None.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method); requires a
            SPARQL 1.1 implementation. Defaults to True.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing the
        found resources.
    """

    df = df.copy()

    if bundled_mode and not uri_data_model:

        values = " ( <"+df[column].str.cat(sep="> ) ( <")+"> ) "

        query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {" + \
            values+"} ?value owl:sameAs ?sameas_uris . "

        if result_filter != None:

            query = query + \
                "FILTER("+regex_string_generator("?sameas_uris", result_filter)+") "

        query = query+"}"

        result_df = endpoint_wrapper(
            query, endpoint, prefix_lookup=prefix_lookup, caching=caching).drop_duplicates()

    else:

        result_df = pd.DataFrame()

        if uri_data_model:

            query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {(<**URI**>)} ?value owl:sameAs ?sameas_uris . "

            if result_filter != None:

                query = query + \
                    "FILTER("+regex_string_generator("str(?sameas_uris)",
                                                     result_filter)+") "

            query = query+"}"

            result_df = uri_querier(
                df, column, query, prefix_lookup=prefix_lookup, progress=progress, caching=caching)

        else:

            if progress:
                iterator = tqdm(df[column].items(), total=df.shape[0])
            else:
                iterator = df[column].items()

            for uri in iterator:

                if pd.isna(uri[1]):

                    pass

                else:

                    query = " SELECT DISTINCT ?value ?sameas_uris WHERE {?value owl:sameAs ?sameas_uris. FILTER (?value = <"+uri[
                        1]+">"

                    if result_filter != None:

                        query = query + \
                            " && ("+regex_string_generator("?sameas_uris",
                                                           result_filter)+")"

                    query = query+") }"

                    result = endpoint_wrapper(query, endpoint, prefix_lookup=prefix_lookup, caching=caching)

                    result_df = pd.concat([result_df, result])

        result_df = result_df.rename(
            {"callret-0": "value"}, axis="columns").drop_duplicates().reset_index(drop=True)

    if result_df.empty:

        df[new_attribute_name+"_1"] = np.nan

        return df

    else:

        result_df_grouped = result_df.groupby("value")

        result_df_grouped = result_df_grouped["sameas_uris"].apply(
            lambda x: pd.Series(x.values)).unstack()
        result_df_grouped = result_df_grouped.rename(
            columns={i: new_attribute_name+"_{}".format(i + 1) for i in range(result_df_grouped.shape[1])})

        df = pd.merge(df, result_df_grouped, left_on=column,
                      right_on="value", how="outer")

        return df
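A minimal usage sketch for sameas_linker (hypothetical data; assumes the function and endpoint object are in scope):

import pandas as pd

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Bavaria"]})

# Adds columns same_as_1, same_as_2, ... holding the owl:sameAs resources;
# result_filter keeps only URIs matching one of the given regexes.
df_linked = sameas_linker(df,
                          column="uri",
                          new_attribute_name="same_as",
                          result_filter=["wikidata.org"])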
Example #13
    def test5_wrong_endpointtype(self):

        with pytest.raises(TypeError):
            endpoint_wrapper("test_query", "http://dbpedia.org/sparql/")
Example #14
def label_schema_matching(df,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          to_lowercase=True,
                          remove_prefixes=True,
                          remove_punctuation=True,
                          prefix_threshold=1,
                          progress=True,
                          caching=True):
    """A schema matching method by checking for attribute -- rdfs:label between 
    links.

    Args:
        df (pd.DataFrame): The dataframe where matching attributes are supposed 
            to be found.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried. Defaults 
            to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        to_lowercase (bool, optional): Converts queried strings to lowercase.
            Defaults to True.
        remove_prefixes (bool, optional): Removes prefixes of queried strings.
            Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from queried
            strings. Defaults to True.
        prefix_threshold (int, optional): The number of occurrences after which
            a prefix is considered "common". Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column
        ("same_label") indicating whether their labels overlap (1) or not (0).
    """

    matches = pd.DataFrame(columns=["uri_1", "uri_2", "same_label"])

    # Get URIs from the column names
    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # transform attributes to sparql values list form
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # Query these URIs for the label
        query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"
        labels = uri_querier(
            pd.DataFrame(cat_cols_stripped),
            0,
            query,
            progress=progress,
            caching=caching).drop_duplicates().set_index("value")

    else:

        query = "SELECT ?value ?o WHERE {VALUES (?value) {" + values + \
            "} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"

        # query the equivalent classes/properties
        labels = endpoint_wrapper(query, endpoint,
                                  caching=caching).reset_index(drop=True)

    if labels.empty:
        return matches

    # Get common prefixes

    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)
    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes, remove_punctuation))

    # Create a dictionary
    if labels.index.name == "value":
        labels.reset_index(inplace=True)

    labels_dict = labels.set_index("value").T.to_dict("list")

    #check if there are no matches
    tmp = set()
    for v in labels_dict.values():
        tmp.update(v)
    if len(labels_dict) == len(tmp):
        combinations = list(itertools.combinations(cat_cols_stripped, 2))
        combinations_sorted = [sorted(x) for x in combinations]

        matches = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])
        matches["same_label"] = 0

        return matches

    else:
        # Combine the uris that have the same labels into a DataFrame
        new_labels_dict = collections.defaultdict(list)
        for key, values in labels_dict.items():
            for i in values:
                new_labels_dict[i].append(key)

        df_labels = pd.DataFrame(list(new_labels_dict.values()),
                                 columns=["uri_1", "uri_2"])
        #df_labels["same_label"] = pd.DataFrame(list(new_labels_dict.keys()))
        df_labels.dropna(inplace=True)

        # restrict the order of uris in one row
        for _, row in df_labels.iterrows():
            new_match = {
                "uri_1": min(row["uri_1"], row["uri_2"]),
                "uri_2": max(row["uri_1"], row["uri_2"]),
                "same_label": 1
            }
            matches = pd.concat([matches, pd.DataFrame([new_match])],
                                ignore_index=True)

        # Add back the URIs that returned no rdfs:label and turn the df into a dict
        no_label = pd.DataFrame({
            "value":
            [x for x in cat_cols_stripped if x not in list(labels["value"])],
            "o":
            np.nan
        })
        labels = pd.concat([labels, no_label], ignore_index=True)

        full_labels_dict = labels.set_index("value").T.to_dict("list")

        # Create all unique combinations from the URIs, order them alphabetically and turn them into a DataFrame
        combinations = list(itertools.combinations(full_labels_dict.keys(), 2))
        combinations_sorted = [sorted(x) for x in combinations]

        result = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])

        # merge with the non-matched combinations and drop duplicates
        for _, row in result.iterrows():
            new_match = {
                "uri_1": min(row["uri_1"], row["uri_2"]),
                "uri_2": max(row["uri_1"], row["uri_2"]),
                "same_label": 0
            }
            matches = pd.concat([matches, pd.DataFrame([new_match])],
                                ignore_index=True)

        matches.drop_duplicates(subset=["uri_1", "uri_2"],
                                inplace=True,
                                ignore_index=True)

        return matches
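A minimal usage sketch for label_schema_matching (hypothetical data; the column names must embed the attribute URIs, as extracted by the regex above):

import pandas as pd

df = pd.DataFrame({
    "uri_http://dbpedia.org/ontology/populationTotal": [83166711],
    "uri_http://www.wikidata.org/prop/direct/P1082": [83166711],
})

# One row per attribute pair; same_label is 1 if the cleaned English
# rdfs:labels of the two attributes overlap, else 0.
matches = label_schema_matching(df)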
Example #15
def data_properties_generator(df,
                              columns,
                              endpoint=DBpedia,
                              uri_data_model=False,
                              progress=True,
                              type_filter=None,
                              regex_filter=None,
                              bundled_mode=True,
                              prefix_lookup=False,
                              caching=True):
    """Generator that takes a dataset with a link to a knowledge graph and 
    creates a new feature for each data property of the given resource.

    Args:
        df (pd.DataFrame): Dataframe to which the features will be added
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): Base string to the knowledge graph; 
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        type_filter (str, optional): Property datatype to be selected from 
            results (e.g. xsd:string). If a specific datatype should be
            excluded a "- " needs to be prepended (e.g. - xsd:string). Defaults
            to None.
        regex_filter (str, optional): Regular expression for filtering 
            properties. Defaults to None.
        bundled_mode (bool, optional): If True, all necessary queries are
            bundled into one query (using the VALUES method); requires a
            SPARQL 1.1 implementation. Defaults to True.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with a new column for each property.
    """

    df = df.copy()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Prepare Type Filter Statement (Decode Include/Exclude)

    if type_filter != None:

        if type_filter[0:2] == "- ":
            type_filter_str = " && DATATYPE(?v) != " + type_filter[2:]

        else:
            type_filter_str = " && DATATYPE(?v) = " + type_filter

    # Create SPARQL query for each user-specified column

    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if bundled_mode and not uri_data_model:

            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            query = "SELECT ?value ?p ?v WHERE {VALUES (?value) {" + \
                values + "} ?value ?p ?v FILTER(isLITERAL(?v)"

            if type_filter != None:

                query = query + type_filter_str

            if regex_filter != None:

                query = query + " && regex(?p, \"" + regex_filter + "\")"

            query = query + ")}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            result_df = pd.DataFrame()

            if uri_data_model:

                query = "SELECT DISTINCT ?value ?p ?v WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?v FILTER(isLITERAL(?v)"

                if type_filter != None:

                    query = query + type_filter_str

                if regex_filter != None:

                    query = query + " && regex(?p, \"" + regex_filter + "\")"

                query = query + ")}"

                result_df = uri_querier(df,
                                        col,
                                        query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress,
                                        caching=caching)

            else:
                for uri in df[col].items():

                    if pd.notna(uri[1]):

                        query = "SELECT DISTINCT ?value ?p ?v WHERE {?value ?p ?v . FILTER (?value = <" + \
                            uri[1]+"> && (isLITERAL(?v))"

                        if type_filter != None:

                            query = query + type_filter_str

                        if regex_filter != None:

                            query = query + " && regex(?p, \"" + regex_filter + "\")"

                        query = query + ")} "

                        result = endpoint_wrapper(query,
                                                  endpoint,
                                                  prefix_lookup=prefix_lookup,
                                                  caching=caching)

                        result_df = pd.concat([result_df, result])

                    else:
                        pass

        if result_df.empty:

            pass

        else:

            # Results are pivoted into a wide dataframe (rows: looked-up uris;
            # columns: data properties); if a property has several values, one
            # is picked at random

            result_df["p"] = col + "_data_" + result_df["p"]

            # transform values into new columns

            result_df = result_df.pivot_table(values="v",
                                              index="value",
                                              columns="p",
                                              aggfunc=np.random.choice)

            # append properties to dataframe

            df = pd.merge(df,
                          result_df,
                          how="left",
                          left_on=col,
                          right_on="value")

    return df
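A minimal usage sketch for data_properties_generator (hypothetical data; new columns are named "<col>_data_<property URI>" as built above):

import pandas as pd

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Beer"]})

# Keep only integer-typed literals of DBpedia-ontology properties.
df_extended = data_properties_generator(df,
                                        columns="uri",
                                        type_filter="xsd:integer",
                                        regex_filter="dbpedia.org/ontology")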
Example #16
def label_linker(
    df, column, new_attribute_name="new_link", progress=True, endpoint=DBpedia, result_filter=None,
    language="en", max_hits=1, label_property="rdfs:label",prefix_lookup=False, caching=True):
    """Label Linker takes attributes from a column and adds a new column with
    the respective knowledge graph links based on the provided label_property
    (rdfs:label by default).

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column whose entities should be found.
        new_attribute_name (str, optional): Name of column containing the link 
            to the knowledge graph. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        endpoint (Endpoint, optional): Choose SPARQL endpoint connection. 
            Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as strings) 
            to filter the results. Defaults to None.
        language (str, optional): Restrict search to labels with a certain
            language tag. Set to None if no restriction is needed. Defaults to
            "en".
        max_hits (int, optional): Maximal number of URIs that should be
            returned per entity. Defaults to 1.
        label_property (str, optional): Specifies the label property that
            should be used in the query. Defaults to "rdfs:label".
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.
            
    Returns:
        pd.DataFrame: Dataframe with a new column containing the links to the
        knowledge graph.
    """

    df = df.copy()

    result_df = pd.DataFrame()

    if progress:
        iterator = tqdm(df[column].items(), total=df.shape[0])
    else:
        iterator = df[column].items()

    for col in iterator:

        if not pd.isnull(col[1]):
            query = "SELECT DISTINCT ?label ?uri WHERE { ?uri "+label_property+" ?label . filter"

            if language != None:

                query = query + "(?label =\"" + col[1] + "\"@" + language

            else:

                query = query + "(str(?label) =\"" + col[1] + "\""
                
            if result_filter != None:

                query = query + \
                        " && ("+regex_string_generator("?uri",
                                                        result_filter)+")"

            query = query + ")}"
            
            if max_hits:
                query = query + " LIMIT " + str(max_hits)

            result = endpoint_wrapper(query, endpoint, prefix_lookup=prefix_lookup, caching=caching)
            result_df = pd.concat([result_df, result])

    result_df = result_df.reset_index(drop=True)

    if result_df.empty:

        df[new_attribute_name+"_1"] = np.nan

        return df

    else:

        result_df_grouped = result_df.groupby("label")["uri"].apply(
            lambda x: pd.Series(x.values)).unstack()
        result_df_grouped = result_df_grouped.rename(
            columns={i: new_attribute_name+"_{}".format(i + 1) for i in range(result_df_grouped.shape[1])})
        result_df_grouped = result_df_grouped.reset_index()

        df = pd.merge(df, result_df_grouped.drop_duplicates(), left_on=column,
                      right_on="label", how="outer").drop("label", axis=1)

    return df
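A minimal usage sketch for label_linker (hypothetical data; assumes the function and endpoint object are in scope):

import pandas as pd

df = pd.DataFrame({"city": ["Berlin", "Hamburg"]})

# Looks up URIs whose English rdfs:label equals each cell value and adds
# them as uri_1 (and uri_2, given max_hits=2).
df_linked = label_linker(df,
                         column="city",
                         new_attribute_name="uri",
                         language="en",
                         max_hits=2)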
Example #17
def direct_type_generator(df,
                          columns,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          progress=True,
                          prefix="",
                          regex_filter=None,
                          result_type="boolean",
                          bundled_mode=True,
                          hierarchy=False,
                          prefix_lookup=False,
                          caching=True):
    """Generator that takes a dataset with (a) link(s) to a knowledge graph and
    queries the type(s) of the linked ressources (using rdf:type). The
    resulting types are added as new columns, which are filled either with a
    boolean indicator or a count.

    Args:
        df (pd.DataFrame): Dataframe to which types are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults 
            to True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults to 
            "".
        regex_filter (list, optional): A list filled with regexes (as strings) 
            to filter the results. Defaults to None.
        result_type (str, optional): States whether the results should be 
            boolean ("boolean"), counts ("counts"), relative counts 
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        bundled_mode (bool, optional): If True, all necessary queries are 
            bundled into one query (using the VALUES method); requires a 
            SPARQL 1.1 implementation. Defaults to True.
        hierarchy (bool, optional): If True, a hierarchy of all superclasses of 
            the returned types is attached to the resulting dataframe. Defaults 
            to False.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing the 
        found types.
    """

    df = df.copy()

    final_result_df = pd.DataFrame()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Create SPARQL query (based on rdf:type) for each user-specified column

    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for column in iterator:

        # If bundled_mode is selected, all necessary queries for a column are bundled into one query (using the VALUES method) -> much faster, but less compatible.

        if bundled_mode and not uri_data_model:

            values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "

            query = prefix + \
                " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {" + \
                values+"} ?value rdf:type ?types . "

            if regex_filter != None:

                regex_string = regex_string_generator("?types", regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            result_df = pd.DataFrame()

            if uri_data_model:

                query = prefix + \
                    " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {(<**URI**>)} ?value rdf:type ?types . "

                if regex_filter != None:

                    regex_string = regex_string_generator(
                        "str(?types)", regex_filter)

                    query = query + "FILTER(" + regex_string + ") "

                query = query + "}"

                result_df = uri_querier(df,
                                        column,
                                        query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress,
                                        caching=caching)

            else:

                for uri in df[column].items():

                    if pd.notna(uri[1]):

                        query = prefix + \
                            " SELECT DISTINCT ?value ?types WHERE {?value rdf:type ?types . FILTER (?value = <" + \
                            uri[1]+">"

                        if regex_filter != None:

                            query = query + " && (" + regex_string_generator(
                                "?types", regex_filter) + ")"

                        query = query + ") }"

                        result = endpoint_wrapper(query,
                                                  endpoint,
                                                  prefix_lookup=prefix_lookup,
                                                  caching=caching)

                        result_df = pd.concat([result_df, result])

                    else:
                        pass

            result_df = result_df.rename(
                {
                    "callret-0": "value"
                }, axis="columns").drop_duplicates().reset_index(drop=True)

        if hierarchy:
            hierarchy_col = hierarchy_graph_generator(
                result_df["types"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)

            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        if result_df.empty:

            result_columns = []

        else:

            # Results are transformed to a sparse dataframe (rows: looked-up uris; columns: types) with dummy-encoding (0/1) -> Each result is one row

            result_df_dummies = result_df.join(
                result_df.types.str.get_dummies()).drop("types", axis=1)

            # Sparse dataframe is grouped by uri

            result_df_grouped = result_df_dummies.groupby("value").sum()
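            # e.g. two result rows (u, TypeA) and (u, TypeB) for the same URI u
            # collapse into a single row: value=u, TypeA=1, TypeB=1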

            # Result columns get prefix (format depends on single or multiple columns)

            if len(columns) > 1:

                result_df_grouped = result_df_grouped.add_prefix("type_")

            else:

                result_df_grouped = result_df_grouped.add_prefix(column +
                                                                 "_type_")

            # Results are merged onto the queried columns, which serve as row identifiers

            result_df_merged = pd.merge(df[columns],
                                        result_df_grouped,
                                        left_on=column,
                                        right_on="value",
                                        how="outer").drop_duplicates()

            # If multiple columns with URIs are looked up: Current results are merged with the results of previous passes of the loop

            final_result_df = pd.concat([final_result_df, result_df_merged],
                                        sort=False).groupby(
                                            columns,
                                            dropna=False).sum().reset_index()

            # Result columns are determined and converted to the correct dtype

            result_columns = list(
                set(list(final_result_df.columns)) - set(columns))

            final_result_df[result_columns] = final_result_df[
                result_columns].astype("int64")

    if not final_result_df.empty:

        # If result_type is "boolean", all values greater than 0 are changed to True, all others to False

        if result_type == "boolean":

            final_result_df[result_columns] = final_result_df[
                result_columns].astype("bool")

        # If result_type is "relative" or "tfidf", calculate the relative counts per row

        elif result_type in ["relative", "tfidf"]:

            # Calculate the relative counts by dividing each row by its sum, fillna(0) to replace missings created by division by zero (when sum=0)
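            # e.g. a row with type counts [2, 1, 1] becomes [0.5, 0.25, 0.25]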
            final_result_df_relative = final_result_df.copy()

            final_result_df_relative[result_columns] = final_result_df[
                result_columns].div(
                    final_result_df[result_columns].sum(axis=1),
                    axis=0).fillna(0)

            # If result_type is "tfidf", use the table of relative counts to create the table of tfidf-values

            if result_type == "tfidf":

                # Calculate idf values
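                # idf = log(N / n_t), where N is the number of rows and n_t the
                # number of rows in which type t occurs at least once; an inf
                # value (n_t = 0) is replaced by 0 below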

                N = len(final_result_df[result_columns])

                nt = final_result_df[result_columns][
                    final_result_df[result_columns] >= 1].count(axis=0)

                idf = np.log(N / nt).replace(np.inf, 0)

                # Multiply relative counts with idf values

                final_result_df_relative[
                    result_columns] = final_result_df_relative[
                        result_columns].multiply(idf, axis="columns")

            final_result_df = final_result_df_relative.copy()

        # Collected query-results get appended to the original dataframe

        df = pd.merge(df, final_result_df, on=columns, how="outer")

    if hierarchy:
        df.attrs = {"hierarchy": hierarchyGraph}

    return df
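
# A minimal usage sketch (not part of the original source). The generator's
# own name is cut off in this excerpt, so "direct_type_generator" is an
# assumed name, and the regex_filter format is likewise an assumption; the
# remaining parameters mirror those referenced in the body above.
import pandas as pd

df_in = pd.DataFrame({
    "uri": ["http://dbpedia.org/resource/Munich",
            "http://dbpedia.org/resource/Berlin"]
})

# With result_type="boolean", one True/False column per rdf:type found for
# each URI is appended, restricted here to DBpedia ontology classes.
df_out = direct_type_generator(df_in, ["uri"],
                               regex_filter=["dbpedia.org/ontology"],
                               result_type="boolean")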
Ejemplo n.º 18
0
def hierarchy_graph_generator(
        col,
        hierarchy_relation="http://www.w3.org/2000/01/rdf-schema#subClassOf",
        max_hierarchy_depth=None,
        endpoint=DBpedia,
        uri_data_model=False,
        progress=False,
        caching=True):
    """Computes a hierarchy graph from an original set of features, where 
    directed edges symbolise a hierarchy relation from subclass to superclass.

    Args:
        col (pd.Series): The classes/categories for which the hierarchy graph
            is generated.
        hierarchy_relation (str, optional): The hierarchy relation to be used.
            Defaults to "http://www.w3.org/2000/01/rdf-schema#subClassOf".
        max_hierarchy_depth (int, optional): Number of jumps in hierarchy. If 
            None, transitive jumps are used. Defaults to None.
        endpoint (Endpoint, optional): Link to the SPARQL endpoint that should
            be queried. Defaults to DBpedia.
        uri_data_model (bool, optional): Whether to use the SPARQL querier or 
            the URI data model. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        nx.DiGraph: Graph where edges point to direct superclasses of
        nodes.
    """

    # warn if wrong configurations are used and correct them
    cond_subclass = hierarchy_relation ==\
         "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    if cond_subclass and max_hierarchy_depth:
        warnings.warn("""If you use subClass with a maximum hierarchy depth, 
        meaningless superclasses are generated. 
        Max_hierarchy_depth is set to None instead""")
        max_hierarchy_depth = None

    cond_broader = hierarchy_relation ==\
         "http://www.w3.org/2004/02/skos/core#broader"
    if cond_broader and max_hierarchy_depth is None:
        warnings.warn("""Transitive superclass generation does not work for
        categories. Max_hierarchy_depth is set to 1. For higher depths, set
        max_hierarchy_depth to a higher integer""")
        max_hierarchy_depth = 1

    # Initialise the graph
    DG = nx.DiGraph()
    # if column contains only missings return empty graph
    if col.isna().all():
        return DG
    current_level = col.copy()

    # in this case the query contains all future hierarchy levels and queries
    # them directly
    if max_hierarchy_depth and not uri_data_model:
        query = hierarchy_query_creator(col, hierarchy_relation,
                                        max_hierarchy_depth, uri_data_model)
        results = endpoint_wrapper(query,
                                   endpoint,
                                   return_XML=True,
                                   caching=caching)
        DG, _ = create_graph_from_raw(DG, results, max_hierarchy_depth, None,
                                      uri_data_model)

    # here the "broader" steps have to be added sequentially from level to
    # level until the max_hierarchy_depth is reached
    elif max_hierarchy_depth and uri_data_model:
        hierarchy_level = 0
        while not current_level.empty and hierarchy_level < max_hierarchy_depth:
            query = hierarchy_query_creator(current_level, hierarchy_relation,
                                            max_hierarchy_depth,
                                            uri_data_model)
            temp_frame = pd.DataFrame(current_level)
            results = uri_querier(temp_frame,
                                  current_level.name,
                                  query,
                                  progress=progress,
                                  caching=caching)

            current_level = list()
            DG, current_level = create_graph_from_raw(DG, results,
                                                      max_hierarchy_depth,
                                                      current_level,
                                                      uri_data_model)

            hierarchy_level += 1

    # iteratively loop from hierarchy level to hierarchy level until no
    # more superclasses are found --> transitive without maximum
    else:
        while not current_level.empty:
            query = hierarchy_query_creator(current_level, hierarchy_relation,
                                            max_hierarchy_depth,
                                            uri_data_model)
            if uri_data_model:
                temp_frame = pd.DataFrame(current_level)
                results = uri_querier(temp_frame,
                                      current_level.name,
                                      query,
                                      progress=progress,
                                      caching=caching)
            else:
                results = endpoint_wrapper(query,
                                           endpoint,
                                           return_XML=True,
                                           caching=caching)
            current_level = list()
            DG, current_level = create_graph_from_raw(DG, results,
                                                      max_hierarchy_depth,
                                                      current_level,
                                                      uri_data_model)

    # Find cycles and break them by removing the backward edge (cycle[1],
    # i.e. the second edge of a two-node cycle A -> B -> A); once no cycle
    # remains, the graph is a DAG and the loop terminates
    while not nx.is_directed_acyclic_graph(DG):
        try:
            cycle = nx.find_cycle(DG)
            backwards_path = cycle[1]
            DG.remove_edge(*backwards_path)
        except nx.NetworkXNoCycle:
            break

    return DG
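
# A minimal usage sketch (not part of the original source); the class URIs
# are illustrative and the resulting edges depend on the live DBpedia ontology.
import pandas as pd

types = pd.Series(["http://dbpedia.org/ontology/City",
                   "http://dbpedia.org/ontology/Country"], name="types")

# Builds the transitive subClassOf graph: edges point from each class to its
# direct superclasses (e.g. dbo:City -> dbo:Settlement -> ...).
dag = hierarchy_graph_generator(types)
print(list(dag.edges))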
Ejemplo n.º 19
0
def check_uri_redirects(
        df,
        column,
        replace=True,
        custom_name_postfix=None,
        redirection_property="http://dbpedia.org/ontology/wikiPageRedirects",
        endpoint=DBpedia,
        regex_filter="dbpedia",
        bundled_mode=True,
        uri_data_model=False,
        progress=True,
        caching=True):
    """Takes a column of URIs from a DataFrame and checks for each if it has a 
    redirection set by the endpoint. If this is the case, the URI it redirects 
    to is either added in a new column or replaces the original URI.

    Args:
        df (pd.DataFrame): Dataframe for which the URIs should be inspected.
        column (str): Name of the column that contains the URIs that should be 
            checked.
        replace (bool, optional): If True: URIs that get redirected will be 
            replaced with the new URI; If False: A new column, containing the 
            result for each URI, is added to the DataFrame. Defaults to True.
        custom_name_postfix (str, optional): Custom postfix for the newly 
            created column (in case "replace" is set to False). Defaults to None.
        redirection_property (str, optional): Relation/Property URI that 
            signals a redirect for this endpoint. Defaults to 
            "http://dbpedia.org/ontology/wikiPageRedirects".
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        regex_filter (str, optional): Only URIs matching the specified RegEx 
            are checked for redirects. Defaults to "dbpedia".
        bundled_mode (bool, optional): If True, all necessary queries are 
            bundled into one query (using the VALUES method). - Requires a 
            SPARQL 1.1 implementation!; ignored when "uri_data_model" = True. 
            Defaults to True.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Raises:
        ValueError: Raised if 'custom_name_postfix' is set to "" instead of 
            None.

    Returns:
        pd.DataFrame: Returns dataframe with cleaned links / a new column.
    """

    if custom_name_postfix == "":

        raise ValueError(
            "'custom_name_postfix' can't be an empty string. If you don't want to use a custom_name_postfix, please set the attribute to None"
        )

    df = df.copy()

    if bundled_mode and not uri_data_model:

        values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "

        query = "SELECT DISTINCT ?value ?redirect WHERE {VALUES (?value) {" + values + "} ?value <" + redirection_property + "> ?redirect . }"

        result_df = endpoint_wrapper(
            query, endpoint,
            caching=caching).drop_duplicates().reset_index(drop=True)

    else:

        result_df = pd.DataFrame()

        if uri_data_model:

            query = "SELECT DISTINCT ?value ?redirect WHERE {VALUES (?value) {(<**URI**>)} ?value <" + redirection_property + "> ?redirect . }"

            result_df = uri_querier(df,
                                    column,
                                    query,
                                    regex_filter=regex_filter,
                                    progress=progress,
                                    caching=caching)

        else:

            for uri in df[column].items():

                if pd.notna(uri[1]):

                    query = "SELECT DISTINCT ?value ?redirect WHERE {?value <" + redirection_property + "> ?redirect . FILTER (?value = <" + uri[
                        1] + ">) }"

                    result = endpoint_wrapper(query, endpoint, caching=caching)

                    result_df = pd.concat([result_df, result])

        result_df = result_df.rename({
            "callret-0": "value"
        }, axis="columns").drop_duplicates().reset_index(drop=True)

    if result_df.empty:

        return df

    else:

        if custom_name_postfix is None:

            new_attribute_name = column + "_redirect"

        else:

            new_attribute_name = column + custom_name_postfix

        result_df = pd.merge(df,
                             result_df,
                             how="left",
                             left_on=column,
                             right_on="value").drop("value", axis=1).rename(
                                 columns={"redirect": new_attribute_name})

        if replace:

            result_df.loc[(pd.isnull(result_df[new_attribute_name])),
                          new_attribute_name] = result_df[column]
            result_df.drop(column, axis=1, inplace=True)
            result_df.rename(columns={new_attribute_name: column},
                             inplace=True)

    return result_df
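
# A minimal usage sketch (not part of the original source); dbr:NYC is used
# purely as an illustration of a DBpedia redirect page.
import pandas as pd

df_uris = pd.DataFrame({"uri": ["http://dbpedia.org/resource/NYC"]})

# With replace=True a redirected URI overwrites the original in place; with
# replace=False a new "uri_redirect" column would be added instead.
df_clean = check_uri_redirects(df_uris, "uri", replace=True)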