Example #1
    def test3_nocaching(self):

        # Not all attributes (here "adresse") are present for all of the URIs.

        input = pd.DataFrame({
            'uris': [
                'http://dbpedia.org/resource/Berlin',
                'http://dbpedia.org/resource/Darmstadt',
                'http://dbpedia.org/resource/London',
                'http://dbpedia.org/resource/Munich'
            ]
        })

        query = "SELECT DISTINCT ?uri ?adresse ?lat WHERE {VALUES (?uri) {(<**URI**>)} ?uri dbp:adresse ?adresse. ?uri geo:lat ?lat} ORDER BY ?adresse LIMIT 2"

        result = uri_querier(input, "uris", query, caching=False)

        expected_result_df = pd.DataFrame({
            'uri': [
                'http://dbpedia.org/resource/Darmstadt',
                'http://dbpedia.org/resource/Darmstadt',
                'http://dbpedia.org/resource/Munich',
                'http://dbpedia.org/resource/Munich'
            ],
            'adresse': ['Luisenplatz 5', '64283', 'Marienplatz 8', '80331'],
            'lat': [49.8667, 49.8667, 48.1333, 48.1333]
        })

        pd.testing.assert_frame_equal(result,
                                      expected_result_df,
                                      check_like=True)
Example #2
    def test4_brokenuris(self):

        # Some of the URIs are broken, invalid, or missing and should trigger warnings.

        input = pd.DataFrame({
            'uris': [
                'https://www.dd', 'www.google.de', 'https://www.google.de',
                'http://dbpedia.org/resource/Munich', np.nan
            ]
        })

        query = "SELECT DISTINCT ?uri ?adresse ?lat WHERE {VALUES (?uri) {(<**URI**>)} ?uri dbp:adresse ?adresse. ?uri geo:lat ?lat} ORDER BY ?adresse LIMIT 2"

        with pytest.warns(UserWarning) as record:
            result = uri_querier(input, "uris", query, progress=True)

        assert len(record) == 3
        assert record[0].message.args[
            0] == "https://www.dd is not a valid URI."
        assert record[1].message.args[
            0] == "www.google.de might not be a valid URI."
        assert record[2].message.args[
            0] == "https://www.google.de might not be dereferencable."

        expected_result_df = pd.DataFrame({
            'uri': [
                'http://dbpedia.org/resource/Munich',
                'http://dbpedia.org/resource/Munich'
            ],
            'adresse': ['Marienplatz 8', '80331'],
            'lat': [48.1333, 48.1333]
        })

        pd.testing.assert_frame_equal(result,
                                      expected_result_df,
                                      check_like=True)
def hierarchy_graph_generator(
        col,
        hierarchy_relation="http://www.w3.org/2000/01/rdf-schema#subClassOf",
        max_hierarchy_depth=None,
        endpoint=DBpedia,
        uri_data_model=False,
        progress=False,
        caching=True):
    """Computes a hierarchy graph from an original set of features, where 
    directed edges symbolise a hierarchy relation from subclass to superclass.

    Args:
        col (pd.Series): The classes/categories for which the hierarchy graph
            is generated.
        hierarchy_relation (str, optional): The hierarchy relation to be used.
            Defaults to "http://www.w3.org/2000/01/rdf-schema#subClassOf".
        max_hierarchy_depth (int, optional): Number of jumps in hierarchy. If 
            None, transitive jumps are used. Defaults to None.
        endpoint (Endpoint, optional): Link to the SPARQL endpoint that should
            be queried. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        nx.DiGraph: Graph where edges point to direct superclasses of
        nodes.
    """

    # warn if wrong configurations are used and correct them
    cond_subclass = hierarchy_relation ==\
         "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    if cond_subclass and max_hierarchy_depth:
        warnings.warn("""If you use subClass with a maximum hierarchy depth, 
        meaningless superclasses are generated. 
        Max_hierarchy_depth is set to None instead""")
        max_hierarchy_depth = None

    cond_broader = hierarchy_relation ==\
         "http://www.w3.org/2004/02/skos/core#broader"
    if cond_broader and max_hierarchy_depth is None:
        warnings.warn("""Transitive superclass generation does not work for
        categories. Max_hierarchy_depth is set to 1. For higher depths, set
        max_hierarchy_depth to a higher integer""")
        max_hierarchy_depth = 1

    # Initialise the graph
    DG = nx.DiGraph()
    # if column contains only missings return empty graph
    if col.isna().all():
        return DG
    current_level = col.copy()

    # in this case the query contains all future hierarchy levels and queries
    # them directly
    if max_hierarchy_depth and not uri_data_model:
        query = hierarchy_query_creator(col, hierarchy_relation,
                                        max_hierarchy_depth, uri_data_model)
        results = endpoint_wrapper(query,
                                   endpoint,
                                   return_XML=True,
                                   caching=caching)
        DG, _ = create_graph_from_raw(DG, results, max_hierarchy_depth, None,
                                      uri_data_model)

    # here the "broader" steps have to be added sequentially from level to
    # level until the max_hierarchy_depth is reached
    elif max_hierarchy_depth and uri_data_model:
        hierarchy_level = 0
        while not current_level.empty and hierarchy_level < max_hierarchy_depth:
            query = hierarchy_query_creator(current_level, hierarchy_relation,
                                            max_hierarchy_depth,
                                            uri_data_model)
            temp_frame = pd.DataFrame(current_level)
            results = uri_querier(temp_frame,
                                  current_level.name,
                                  query,
                                  progress=progress,
                                  caching=caching)

            current_level = list()
            DG, current_level = create_graph_from_raw(DG, results,
                                                      max_hierarchy_depth,
                                                      current_level,
                                                      uri_data_model)

            hierarchy_level += 1

    # iteratively loop from hierarchy level to hierarchy level until no
    # more superclasses are found --> transitive without maximum
    else:
        while not current_level.empty:
            query = hierarchy_query_creator(current_level, hierarchy_relation,
                                            max_hierarchy_depth,
                                            uri_data_model)
            if uri_data_model:
                temp_frame = pd.DataFrame(current_level)
                results = uri_querier(temp_frame,
                                      current_level.name,
                                      query,
                                      progress=progress,
                                      caching=caching)
            else:
                results = endpoint_wrapper(query,
                                           endpoint,
                                           return_XML=True,
                                           caching=caching)
            current_level = list()
            DG, current_level = create_graph_from_raw(DG, results,
                                                      max_hierarchy_depth,
                                                      current_level,
                                                      uri_data_model)

    # Find cycles and break them
    while not nx.is_directed_acyclic_graph(DG):
        try:
            cycle = nx.find_cycle(DG)
            backwards_path = cycle[1]
            DG.remove_edge(*backwards_path)
        except nx.NetworkXNoCycle:
            pass

    return DG
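A minimal usage sketch for hierarchy_graph_generator (illustrative class URIs; a reachable DBpedia endpoint and the library's helper functions are assumed):

import pandas as pd

classes = pd.Series([
    "http://dbpedia.org/ontology/City",
    "http://dbpedia.org/ontology/Settlement"
], name="types")

# Transitive rdfs:subClassOf hierarchy queried from the default DBpedia endpoint
dg = hierarchy_graph_generator(classes)
print(dg.number_of_nodes(), dg.number_of_edges())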
Example #4
def label_schema_matching(df,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          to_lowercase=True,
                          remove_prefixes=True,
                          remove_punctuation=True,
                          prefix_threshold=1,
                          progress=True,
                          caching=True):
    """A schema matching method by checking for attribute -- rdfs:label between 
    links.

    Args:
        df (pd.DataFrame): The dataframe where matching attributes are supposed 
            to be found.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried. Defaults 
            to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        to_lowercase (bool, optional): Converts queried strings to lowercase.
            Defaults to True.
        remove_prefixes (bool, optional): Removes prefixes of queried strings.
            Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from queried
            strings. Defaults to True.
        prefix_threshold (int, optional): The number of occurrences after which 
            a prefix is considered "common". Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column indicating whether their labels match (1) or not (0).
    """

    matches = pd.DataFrame(columns=["uri_1", "uri_2", "same_label"])

    # Get URIs from the column names
    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # transform attributes to sparql values list form
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # Query these URIs for the label
        query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"
        labels = uri_querier(
            pd.DataFrame(cat_cols_stripped),
            0,
            query,
            progress=progress,
            caching=caching).drop_duplicates().set_index("value")

    else:

        query = "SELECT ?value ?o WHERE {VALUES (?value) {" + values + \
            "} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"

        # query the equivalent classes/properties
        labels = endpoint_wrapper(query, endpoint,
                                  caching=caching).reset_index(drop=True)

    if labels.empty:
        return matches

    # Get common prefixes

    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)
    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes, remove_punctuation))

    # Create a dictionary
    if labels.index.name == "value":
        labels.reset_index(inplace=True)

    labels_dict = labels.set_index("value").T.to_dict("list")

    # check whether no two attributes share a label (i.e. there are no matches)
    tmp = set()
    for v in labels_dict.values():
        tmp.update(v)
    if len(labels_dict) == len(tmp):
        combinations = list(itertools.combinations(cat_cols_stripped, 2))
        combinations_sorted = [sorted(x) for x in combinations]

        matches = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])
        matches["same_label"] = 0

        return matches

    else:
        # Combine the uris that have the same labels into a DataFrame
        new_labels_dict = collections.defaultdict(list)
        for key, values in labels_dict.items():
            for i in values:
                new_labels_dict[i].append(key)

        df_labels = pd.DataFrame(list(new_labels_dict.values()),
                                 columns=["uri_1", "uri_2"])
        #df_labels["same_label"] = pd.DataFrame(list(new_labels_dict.keys()))
        df_labels.dropna(inplace=True)

        # restrict the order of uris in one row
        for _, row in df_labels.iterrows():
            new_match = {
                "uri_1": min(row["uri_1"], row["uri_2"]),
                "uri_2": max(row["uri_1"], row["uri_2"]),
                "same_label": 1
            }
            matches = matches.append(new_match, ignore_index=True)

        # Get back the uris for which no rdfs:label was returned and turn df into dict
        no_label = pd.DataFrame({
            "value":
            [x for x in cat_cols_stripped if x not in list(labels["value"])],
            "o":
            np.nan
        })
        labels = labels.append(no_label, ignore_index=True)

        full_labels_dict = labels.set_index("value").T.to_dict("list")

        # Create all unique combinations from the URIs, order them alphabetically and turn them into a DataFrame
        combinations = list(itertools.combinations(full_labels_dict.keys(), 2))
        combinations_sorted = [sorted(x) for x in combinations]

        result = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])

        # merge with the non-matched combinations and drop duplicates
        for _, row in result.iterrows():
            new_match = {
                "uri_1": min(row["uri_1"], row["uri_2"]),
                "uri_2": max(row["uri_1"], row["uri_2"]),
                "same_label": 0
            }
            matches = matches.append(new_match, ignore_index=True)

        matches.drop_duplicates(subset=["uri_1", "uri_2"],
                                inplace=True,
                                ignore_index=True)

        return matches
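A minimal usage sketch for label_schema_matching; only columns whose names contain URIs are considered, so the toy dataframe uses illustrative URI column names (a reachable DBpedia endpoint is assumed):

import pandas as pd

df = pd.DataFrame({
    "city": ["Berlin", "Munich"],
    "http://dbpedia.org/ontology/populationTotal": [3600000, 1500000],
    "http://dbpedia.org/property/populationTotal": [3600000, 1500000],
})

# Returns one row per attribute pair with same_label = 1 if the labels match
matches = label_schema_matching(df)
print(matches)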
Example #5
def sameas_linker(
    df, column, new_attribute_name="new_link", progress=True, endpoint=DBpedia, 
    result_filter=None, uri_data_model=False, bundled_mode=True, 
    prefix_lookup=False, caching=True):
    """Function that takes URIs from a column of a DataFrame and queries a
    given SPARQL endpoint for resources which are connected to these URIs via
    owl:sameAs. Found resources are added as new columns to the dataframe and
    the dataframe is returned.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        column (str): Name of the column for whose entities links should be
            found.
        new_attribute_name (str, optional): Name / prefix of the column(s)  
            containing the found links. Defaults to "new_link".
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        result_filter (list, optional): A list filled with regexes (as strings) 
            to filter the results. Defaults to None.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        bundled_mode (bool, optional): If True, all necessary queries are   
            bundled into one query (using the VALUES method). Requires a 
            SPARQL 1.1 implementation! Defaults to True.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing the
        found resources.
    """

    df = df.copy()

    if bundled_mode and not uri_data_model:

        values = " ( <"+df[column].str.cat(sep="> ) ( <")+"> ) "

        query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {" + \
            values+"} ?value owl:sameAs ?sameas_uris . "

        if result_filter != None:

            query = query + \
                "FILTER("+regex_string_generator("?sameas_uris", result_filter)+") "

        query = query+"}"

        result_df = endpoint_wrapper(
            query, endpoint, prefix_lookup=prefix_lookup, caching=caching).drop_duplicates()

    else:

        result_df = pd.DataFrame()

        if uri_data_model:

            query = " SELECT DISTINCT ?value ?sameas_uris WHERE {VALUES (?value) {(<**URI**>)} ?value owl:sameAs ?sameas_uris . "

            if result_filter != None:

                query = query + \
                    "FILTER("+regex_string_generator("str(?sameas_uris)",
                                                     result_filter)+") "

            query = query+"}"

            result_df = uri_querier(
                df, column, query, prefix_lookup=prefix_lookup, progress=progress, caching=caching)

        else:

            if progress:
                iterator = tqdm(df[column].iteritems(), total=df.shape[0])
            else:
                iterator = df[column].iteritems()

            for uri in iterator:

                if pd.isna(uri[1]):

                    pass

                else:

                    query = " SELECT DISTINCT ?value ?sameas_uris WHERE {?value owl:sameAs ?sameas_uris. FILTER (?value = <"+uri[
                        1]+">"

                    if result_filter != None:

                        query = query + \
                            " && ("+regex_string_generator("?sameas_uris",
                                                           result_filter)+")"

                    query = query+") }"

                    result = endpoint_wrapper(query, endpoint, prefix_lookup=prefix_lookup, caching=caching)

                    result_df = result_df.append(result)

        result_df = result_df.rename(
            {"callret-0": "value"}, axis="columns").drop_duplicates().reset_index(drop=True)

    if result_df.empty:

        df[new_attribute_name+"_1"] = np.nan

        return df

    else:

        result_df_grouped = result_df.groupby("value")

        result_df_grouped = result_df_grouped["sameas_uris"].apply(
            lambda x: pd.Series(x.values)).unstack()
        result_df_grouped = result_df_grouped.rename(
            columns={i: new_attribute_name+"_{}".format(i + 1) for i in range(result_df_grouped.shape[1])})

        df = pd.merge(df, result_df_grouped, left_on=column,
                      right_on="value", how="outer")

        return df
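A minimal usage sketch for sameas_linker (illustrative URIs; the result_filter regex restricts the returned owl:sameAs links, here to wikidata.org, and a reachable DBpedia endpoint is assumed):

import pandas as pd

df = pd.DataFrame({
    "uri": ["http://dbpedia.org/resource/Berlin",
            "http://dbpedia.org/resource/Munich"]
})

linked = sameas_linker(df, "uri", new_attribute_name="wikidata_link",
                       result_filter=["wikidata.org"])
print(linked.filter(like="wikidata_link").head())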
Example #6
def string_similarity_matching(df,
                               predicate="rdfs:label",
                               to_lowercase=True,
                               remove_prefixes=True,
                               remove_punctuation=True,
                               similarity_metric="norm_levenshtein",
                               prefix_threshold=1,
                               n=2,
                               progress=True,
                               caching=True):
    """Calculates the string similarity from the text field obtained by
    querying the attributes for the predicate, by default rdfs:label.

    Args:
        df (pd.DataFrame): Dataframe where matching attributes are supposed to
            be found
        predicate (str, optional): The predicate queried to obtain the text
            fields. Defaults to "rdfs:label".
        to_lowercase (bool, optional): Converts queried strings to lowercase.
            Defaults to True.
        remove_prefixes (bool, optional): Removes prefixes of queried strings.
            Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from queried
            strings. Defaults to True.
        similarity_metric (str, optional): Norm by which strings are compared.
            Defaults to "norm_levenshtein".
        prefix_threshold (int, optional): The number of occurrences after which
            a prefix is considered "common". Defaults to 1.
        n (int, optional): Parameter for n-gram and Jaccard similarities. 
            Defaults to 2.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column with
        the string similarity score.
    """

    # Get URIs from the column names

    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # Query these URIs for the predicate (usually the label)

    query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value "
    query += predicate + " ?o. FILTER (lang(?o) = 'en') }"

    labels = uri_querier(pd.DataFrame(cat_cols_stripped),
                         0,
                         query,
                         progress=progress,
                         caching=caching).set_index("value")

    # Get common prefixes

    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)

    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes, remove_punctuation))

    # Create a dictionary that maps the URIs to their result (i.e. label)

    labels.reset_index(inplace=True)
    no_label = pd.DataFrame({
        "value":
        [x for x in cat_cols_stripped if x not in list(labels["value"])],
        "o":
        np.nan
    })
    labels = labels.append(no_label, ignore_index=True)
    labels_dict = labels.set_index("value").T.to_dict("list")
    #labels_dict = labels.to_dict(orient="index")

    # Create all unique combinations from the URIs, order them alphabetically
    # and turn them into a DataFrame

    combinations = list(itertools.combinations(labels_dict.keys(), 2))
    combinations_sorted = [sorted(x) for x in combinations]

    result = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])

    # For each combination in this DataFrame, calculate the string similarity
    # of their results (i.e. labels)

    if progress:
        tqdm.pandas(
            desc="String Similarity Matching: Calculate String Similarities")
        result["value_string"] = result.progress_apply(
            lambda x: calc_string_similarity(x["uri_1"],
                                             x["uri_2"],
                                             labels_dict,
                                             metric=similarity_metric,
                                             n=n),
            axis=1)
    else:
        result["value_string"] = result.apply(lambda x: calc_string_similarity(
            x["uri_1"], x["uri_2"
                          ], labels_dict, metric=similarity_metric, n=n),
                                              axis=1)

    return result
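A minimal usage sketch for string_similarity_matching; the illustrative URI column names are queried for rdfs:label and the cleaned labels are compared pairwise:

import pandas as pd

df = pd.DataFrame({
    "http://dbpedia.org/ontology/populationTotal": [3600000],
    "http://dbpedia.org/property/populationTotal": [3600000],
})

# value_string holds the normalised Levenshtein similarity per attribute pair
sims = string_similarity_matching(df, similarity_metric="norm_levenshtein")
print(sims)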
Example #7
def relational_matching(df,
                        endpoints=[DBpedia, WikiData],
                        uri_data_model=False,
                        match_score=1,
                        progress=True,
                        caching=True):
    """Creates a mapping of matching attributes in the schema by checking for
    owl:sameAs, owl:equivalentClass, owl:equivalentProperty and wdt:P1628 links between 
    them.

    Args:
        df (pd.DataFrame): Dataframe where matching attributes are supposed to 
            be found.
        endpoints (list, optional): SPARQL Endpoints to be queried. Defaults to 
            [DBpedia, WikiData].
        uri_data_model (bool, optional): If enabled, the URI is directly queried
            instead of a SPARQL endpoint. Defaults to False.
        match_score (int, optional): Score of the match: 0 < match_score <= 1. 
            Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column with
        the score, which is always one in case of the relational matching
        unless specified otherwise.
        
    """

    matches = pd.DataFrame(columns=["uri_1", "uri_2", "value"])

    # determine attribute columns
    cat_cols = [col for col in df.columns if re.findall("http:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    if not cat_cols:
        return matches
    # transform attributes to sparql values list form
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # formulate query
        query = "PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
        query += "SELECT ?value ?object WHERE {VALUES (?value) { (<**URI**>)}"
        query += " ?value\
             (owl:equivalentProperty|owl:equivalentClass|owl:sameAs|wdt:P1628)\
                  ?object. }"

        temp_df = pd.DataFrame(cat_cols_stripped, columns=["values"])
        same_cats = uri_querier(temp_df,
                                "values",
                                query,
                                caching=caching,
                                progress=progress)

        if same_cats.empty:
            return matches
        else:
            same_cats = same_cats.drop(
                same_cats[same_cats["value"] == same_cats["object"]].index)

    else:
        if not isinstance(endpoints, list):
            endpoints = [endpoints]

        same_cats = pd.DataFrame(columns=["value", "object"])

        for endpoint in endpoints:

            # formulate query
            query = "PREFIX wdt: <http://www.wikidata.org/prop/direct/>"
            query += "SELECT  ?value ?object WHERE {VALUES (?value) {"
            query += values
            query += "} ?value\
                 (owl:equivalentProperty|owl:equivalentClass|owl:sameAs|wdt:P1628)\
                      ?object. }"

            # query the equivalent classes/properties
            query_result = endpoint_wrapper(query, endpoint, caching=caching)
            if not query_result.empty:
                query_result = query_result.drop_duplicates().\
                    reset_index(drop=True)

            # group equivalent classes/properties for each original attribute
            same_cats = same_cats.append(query_result, ignore_index=True)

    if same_cats.empty:
        return matches

    combinations = list(itertools.combinations(cat_cols_stripped, 2))
    combinations_sorted = pd.DataFrame([sorted(x) for x in combinations],
                                       columns=["uri_1", "uri_2"])

    # detect matches in the attributes
    for _, row in same_cats.iterrows():
        if row["object"] in cat_cols_stripped:
            # if there is a match insert it in alphabetical order into the
            # output matches dataframe
            new_match = {
                "uri_1": min(row["value"], row["object"]),
                "uri_2": max(row["value"], row["object"]),
                "value": match_score
            }
            matches = matches.append(new_match, ignore_index=True)

    matches = matches.drop_duplicates()
    full_matches = combinations_sorted.merge(matches,
                                             on=["uri_1", "uri_2"],
                                             how="outer")
    full_matches["value"] = np.where(full_matches["value"].isna(), 0,
                                     full_matches["value"])

    return full_matches
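A minimal usage sketch for relational_matching; whether the two illustrative attribute URIs are actually linked depends on the queried endpoints:

import pandas as pd

df = pd.DataFrame({
    "http://dbpedia.org/ontology/birthPlace": [1],
    "http://www.wikidata.org/prop/direct/P19": [1],
})

# Queries DBpedia and WikiData for equivalence/sameAs links between the columns
matches = relational_matching(df)
print(matches)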
Example #8
def qualified_relation_generator(df,
                                 columns,
                                 endpoint=DBpedia,
                                 uri_data_model=False,
                                 progress=True,
                                 prefix="Link",
                                 direction="Out",
                                 properties_regex_filter=None,
                                 types_regex_filter=None,
                                 result_type="boolean",
                                 hierarchy=False,
                                 prefix_lookup=False,
                                 caching=True):
    """Qualified relation generator considers not only relations, but also the 
    related types, adding boolean, counts, relative counts or tfidf-values 
    features for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults to 
            "Link".
        direction (str, optional): The direction of the properties to consider: 
            incoming ("In") or outgoing ("Out"). Defaults to "Out".
        properties_regex_filter (str, optional): Regular expression for 
            filtering properties. Defaults to None.
        types_regex_filter (str, optional): Regular expression for filtering 
            types. Defaults to None.
        result_type (str, optional): States whether the results should be 
            boolean ("boolean"), counts ("counts"), relative counts 
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        hierarchy (bool, optional): If True, a hierarchy of all superclasses of 
            the returned types is attached to the resulting dataframe. Defaults 
            to False.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of properties to the knowledge graph
    """

    df = df.copy()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    #convert columns to list to enable iteration
    if not isinstance(columns, list):

        columns = [columns]

    #iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:

            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            if direction == "Out":

                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {" + values + "} ?value ?p ?o. ?o rdf:type ?type. "

            elif direction == "In":

                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {" + values + "} ?s ?p ?value. ?s rdf:type ?type. "

            if properties_regex_filter != None:

                regex_string = regex_string_generator("?p",
                                                      properties_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            if types_regex_filter != None:

                regex_string = regex_string_generator("?type",
                                                      types_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            if direction == "Out":

                query = "SELECT ?value ?p ?o ?type WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?o. ?o rdf:type ?type. "

            elif direction == "In":

                query = "SELECT ?value ?p ?s ?type WHERE {VALUES (?value) {(<**URI**>)} ?s ?p ?value. ?s rdf:type ?type. "

            if properties_regex_filter != None:

                regex_string = regex_string_generator("str(?p)",
                                                      properties_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            if types_regex_filter != None:

                regex_string = regex_string_generator("str(?type)",
                                                      types_regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = uri_querier(df,
                                    col,
                                    query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress,
                                    caching=caching)

    if type(result_df) != type(pd.DataFrame()):

        pass

    if result_df.empty:

        pass

    else:
        if hierarchy:

            hierarchy_col = hierarchy_graph_generator(
                result_df["type"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)

            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        result_df[
            "link_with_type"] = result_df["p"] + "_type_" + result_df["type"]

        result_df = result_df[["value", "link_with_type"]]

        result_df_dummies = result_df.join(
            result_df["link_with_type"].str.get_dummies()).drop(
                "link_with_type", axis=1)

        result_df = get_result_df(
            result_df_dummies, result_type,
            prefix + "_" + direction + "_" + result_type + "_", df, columns)

    if hierarchy:
        # append hierarchy to df as attribute, this will generate a warning but works
        result_df.attrs = {"hierarchy": hierarchyGraph}

    return result_df
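A minimal usage sketch for qualified_relation_generator (illustrative input; a reachable DBpedia endpoint is assumed):

import pandas as pd

df = pd.DataFrame({
    "city": ["Berlin"],
    "uri": ["http://dbpedia.org/resource/Berlin"]
})

# Count features for outgoing relations, qualified by the type of the object
extended = qualified_relation_generator(df, "uri", direction="Out",
                                        result_type="counts")
print(extended.shape)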
Example #9
def specific_relation_generator(
        df,
        columns,
        endpoint=DBpedia,
        uri_data_model=False,
        progress=True,
        direct_relation="http://purl.org/dc/terms/subject",
        hierarchy_relation=None,
        max_hierarchy_depth=1,
        prefix_lookup=False,
        caching=True):
    """Creates attributes from a specific direct relation. Additionally, it is
    possible to append a hierarchy with a user-defined hierarchy relation.

    Args:
        df (pd.DataFrame): the dataframe to extend
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly queried
            instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults 
            to True.
        direct_relation (str, optional): Direct relation used to create
            features. Defaults to "http://purl.org/dc/terms/subject".
        hierarchy_relation (str, optional): Hierarchy relation used to connect 
            categories, e.g. http://www.w3.org/2004/02/skos/core#broader. 
            Defaults to None.
        max_hierarchy_depth (int, optional): Maximal number of hierarchy steps
            taken. Defaults to 1.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: The dataframe with additional features.
    """

    df = df.copy()

    if hierarchy_relation:
        hierarchy_relation = re.sub(r"^.*?https://", "http://",
                                    hierarchy_relation)
        hierarchy = nx.DiGraph()

    direct_relation = re.sub(r"^.*?https://", "http://", direct_relation)

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    if df[columns].isna().all().item():
        return df

    #  iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:
            # Create Sparql Query
            values = "(<" + df[col].str.cat(sep=">) (<") + ">) "
            query = "SELECT  ?value ?object "
            query += " WHERE {VALUES (?value) {" + values
            query += "} ?value (<" + direct_relation + ">) ?object. }"

            # Retrieve query results from endpoint
            query_result = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup, caching=caching).\
                    drop_duplicates().reset_index(drop=True)
        else:
            # Create URI Query
            query = "SELECT ?value ?object WHERE {VALUES (?value) {(<**URI**>)}"
            query += " ?value (<" + direct_relation + ">) ?object. }"

            query_result = uri_querier(df,
                                       col,
                                       query,
                                       prefix_lookup=prefix_lookup,
                                       progress=progress,
                                       caching=caching)

        # delete empty columns (for example when hierarchy relation returns
        # nothing)
        query_result = query_result.dropna(how="all", axis=1)

        # check if there are valid results, if not return the original frame
        if query_result.empty:
            continue

        # extract hierarchy
        if hierarchy_relation:
            hierarchy_col = hierarchy_graph_generator(
                query_result["object"],
                hierarchy_relation=hierarchy_relation,
                max_hierarchy_depth=max_hierarchy_depth,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)
            hierarchy = nx.compose(hierarchy, hierarchy_col)

        query_grouped = query_result.groupby("value")["object"].apply(list)

        # bundle the unique new features
        new_cols = pd.Series(query_grouped.values.sum()).unique()

        # create shape of result dataframe to fill
        df_to_append = pd.DataFrame(columns=new_cols)
        df_to_append["value"] = query_grouped.index

        # check for each URI if it belongs to the category and tick True/False
        for row, new_col in itertools.product(df_to_append.index, new_cols):
            df_to_append.loc[row, new_col] = np.where(
                new_col in query_grouped[df_to_append.loc[row, "value"]], True,
                False).item()

        # merge the new column with the original dataframe
        df_to_append.rename({"value": col}, axis=1, inplace=True)
        df = pd.merge(df, df_to_append, how="left", on=col)

        # rename columns
        if new_cols.any():
            df.columns = [
                col + "_in_boolean_" + name if name in new_cols else name
                for name in df.columns
            ]

    # append hierarchy to df as attribute, this will generate a warning but
    # works
    if hierarchy_relation:
        df.attrs = {"hierarchy": hierarchy}

    return df
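A minimal usage sketch for specific_relation_generator with an attached skos:broader hierarchy (illustrative input; a reachable DBpedia endpoint is assumed):

import pandas as pd

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Berlin"]})

# One boolean column per dct:subject category; the category hierarchy is
# attached to the result under .attrs["hierarchy"]
extended = specific_relation_generator(
    df, "uri",
    hierarchy_relation="http://www.w3.org/2004/02/skos/core#broader")
print(extended.attrs.get("hierarchy"))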
Example #10
def unqualified_relation_generator(df,
                                   columns,
                                   endpoint=DBpedia,
                                   uri_data_model=False,
                                   progress=True,
                                   prefix="Link",
                                   direction="Out",
                                   regex_filter=None,
                                   result_type="boolean",
                                   prefix_lookup=False,
                                   caching=True):
    """Unqualified relation generator creates attributes from the existence of 
    relations and adds boolean, counts, relative counts or tfidf-values features
    for incoming and outgoing relations.

    Args:
        df (pd.DataFrame): Dataframe to which links are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults to 
            "Link".
        direction (str, optional): The direction of the properties to consider: 
            incoming ("In") or outgoing ("Out"). Defaults to "Out".
        regex_filter (str, optional): Regular expression for filtering 
            properties. Defaults to None.
        result_type (str, optional): States whether the results should be 
            boolean ("boolean"), counts ("counts"), relative counts 
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with new columns containing the links of 
        properties to the knowledge graph
    """

    df = df.copy()

    #convert columns to list to enable iteration
    if not isinstance(columns, list):

        columns = [columns]

    #iterate over possibly several link columns
    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if not uri_data_model:

            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            if direction == "Out":

                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) {" + values + "} ?value ?p ?o "

            elif direction == "In":

                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) {" + values + "} ?s ?p ?value "

            if regex_filter != None:

                regex_string = regex_string_generator("?p", regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            if direction == "Out":

                query = "SELECT DISTINCT ?value ?p ?o WHERE {VALUES (?value) { (<**URI**>)} ?value ?p ?o "

            elif direction == "In":

                query = "SELECT DISTINCT ?value ?p ?s WHERE {VALUES (?value) { (<**URI**>)} ?s ?p ?value "

            if regex_filter != None:

                regex_string = regex_string_generator("str(?p)", regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = uri_querier(df,
                                    col,
                                    query,
                                    prefix_lookup=prefix_lookup,
                                    progress=progress,
                                    caching=caching)

    if type(result_df) != type(pd.DataFrame()):

        pass

    if result_df.empty:

        pass

    else:

        result_df_dummies = result_df.join(
            result_df["p"].str.get_dummies()).drop("p", axis=1)

        result_df = get_result_df(
            result_df_dummies, result_type,
            prefix + "_" + direction + "_" + result_type + "_", df, columns)

    return result_df
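A minimal usage sketch for unqualified_relation_generator (illustrative input; a reachable DBpedia endpoint is assumed):

import pandas as pd

df = pd.DataFrame({"uri": ["http://dbpedia.org/resource/Berlin"]})

# Relative frequencies of outgoing properties as new feature columns
extended = unqualified_relation_generator(df, "uri", direction="Out",
                                          result_type="relative")
print(extended.columns[:5])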
Example #11
def direct_type_generator(df,
                          columns,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          progress=True,
                          prefix="",
                          regex_filter=None,
                          result_type="boolean",
                          bundled_mode=True,
                          hierarchy=False,
                          prefix_lookup=False,
                          caching=True):
    """Generator that takes a dataset with (a) link(s) to a knowledge graph and
    queries the type(s) of the linked resources (using rdf:type). The
    resulting types are added as new columns, which are filled either with a
    boolean indicator or a count.

    Args:
        df (pd.DataFrame): Dataframe to which types are added.
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults 
            to True.
        prefix (str, optional): Custom prefix for the SPARQL query. Defaults to 
            "".
        regex_filter (list, optional): A list filled with regexes (as strings) 
            to filter the results. Defaults to None.
        result_type (str, optional): States whether the results should be 
            boolean ("boolean"), counts ("counts"), relative counts 
            ("relative") or tfidf-values ("tfidf"). Defaults to "boolean".
        bundled_mode (bool, optional): If True, all necessary queries are 
            bundled into one query (using the VALUES method). Requires a 
            SPARQL 1.1 implementation! Defaults to True.
        hierarchy (bool, optional): If True, a hierarchy of all superclasses of 
            the returned types is attached to the resulting dataframe. Defaults 
            to False.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Returns dataframe with (a) new column(s) containing the 
        found types.
    """

    df = df.copy()

    final_result_df = pd.DataFrame()

    if hierarchy:
        hierarchyGraph = nx.DiGraph()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Create SPARQL query (based on rdf:type) for each user-specified column

    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for column in iterator:

        # If bundled_mode is selected, all necessary queries for a column are bundled into one query (using the VALUES method). -> Way faster, but less compatible.

        if bundled_mode and not uri_data_model:

            values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "

            query = prefix + \
                " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {" + \
                values+"} ?value rdf:type ?types . "

            if regex_filter != None:

                regex_string = regex_string_generator("?types", regex_filter)

                query = query + "FILTER(" + regex_string + ") "

            query = query + "}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            result_df = pd.DataFrame()

            if uri_data_model:

                query = prefix + \
                    " SELECT DISTINCT ?value ?types WHERE {VALUES (?value) {(<**URI**>)} ?value rdf:type ?types . "

                if regex_filter != None:

                    regex_string = regex_string_generator(
                        "str(?types)", regex_filter)

                    query = query + "FILTER(" + regex_string + ") "

                query = query + "}"

                result_df = uri_querier(df,
                                        column,
                                        query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress,
                                        caching=caching)

            else:

                for uri in df[column].iteritems():

                    if pd.notna(uri[1]):

                        query = prefix + \
                            " SELECT DISTINCT ?value ?types WHERE {?value rdf:type ?types . FILTER (?value = <" + \
                            uri[1]+">"

                        if regex_filter != None:

                            query = query + " && (" + regex_string_generator(
                                "?types", regex_filter) + ")"

                        query = query + ") }"

                        result = endpoint_wrapper(query,
                                                  endpoint,
                                                  prefix_lookup=prefix_lookup,
                                                  caching=caching)

                        result_df = result_df.append(result)

                    else:
                        pass

            result_df = result_df.rename(
                {
                    "callret-0": "value"
                }, axis="columns").drop_duplicates().reset_index(drop=True)

        if hierarchy:
            hierarchy_col = hierarchy_graph_generator(
                result_df["types"],
                hierarchy_relation=
                "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                max_hierarchy_depth=None,
                endpoint=endpoint,
                uri_data_model=uri_data_model,
                progress=progress,
                caching=caching)

            hierarchyGraph = nx.compose(hierarchyGraph, hierarchy_col)

        if result_df.empty:

            result_columns = []
            pass

        else:

            # Results are transformed to a sparse dataframe (rows: looked-up uris; columns: types) with dummy-encoding (0/1) -> Each result is one row

            result_df_dummies = result_df.join(
                result_df.types.str.get_dummies()).drop("types", axis=1)

            # Sparse dataframe is grouped by uri

            result_df_grouped = result_df_dummies.groupby("value").sum()

            # Result columns get prefix (format depends on single or multiple columns)

            if len(columns) > 1:

                result_df_grouped = result_df_grouped.add_prefix("type_")

            else:

                result_df_grouped = result_df_grouped.add_prefix(column +
                                                                 "_type_")

            # Results get concatenated to the queried columns (to be used as identifiers)

            result_df_merged = pd.merge(df[columns],
                                        result_df_grouped,
                                        left_on=column,
                                        right_on="value",
                                        how="outer").drop_duplicates()

            # If multiple columns with URIs are looked up: Current results are merged with the results of previous passes of the loop

            final_result_df = pd.concat([final_result_df, result_df_merged],
                                        sort=False).groupby(
                                            columns,
                                            dropna=False).sum().reset_index()

            # Result columns are determined and converted to the correct dtype

            result_columns = list(
                set(list(final_result_df.columns)) - set(columns))

            final_result_df[result_columns] = final_result_df[
                result_columns].astype("int64")

    if not final_result_df.empty:

        # If result_type is boolean, all values greater than 0 are changed to True, all others to False

        if result_type == "boolean":

            final_result_df[result_columns] = final_result_df[
                result_columns].astype("bool")

        # If result_type is "relative" or "tfidf", calculate the relative counts per row

        elif result_type in ["relative", "tfidf"]:

            # Calculate the relative counts by dividing each row by its sum, fillna(0) to replace missings created by division by zero (when sum=0)
            final_result_df_relative = final_result_df.copy()

            final_result_df_relative[result_columns] = final_result_df[
                result_columns].div(
                    final_result_df[result_columns].sum(axis=1),
                    axis=0).fillna(0)

            # If result_type is "tfidf", use the table of relative counts to create the table of tfidf-values

            if result_type == "tfidf":

                # Calculate idf values

                N = len(final_result_df[result_columns])

                nt = final_result_df[result_columns][
                    final_result_df[result_columns] >= 1].count(axis=0)

                idf = np.log(N / nt).replace(np.inf, 0)

                # Multiply relative counts with idf values

                final_result_df_relative[
                    result_columns] = final_result_df_relative[
                        result_columns].multiply(idf, axis="columns")

            final_result_df = final_result_df_relative.copy()

        # Collected query results are merged into the original dataframe

        df = pd.merge(df, final_result_df, on=columns, how="outer")

    if hierarchy:
        df.attrs = {"hierarchy": hierarchyGraph}

    return df
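
The relative/tfidf weighting above boils down to a few pandas operations. The following standalone sketch (toy data with hypothetical URIs, not part of the generator itself) reproduces the dummy-encoding, the row-wise normalisation and the idf weighting on a minimal "types" result:

import numpy as np
import pandas as pd

# Toy query result: one row per (URI, type) pair, as returned by the lookup.
result_df = pd.DataFrame({
    "value": ["uri_a", "uri_a", "uri_b"],
    "types": ["dbo:City", "dbo:Place", "dbo:Place"]
})

# Dummy-encode the types and aggregate per looked-up URI (as in the generator).
counts = result_df.join(result_df.types.str.get_dummies()).drop(
    "types", axis=1).groupby("value").sum()

# result_type="relative": divide each row by its row sum (0 where the sum is 0).
relative = counts.div(counts.sum(axis=1), axis=0).fillna(0)

# result_type="tfidf": weight the relative counts with idf = log(N / nt).
N = len(counts)
nt = counts[counts >= 1].count(axis=0)
idf = np.log(N / nt).replace(np.inf, 0)
tfidf = relative.multiply(idf, axis="columns")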
Beispiel #12
0
def data_properties_generator(df,
                              columns,
                              endpoint=DBpedia,
                              uri_data_model=False,
                              progress=True,
                              type_filter=None,
                              regex_filter=None,
                              bundled_mode=True,
                              prefix_lookup=False,
                              caching=True):
    """Generator that takes a dataset with a link to a knowledge graph and 
    creates a new feature for each data property of the given resource.

    Args:
        df (pd.DataFrame): Dataframe to which the features will be added
        columns (str/list): Name(s) of column(s) which contain(s) the link(s) 
            to the knowledge graph.
        endpoint (Endpoint, optional): Base string to the knowledge graph; 
            ignored when "uri_data_model" = True. Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        type_filter (str, optional): Property datatype to be selected from 
            results (e.g. xsd:string). If a specific datatype should be 
            excluded, a "- " needs to be prepended (e.g. - xsd:string). 
            Defaults to None.
        regex_filter (str, optional): Regular expression for filtering 
            properties. Defaults to None.
        bundled_mode (bool, optional): If True, all necessary queries are 
            bundled into one query (using the VALUES method); requires a 
            SPARQL 1.1 implementation. Defaults to True.
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Dataframe with a new column for each property.
    """

    df = df.copy()

    # convert columns to list to enable iteration
    if not isinstance(columns, list):
        columns = [columns]

    # Prepare Type Filter Statement (Decode Include/Exclude)

    if type_filter != None:

        if type_filter[0:2] == "- ":
            type_filter_str = " && DATATYPE(?v) != " + type_filter[2:]

        else:
            type_filter_str = " && DATATYPE(?v) = " + type_filter

    # Create SPARQL query for each user-specified column

    if progress:
        iterator = tqdm(columns, desc="Column")
    else:
        iterator = columns

    for col in iterator:

        if bundled_mode and not uri_data_model:

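            # Build a single VALUES block "( <uri1> ) ( <uri2> ) ..." from all URIs in the column, so one query covers every row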
            values = " ( <" + df[col].str.cat(sep="> ) ( <") + "> ) "

            query = "SELECT ?value ?p ?v WHERE {VALUES (?value) {" + \
                values + "} ?value ?p ?v FILTER(isLITERAL(?v)"

            if type_filter != None:

                query = query + type_filter_str

            if regex_filter != None:

                query = query + " && regex(?p, \"" + regex_filter + "\")"

            query = query + ")}"

            result_df = endpoint_wrapper(
                query, endpoint, prefix_lookup=prefix_lookup,
                caching=caching).drop_duplicates().reset_index(drop=True)

        else:

            result_df = pd.DataFrame()

            if uri_data_model:

                query = "SELECT DISTINCT ?value ?p ?v WHERE {VALUES (?value) {(<**URI**>)} ?value ?p ?v FILTER(isLITERAL(?v)"

                if type_filter != None:

                    query = query + type_filter_str

                if regex_filter != None:

                    query = query + " && regex(?p, \"" + regex_filter + "\")"

                query = query + ")}"

                result_df = uri_querier(df,
                                        col,
                                        query,
                                        prefix_lookup=prefix_lookup,
                                        progress=progress,
                                        caching=caching)

            else:
                for uri in df[col].items():

                    if pd.notna(uri[1]):

                        query = "SELECT DISTINCT ?value ?p ?v WHERE {?value ?p ?v . FILTER (?value = <" + \
                            uri[1]+"> && (isLITERAL(?v))"

                        if type_filter != None:

                            query = query + type_filter_str

                        if regex_filter != None:

                            query = query + " && regex(?p, \"" + regex_filter + "\")"

                        query = query + ")} "

                        result = endpoint_wrapper(query,
                                                  endpoint,
                                                  prefix_lookup=prefix_lookup,
                                                  caching=caching)

                        result_df = pd.concat([result_df, result])

                    else:
                        pass

        if result_df.empty:

            pass

        else:

            # Results are pivoted to a wide dataframe (rows: looked-up uris; columns: data properties) -> each looked-up URI is one row

            result_df["p"] = col + "_data_" + result_df["p"]

            # transform values into new columns (if a property has several values for a URI, one is picked at random by the aggregation function)

            result_df = result_df.pivot_table(values="v",
                                              index="value",
                                              columns="p",
                                              aggfunc=np.random.choice)

            # append properties to dataframe

            df = pd.merge(df,
                          result_df,
                          how="left",
                          left_on=col,
                          right_on="value")

    return df
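
A minimal usage sketch for data_properties_generator, assuming the definition above is available together with its module-level dependencies and that the public DBpedia endpoint is reachable; the column name and the regex filter are purely illustrative:

import pandas as pd

cities = pd.DataFrame({
    "uri": ["http://dbpedia.org/resource/Berlin",
            "http://dbpedia.org/resource/Munich"]
})

# Keep only literal properties whose predicate matches "population"; every
# matching data property is added as a new column prefixed with "uri_data_".
cities_enriched = data_properties_generator(cities,
                                            "uri",
                                            regex_filter="population",
                                            progress=False)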
Beispiel #13
0
def check_uri_redirects(
        df,
        column,
        replace=True,
        custom_name_postfix=None,
        redirection_property="http://dbpedia.org/ontology/wikiPageRedirects",
        endpoint=DBpedia,
        regex_filter="dbpedia",
        bundled_mode=True,
        uri_data_model=False,
        progress=True,
        caching=True):
    """Takes a column of URIs from a DataFrame and checks for each if it has a 
    redirection set by the endpoint. If this is the case, the URI it redirects 
    to is either added in a new column or replaces the original URI.

    Args:
        df (pd.DataFrame): Dataframe for which the URIs should be inspected.
        column (str): Name of the column that contains the URIs that should be 
            checked.
        replace (bool, optional): If True: URIs that get redirected will be 
            replaced with the new URI; If False: A new column, containing the 
            result for each URI, is added to the DataFrame. Defaults to True.
        custom_name_postfix (str, optional): Custom postfix for the newly 
            created column (in case "replace" is set to False). Defaults to None.
        redirection_property (str, optional): Relation/Property URI that 
            signals a redirect for this endpoint. Defaults to 
            "http://dbpedia.org/ontology/wikiPageRedirects".
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried; ignored 
            when "uri_data_model" = True. Defaults to DBpedia.
        regex_filter (str, optional): Just URIs matching the specified RegEx 
            are checked for redirects. Defaults to "dbpedia".
        bundled_mode (bool, optional): If True, all necessary queries are 
            bundled into one query (using the VALUES method). - Requires a 
            SPARQL 1.1 implementation!; ignored when "uri_data_model" = True. 
            Defaults to True.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Raises:
        ValueError: Raised if 'custom_name_postfix' is set to "" instead of 
            None.

    Returns:
        pd.DataFrame: Returns dataframe with cleaned links / a new column.
    """

    if custom_name_postfix == "":

        raise ValueError(
            "'custom_name_postfix' can't be an empty string. If you don't want to use a custom_name_postfix, please set the attribute to None"
        )

    df = df.copy()

    if bundled_mode and not uri_data_model:

        values = " ( <" + df[column].str.cat(sep="> ) ( <") + "> ) "

        query = "SELECT DISTINCT ?value ?redirect WHERE {VALUES (?value) {" + values + "} ?value <" + redirection_property + "> ?redirect . }"

        result_df = endpoint_wrapper(
            query, endpoint,
            caching=caching).drop_duplicates().reset_index(drop=True)

    else:

        result_df = pd.DataFrame()

        if uri_data_model:

            query = "SELECT DISTINCT ?value ?redirect WHERE {VALUES (?value) {(<**URI**>)} ?value <" + redirection_property + "> ?redirect . }"

            result_df = uri_querier(df,
                                    column,
                                    query,
                                    regex_filter=regex_filter,
                                    progress=progress,
                                    caching=caching)

        else:

            for uri in df[column].items():

                if pd.notna(uri[1]):

                    query = "SELECT DISTINCT ?value ?redirect WHERE {?value <" + redirection_property + "> ?redirect . FILTER (?value = <" + uri[
                        1] + ">) }"

                    result = endpoint_wrapper(query, endpoint, caching=caching)

                    result_df = pd.concat([result_df, result])

                else:
                    pass

        result_df = result_df.rename({
            "callret-0": "value"
        }, axis="columns").drop_duplicates().reset_index(drop=True)

    if result_df.empty:

        return df

    else:

        if custom_name_postfix == None:

            new_attribute_name = column + "_redirect"

        else:

            new_attribute_name = column + custom_name_postfix

        result_df = pd.merge(df,
                             result_df,
                             how="left",
                             left_on=column,
                             right_on="value").drop("value", axis=1).rename(
                                 columns={"redirect": new_attribute_name})

        if replace:

            result_df.loc[(pd.isnull(result_df[new_attribute_name])),
                          new_attribute_name] = result_df[column]
            result_df.drop(column, axis=1, inplace=True)
            result_df.rename(columns={new_attribute_name: column},
                             inplace=True)

    return result_df
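
A usage sketch for check_uri_redirects, assuming the definition above and a reachable DBpedia endpoint; http://dbpedia.org/resource/NYC is used here because it typically redirects to the canonical New_York_City resource:

import pandas as pd

df_uris = pd.DataFrame({
    "uri": ["http://dbpedia.org/resource/NYC",
            "http://dbpedia.org/resource/Berlin"]
})

# replace=True overwrites redirected URIs in place in the "uri" column ...
df_clean = check_uri_redirects(df_uris, "uri", replace=True, progress=False)

# ... replace=False keeps the original column and adds "uri_redirect" instead.
df_annotated = check_uri_redirects(df_uris, "uri", replace=False, progress=False)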
Beispiel #14
0
def link_explorer(df,
                  base_link_column,
                  number_of_hops=1,
                  links_to_follow=["owl:sameAs"],
                  lod_sources=[],
                  exclude_sources=[],
                  prefix_lookup=False,
                  progress=True,
                  caching=True):
    """Follows the defined links starting from a base link to a certain number 
    of hops. Adds the discovered links as new columns to the dataframe.

    Args:
        df (pd.DataFrame): Dataframe with a base link.
        base_link_column (str): Name of the column which contains the base 
            link to start with.
        number_of_hops (int, optional): Depth of exploration of the LOD cloud. 
            Defaults to 1.
        links_to_follow (list, optional): Names of links that should be 
            followed. Defaults to ["owl:sameAs"].
        lod_sources (list, optional): Restrict exploration to certain datasets. 
            Use strings or regular expressions to define the allowed datasets. 
            Defaults to [].
        exclude_sources (list, optional): Exclude certain datasets from 
            exploration. Use strings or regular expressions to define the 
            datasets. Defaults to [].
        prefix_lookup (bool/str/dict, optional):
                        True: Namespaces of prefixes will be looked up at 
                        prefix.cc and added to the sparql query.
                        str: User provides the path to a json-file with 
                        prefixes and namespaces.
                        dict: User provides a dictionary with prefixes and 
                        namespaces.
                        Defaults to False.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.
            
    Returns:
        pd.DataFrame: Dataframe with a new column for each discovered link.
    """
    
    if not isinstance(links_to_follow, list):
        links_to_follow = [links_to_follow]

    if not isinstance(exclude_sources, list):
        exclude_sources = [exclude_sources]

    if not isinstance(lod_sources, list):
        lod_sources = [lod_sources]
        
    all_links = list(df[base_link_column])

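    # Hop query template: both {} placeholders are filled with the hop number
    # (producing ?uri1, ?uri2, ...) and **URI** is substituted per row by uri_querier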
    query_raw = " SELECT DISTINCT ?value ?uri{} WHERE {{VALUES (?value) {{(<**URI**>)}} ?value " + "|".join(links_to_follow) + " ?uri{} }} "
    
    df_merged = pd.DataFrame()
    df_all = pd.DataFrame()

    if progress:
        iterator = tqdm(
            range(1,number_of_hops+1), desc="Link Explorer - Performing Hops.")
    else:
        iterator = range(1,number_of_hops+1)
    
    for hop in iterator: 

        query = query_raw.format(str(hop),str(hop))
        
        if hop == 1:
            df_result = uri_querier(df, base_link_column, query, prefix_lookup=prefix_lookup, caching=caching, progress=progress)
        else:
            df_result = uri_querier(df_result, "uri"+str(hop-1), query, prefix_lookup=prefix_lookup, caching=caching, progress=progress)

        if df_result.empty:
            break
        
        # eliminate duplicate links
        df_result = df_result[~df_result["uri"+str(hop)].isin(all_links)]

        # filter sources               
        if lod_sources:
            df_result = df_result[df_result["uri"+str(hop)].str.contains("|".join(lod_sources))]

        # exclude certain sources defined by string or regex  
        if exclude_sources:
            df_result = df_result[~df_result["uri"+str(hop)].str.contains("|".join(exclude_sources))]

        if df_result.empty:
            break

        if df_merged.empty:
            df_merged = df_result
        else:
            df_merged = pd.merge(df_merged, df_result, left_on="uri"+str(hop-1), right_on="value", how="left", suffixes=("", "_y")).drop("value_y",axis=1)    

        df_all = pd.concat([df_all, df_merged[["value","uri"+str(hop)]].rename(columns={"uri"+str(hop) : "uri"})])
        df_all = df_all.dropna().drop_duplicates()
        
        all_links += df_result["uri"+str(hop)].tolist()

    if df_all.empty:
        return df
            
    df_all["count"] = np.nan

    regex_pattern = "^http:/"

    while True:

        regex_pattern += "/[^/]*"

        df_all["pld"] = df_all.apply(
            lambda x: x["pld"] if x["count"] == 1 else re.search(r"{}".format(regex_pattern), x["uri"]).group(), axis=1)

        df_all = df_all.drop("count", axis=1)

        df_with_counts = df_all.groupby(["value","pld"]).size().reset_index(name="count")

        df_all = pd.merge(df_all, df_with_counts, left_on=["value","pld"], right_on=["value","pld"])

        #break loop when all counts are 1
        if (df_all["count"] == 1).all():
            break

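    # After the loop every (value, pld) pair is unique, so aggfunc="first"
    # simply selects that single link and each pld becomes its own column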
    df_pivot = df_all.pivot_table(values="uri", index="value", columns="pld", aggfunc="first").reset_index()
    
    df_final = pd.merge(df, df_pivot, left_on=base_link_column, right_on="value", how="outer").drop("value",axis=1)    
    
    return df_final
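
A usage sketch for link_explorer, assuming the definition above and dereferencable input URIs; the source restriction to Wikidata and the two-hop depth are illustrative choices:

import pandas as pd

df_base = pd.DataFrame({
    "uri": ["http://dbpedia.org/resource/Berlin",
            "http://dbpedia.org/resource/Munich"]
})

# Follow owl:sameAs links for two hops, keeping only links that point to
# datasets matching "wikidata".
df_links = link_explorer(df_base,
                         base_link_column="uri",
                         number_of_hops=2,
                         links_to_follow=["owl:sameAs"],
                         lod_sources=["wikidata"],
                         progress=False)

# df_links gains one column per discovered URI prefix, e.g. a column holding
# the matching wikidata.org entity links.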