    def test1_thres1(self):

        input_df = pd.read_csv(
            "test/data/schema_matching_helper/common_prefixes.csv")

        output_expected = ["p1", "p2", "p3", "pA", "pB"]

        output = get_common_prefixes(input_df,
                                     threshold=1,
                                     column_name="label")

        assert output == output_expected

    def test4_thres100(self):

        input_df = pd.read_csv(
            "test/data/schema_matching_helper/common_prefixes.csv")

        output_expected = []

        output = get_common_prefixes(input_df,
                                     threshold=100,
                                     column_name="label")

        assert output == output_expected

# Imports required by the code on this page; helpers such as DBpedia,
# endpoint_wrapper, uri_querier, get_common_prefixes, clean_string and
# calc_string_similarity come from the surrounding project.
import collections
import itertools
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

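# The tests above pin down the behaviour of get_common_prefixes: with
# threshold=1 the prefixes p1..pB in the test CSV count as common, with
# threshold=100 none do. A minimal sketch of a compatible implementation,
# assuming a "prefix" is the text before the first colon of a label and
# "common" means occurring more than `threshold` times; the real helper
# lives in the surrounding project and may differ.
def get_common_prefixes_sketch(df, threshold, column_name="o"):
    labels = df[column_name].dropna().astype(str)
    prefixes = labels[labels.str.contains(":")].str.split(":").str[0]
    counts = prefixes.value_counts()
    return counts[counts > threshold].index.tolist()
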
def label_schema_matching(df,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          to_lowercase=True,
                          remove_prefixes=True,
                          remove_punctuation=True,
                          prefix_threshold=1,
                          progress=True,
                          caching=True):
    """A schema matching method by checking for attribute -- rdfs:label between 
    links.

    Args:
        df (pd.DataFrame): The dataframe where matching attributes are supposed 
            to be found.
        endpoint (Endpoint, optional): SPARQL Endpoint to be queried. Defaults 
            to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly 
            queried instead of a SPARQL endpoint. Defaults to False.
        to_lowercase (bool, optional): Converts queried strings to lowercase.
            Defaults to True.
        remove_prefixes (bool, optional): Removes prefixes of queried strings.
            Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from queried
            strings. Defaults to True.
        prefix_threshold (int, optional): The number of occurrences after which
            a prefix is considered "common". Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process (if 
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with the paired links and a third column
            ("same_label") indicating whether the two attributes share a label.
    """

    matches = pd.DataFrame(columns=["uri_1", "uri_2", "same_label"])

    # Get URIs from the column names
    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # Transform the attributes into a SPARQL VALUES list
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # Query these URIs for the label
        query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"
        labels = uri_querier(
            pd.DataFrame(cat_cols_stripped),
            0,
            query,
            progress=progress,
            caching=caching).drop_duplicates().set_index("value")

    else:

        query = "SELECT ?value ?o WHERE {VALUES (?value) {" + values + \
            "} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"

        # Query the endpoint for the rdfs:label of each attribute
        labels = endpoint_wrapper(query, endpoint,
                                  caching=caching).reset_index(drop=True)

    if labels.empty:
        return matches

    # Get common prefixes

    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)
    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes, remove_punctuation))

    # Create a dictionary
    if labels.index.name == "value":
        labels.reset_index(inplace=True)

    labels_dict = labels.set_index("value").T.to_dict("list")

    # Check whether any label is shared: if the number of URIs equals the
    # number of distinct labels, no two attributes share a label
    all_labels = set()
    for v in labels_dict.values():
        all_labels.update(v)
    if len(labels_dict) == len(all_labels):
        combinations = list(itertools.combinations(cat_cols_stripped, 2))
        combinations_sorted = [sorted(x) for x in combinations]

        matches = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])
        matches["same_label"] = 0

        return matches

    else:
        # Combine the URIs that share the same label into a DataFrame
        new_labels_dict = collections.defaultdict(list)
        for key, label_values in labels_dict.items():
            for label in label_values:
                new_labels_dict[label].append(key)

        df_labels = pd.DataFrame(list(new_labels_dict.values()),
                                 columns=["uri_1", "uri_2"])
        #df_labels["same_label"] = pd.DataFrame(list(new_labels_dict.keys()))
        df_labels.dropna(inplace=True)

        # Order the URIs within each matched pair alphabetically
        new_matches = [{
            "uri_1": min(row["uri_1"], row["uri_2"]),
            "uri_2": max(row["uri_1"], row["uri_2"]),
            "same_label": 1
        } for _, row in df_labels.iterrows()]
        matches = pd.concat([matches, pd.DataFrame(new_matches)],
                            ignore_index=True)

        # Add back the URIs for which no rdfs:label was returned, then turn
        # the DataFrame into a dict
        no_label = pd.DataFrame({
            "value": [x for x in cat_cols_stripped
                      if x not in list(labels["value"])],
            "o": np.nan,
        })
        labels = pd.concat([labels, no_label], ignore_index=True)

        full_labels_dict = labels.set_index("value").T.to_dict("list")

        # Create all unique combinations from the URIs, order them alphabetically and turn them into a DataFrame
        combinations = list(itertools.combinations(full_labels_dict.keys(), 2))
        combinations_sorted = [sorted(x) for x in combinations]

        result = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])

        # Merge in the remaining combinations as non-matches; duplicate pairs
        # are dropped below, keeping the rows already marked as matches
        non_matches = [{
            "uri_1": min(row["uri_1"], row["uri_2"]),
            "uri_2": max(row["uri_1"], row["uri_2"]),
            "same_label": 0
        } for _, row in result.iterrows()]
        matches = pd.concat([matches, pd.DataFrame(non_matches)],
                            ignore_index=True)

        matches.drop_duplicates(subset=["uri_1", "uri_2"],
                                inplace=True,
                                ignore_index=True)

        return matches
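
# Hypothetical usage of label_schema_matching; the dataframe and its
# URI-bearing column names are illustrative, and running this issues live
# SPARQL queries against the DBpedia endpoint.
df_example = pd.DataFrame({
    "entity": ["Berlin", "Paris"],
    "entity_http://dbpedia.org/ontology/country": ["Germany", "France"],
    "entity_http://dbpedia.org/property/country": ["Germany", "France"],
})
label_matches = label_schema_matching(df_example)
# label_matches has one row per URI pair: uri_1, uri_2 and same_label
# (1 if the two attributes share a cleaned rdfs:label, else 0).
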
def string_similarity_matching(df,
                               predicate="rdfs:label",
                               to_lowercase=True,
                               remove_prefixes=True,
                               remove_punctuation=True,
                               similarity_metric="norm_levenshtein",
                               prefix_threshold=1,
                               n=2,
                               progress=True,
                               caching=True):
    """Calculates the string similarity from the text field obtained by
    querying the attributes for the predicate, by default rdfs:label.

    Args:
        df (pd.DataFrame): The dataframe where matching attributes are
            supposed to be found.
        predicate (str, optional): The predicate queried for each attribute.
            Defaults to "rdfs:label".
        to_lowercase (bool, optional): Converts queried strings to lowercase.
            Defaults to True.
        remove_prefixes (bool, optional): Removes prefixes of queried strings.
            Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from queried
            strings. Defaults to True.
        similarity_metric (str, optional): Norm by which strings are compared.
            Defaults to "norm_levenshtein".
        prefix_threshold (int, optional): The number of occurrences after which
            a prefix is considered "common". Defaults to 1.
        n (int, optional): Parameter for n-gram and Jaccard similarities.
            Defaults to 2.
        progress (bool, optional): If True, progress bars will be shown to 
            inform the user about the progress made by the process. Defaults to 
            True.
        caching (bool, optional): Turn result-caching for queries issued during 
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column with
        the string similarity score.
    """

    # Get URIs from the column names

    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # Query these URIs for the predicate (usually the label)

    query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value "
    query += predicate + " ?o. FILTER (lang(?o) = 'en') }"

    labels = uri_querier(pd.DataFrame(cat_cols_stripped),
                         0,
                         query,
                         progress=progress,
                         caching=caching).set_index("value")

    # Get common prefixes

    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)

    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes, remove_punctuation))

    # Create a dictionary that maps the URIs to their result (i.e. label)

    labels.reset_index(inplace=True)
    no_label = pd.DataFrame({
        "value": [x for x in cat_cols_stripped
                  if x not in list(labels["value"])],
        "o": np.nan,
    })
    labels = pd.concat([labels, no_label], ignore_index=True)
    labels_dict = labels.set_index("value").T.to_dict("list")
    #labels_dict = labels.to_dict(orient="index")

    # Create all unique combinations from the URIs, order them alphabetically
    # and turn them into a DataFrame

    combinations = list(itertools.combinations(labels_dict.keys(), 2))
    combinations_sorted = [sorted(x) for x in combinations]

    result = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])

    # For each combination in this DataFrame, calculate the string similarity
    # of their results (i.e. labels)

    if progress:
        tqdm.pandas(
            desc="String Similarity Matching: Calculate String Similarities")
        result["value_string"] = result.progress_apply(
            lambda x: calc_string_similarity(x["uri_1"],
                                             x["uri_2"],
                                             labels_dict,
                                             metric=similarity_metric,
                                             n=n),
            axis=1)
    else:
        result["value_string"] = result.apply(
            lambda x: calc_string_similarity(x["uri_1"],
                                             x["uri_2"],
                                             labels_dict,
                                             metric=similarity_metric,
                                             n=n),
            axis=1)

    return result
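
# Hypothetical usage of string_similarity_matching, reusing df_example from
# above; "value_string" holds the pairwise similarity score (for
# "norm_levenshtein" a value in [0, 1], higher meaning more similar).
similarity_scores = string_similarity_matching(df_example,
                                               similarity_metric="norm_levenshtein")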