Code example #1
    def test3_partialmatch(self):
        """Two partially matching strings should score the expected
        similarity value under every supported metric."""

        all_metrics = [
            "norm_levenshtein", "partial_levenshtein",
            "token_sort_levenshtein", "token_set_levenshtein", "ngram",
            "jaccard"
        ]

        labels = {
            "https://test.me/A": "Hello this is a test string.",
            "https://test.me/B": "Hello this is another test string."
        }

        # Expected score per metric, in the same order as all_metrics.
        expected = [
            0.9, 0.79, 0.9, 0.96, 1 - 0.19117647058823528,
            1 - 0.24137931034482762
        ]

        scores = [
            calc_string_similarity(uri_1="https://test.me/A",
                                   uri_2="https://test.me/B",
                                   label_dict=labels,
                                   metric=m) for m in all_metrics
        ]

        assert scores == expected
Code example #2
    def test2_nan(self):
        """When one of the two labels is missing (NaN), every metric
        should return NaN instead of a similarity score."""

        all_metrics = [
            "norm_levenshtein", "partial_levenshtein",
            "token_sort_levenshtein", "token_set_levenshtein", "ngram",
            "jaccard"
        ]

        labels = {
            "https://test.me/A": "Hello this is a test string.",
            "https://test.me/B": np.nan
        }

        scores = [
            calc_string_similarity(uri_1="https://test.me/A",
                                   uri_2="https://test.me/B",
                                   label_dict=labels,
                                   metric=m) for m in all_metrics
        ]

        # List comparison hits the per-element identity fast path, so the
        # shared np.nan object compares equal here despite NaN != NaN.
        assert scores == [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
Code example #3
    def test5_wrongmetric(self):
        """An unknown metric name must raise a ValueError."""

        labels = {
            "https://test.me/A": "Hello this is a test string.",
            "https://test.me/B": "Hello this is a test string."
        }

        with pytest.raises(ValueError):
            calc_string_similarity(uri_1="https://test.me/A",
                                   uri_2="https://test.me/B",
                                   label_dict=labels,
                                   metric="nonexisting_metric")
Code example #4
    def test4_custommetric(self):
        """A callable (here: raw Levenshtein distance) can be passed as
        the metric instead of a metric name."""

        labels = {
            "https://test.me/A": "Hello this is a test string.",
            "https://test.me/B": "Hello this is another test string."
        }

        score = calc_string_similarity(uri_1="https://test.me/A",
                                       uri_2="https://test.me/B",
                                       label_dict=labels,
                                       metric=Levenshtein().distance)

        # "a" -> "another" is an edit distance of 6.
        assert score == 6
Code example #5
def string_similarity_matching(df,
                               predicate="rdfs:label",
                               to_lowercase=True,
                               remove_prefixes=True,
                               remove_punctuation=True,
                               similarity_metric="norm_levenshtein",
                               prefix_threshold=1,
                               n=2,
                               progress=True,
                               caching=True):
    """Calculates the string similarity from the text field obtained by
    querying the attributes for the predicate, by default rdfs:label.

    Args:
        df (pd.DataFrame): Dataframe where matching attributes are supposed to
            be found
        predicate (str, optional): Predicate queried for each URI. Defaults to
            "rdfs:label".
        to_lowercase (bool, optional): converts queried strings to lowercase.
            Defaults to True.
        remove_prefixes (bool, optional): removes prefixes of queried strings.
            Defaults to True.
        remove_punctuation (bool, optional): removes punctuation from queried
            strings. Defaults to True.
        similarity_metric (str, optional): norm by which strings are compared.
            Defaults to "norm_levenshtein".
        prefix_threshold (int, optional): The number of occurrences after which
            a prefix is considered "common". Defaults to 1.
        n (int, optional): parameter for n-gram and Jaccard similarities.
            Defaults to 2.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process. Defaults to
            True.
        caching (bool, optional): Turn result-caching for queries issued during
            the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column with
        the string similarity score.
    """

    # Get URIs from the column names.

    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # Query these URIs for the predicate (usually the label).

    query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value "
    query += predicate + " ?o. FILTER (lang(?o) = 'en') }"

    labels = uri_querier(pd.DataFrame(cat_cols_stripped),
                         0,
                         query,
                         progress=progress,
                         caching=caching).set_index("value")

    # Get common prefixes.

    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels).

    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes, remove_punctuation))

    # Create a dictionary that maps the URIs to their result (i.e. label).
    # URIs that came back without a label are added with NaN so that every
    # pairwise combination below is still generated for them.

    labels.reset_index(inplace=True)
    no_label = pd.DataFrame({
        "value":
        [x for x in cat_cols_stripped if x not in list(labels["value"])],
        "o":
        np.nan
    })
    # pd.concat instead of DataFrame.append: append was deprecated in
    # pandas 1.4 and removed in pandas 2.0.
    labels = pd.concat([labels, no_label], ignore_index=True)
    labels_dict = labels.set_index("value").T.to_dict("list")

    # Create all unique combinations from the URIs, order them alphabetically
    # and turn them into a DataFrame.

    combinations_sorted = [
        sorted(pair) for pair in itertools.combinations(labels_dict.keys(), 2)
    ]

    result = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])

    # For each combination in this DataFrame, calculate the string similarity
    # of their results (i.e. labels). Both branches run the same lambda; they
    # differ only in whether a tqdm progress bar is shown.

    if progress:
        tqdm.pandas(
            desc="String Similarity Matching: Calculate String Similarities")
        apply_fn = result.progress_apply
    else:
        apply_fn = result.apply

    result["value_string"] = apply_fn(
        lambda row: calc_string_similarity(row["uri_1"],
                                           row["uri_2"],
                                           labels_dict,
                                           metric=similarity_metric,
                                           n=n),
        axis=1)

    return result