def test3_partialmatch(self):
    """All supported metrics score two partially matching labels as expected."""
    label_lookup = {
        "https://test.me/A": "Hello this is a test string.",
        "https://test.me/B": "Hello this is another test string.",
    }
    # Expected score per metric, in the order the metrics are evaluated.
    expected = {
        "norm_levenshtein": 0.9,
        "partial_levenshtein": 0.79,
        "token_sort_levenshtein": 0.9,
        "token_set_levenshtein": 0.96,
        "ngram": 1 - 0.19117647058823528,
        "jaccard": 1 - 0.24137931034482762,
    }
    scores = [
        calc_string_similarity(uri_1="https://test.me/A",
                               uri_2="https://test.me/B",
                               label_dict=label_lookup,
                               metric=metric_name)
        for metric_name in expected
    ]
    assert scores == list(expected.values())
def test2_nan(self):
    """Every metric must return NaN when one of the two labels is missing.

    NOTE(fix): the previous assertion compared two lists of ``np.nan``
    with ``==``.  That only passed via CPython's identity fast-path in
    list comparison (same ``np.nan`` object on both sides); a NaN
    produced any other way (e.g. ``float("nan")``) would break it, since
    ``nan == nan`` is False.  Check for NaN explicitly instead.
    """
    metrics = [
        "norm_levenshtein", "partial_levenshtein", "token_sort_levenshtein",
        "token_set_levenshtein", "ngram", "jaccard"
    ]
    uriA = "https://test.me/A"
    uriB = "https://test.me/B"
    str_dict = {
        "https://test.me/A": "Hello this is a test string.",
        "https://test.me/B": np.nan
    }
    results = [
        calc_string_similarity(uri_1=uriA,
                               uri_2=uriB,
                               label_dict=str_dict,
                               metric=metric)
        for metric in metrics
    ]
    # One NaN result per metric, regardless of NaN object identity.
    assert len(results) == len(metrics)
    assert all(np.isnan(r) for r in results)
def test5_wrongmetric(self):
    """An unknown metric name must raise ``ValueError``.

    NOTE(fix): previously all the setup code lived inside the
    ``pytest.raises`` block, so a ``ValueError`` raised by the setup
    itself would have made the test pass without ever exercising
    ``calc_string_similarity``.  Only the call under test belongs inside
    the context manager.
    """
    metric = "nonexisting_metric"
    uriA = "https://test.me/A"
    uriB = "https://test.me/B"
    str_dict = {
        "https://test.me/A": "Hello this is a test string.",
        "https://test.me/B": "Hello this is a test string."
    }
    with pytest.raises(ValueError):
        calc_string_similarity(uri_1=uriA,
                               uri_2=uriB,
                               label_dict=str_dict,
                               metric=metric)
def test4_custommetric(self):
    """A user-supplied callable (raw Levenshtein distance) is applied as-is."""
    label_lookup = {
        "https://test.me/A": "Hello this is a test string.",
        "https://test.me/B": "Hello this is another test string.",
    }
    # Passing a callable instead of a metric name should invoke it directly;
    # the two labels differ by the inserted "nother" -> edit distance 6.
    distance = calc_string_similarity(uri_1="https://test.me/A",
                                      uri_2="https://test.me/B",
                                      label_dict=label_lookup,
                                      metric=Levenshtein().distance)
    assert distance == 6
def string_similarity_matching(df,
                               predicate="rdfs:label",
                               to_lowercase=True,
                               remove_prefixes=True,
                               remove_punctuation=True,
                               similarity_metric="norm_levenshtein",
                               prefix_threshold=1,
                               n=2,
                               progress=True,
                               caching=True):
    """Calculates the string similarity from the text field obtained by
    querying the attributes for the predicate, by default rdfs:label.

    Args:
        df (pd.DataFrame): Dataframe where matching attributes are supposed
            to be found.
        predicate (str, optional): Predicate queried for each URI.
            Defaults to "rdfs:label".
        to_lowercase (bool, optional): converts queried strings to lowercase.
            Defaults to True.
        remove_prefixes (bool, optional): removes prefixes of queried
            strings. Defaults to True.
        remove_punctuation (bool, optional): removes punctuation from queried
            strings. Defaults to True.
        similarity_metric (str, optional): norm by which strings are
            compared. Defaults to "norm_levenshtein".
        prefix_threshold (int, optional): The number of occurences after
            which a prefix is considered "common". Defaults to 1.
        n (int, optional): parameter for n-gram and Jaccard similarities.
            Defaults to 2.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column
        with the string similarity score.
    """
    # Get URIs from the column names (http or https).
    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    # Strip any leading text before the URI scheme.
    # NOTE(fix): the previous pattern "^.*http://" never matched
    # "https://", so https columns with a prefix were left unstripped even
    # though the detection regex above accepts both schemes.
    cat_cols_stripped = [
        re.sub(r"^.*(https?://)", r"\1", col) for col in cat_cols
    ]

    # Query these URIs for the predicate (usually the label).
    query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value "
    query += predicate + " ?o. \nFILTER (lang(?o) = 'en') }"
    labels = uri_querier(pd.DataFrame(cat_cols_stripped),
                         0,
                         query,
                         progress=progress,
                         caching=caching).set_index("value")

    # Get common prefixes (occurring at least `prefix_threshold` times).
    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels).
    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes, remove_punctuation))

    # Create a dictionary that maps the URIs to their result (i.e. label).
    # URIs the query did not resolve get NaN so the similarity function can
    # propagate the missing value.
    labels.reset_index(inplace=True)
    queried_uris = set(labels["value"])  # set membership: O(1) per lookup
    no_label = pd.DataFrame({
        "value":
        [uri for uri in cat_cols_stripped if uri not in queried_uris],
        "o":
        np.nan
    })
    # NOTE(fix): DataFrame.append was deprecated in pandas 1.4 and removed
    # in pandas 2.0; pd.concat is the supported equivalent.
    labels = pd.concat([labels, no_label], ignore_index=True)
    labels_dict = labels.set_index("value").T.to_dict("list")

    # Create all unique combinations from the URIs, order them
    # alphabetically and turn them into a DataFrame.
    combinations = list(itertools.combinations(labels_dict.keys(), 2))
    combinations_sorted = [sorted(pair) for pair in combinations]
    result = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])

    # For each combination in this DataFrame, calculate the string
    # similarity of their results (i.e. labels).
    if progress:
        tqdm.pandas(
            desc="String Similarity Matching: Calculate String Similarities")
        result["value_string"] = result.progress_apply(
            lambda x: calc_string_similarity(x["uri_1"],
                                             x["uri_2"],
                                             labels_dict,
                                             metric=similarity_metric,
                                             n=n),
            axis=1)
    else:
        result["value_string"] = result.apply(
            lambda x: calc_string_similarity(x["uri_1"],
                                             x["uri_2"],
                                             labels_dict,
                                             metric=similarity_metric,
                                             n=n),
            axis=1)
    return result