def test6_missing(self):
    common_prefixes = ["NameofTestElement", "ABC"]
    string = np.nan
    result = clean_string(string,
                          common_prefixes=common_prefixes,
                          to_lowercase=True,
                          remove_prefixes=True,
                          remove_punctuation=True)
    assert pd.isna(result)
def test3_lower(self):
    common_prefixes = []
    string = "NameofTestElement:1_2"
    result_exp = "nameoftestelement:1_2"
    result = clean_string(string,
                          common_prefixes=common_prefixes,
                          to_lowercase=True,
                          remove_prefixes=False,
                          remove_punctuation=False)
    assert result_exp == result
def test2_all_noprefix_colon(self):
    common_prefixes = []
    string = "NameofTestElement:1"
    result_exp = "nameoftestelement1"
    result = clean_string(string,
                          common_prefixes=common_prefixes,
                          to_lowercase=True,
                          remove_prefixes=True,
                          remove_punctuation=True)
    assert result_exp == result
def test1_all_multiplecolons(self):
    common_prefixes = ["Special_Prefix_Thing"]
    string = "Special_Prefix_Thing:NameofTestElement:1"
    result_exp = "nameoftestelement1"
    result = clean_string(string,
                          common_prefixes=common_prefixes,
                          to_lowercase=True,
                          remove_prefixes=True,
                          remove_punctuation=True)
    assert result_exp == result
def test5_prefixes(self):
    common_prefixes = ["NameofTestElement", "ABC"]
    string = "NameofTestElement:1_2"
    result_exp = "1_2"
    result = clean_string(string,
                          common_prefixes=common_prefixes,
                          to_lowercase=False,
                          remove_prefixes=True,
                          remove_punctuation=False)
    assert result_exp == result
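# The tests above exercise clean_string, but its implementation is defined
# elsewhere in the package. The following is a minimal sketch reconstructed
# from the expected test results, assuming a common prefix is stripped
# together with its trailing colon (test5) and that prefix removal happens
# before lowercasing (test1); the real cleaning logic may differ in detail.
import re

import pandas as pd


def clean_string(string, common_prefixes, to_lowercase=True,
                 remove_prefixes=True, remove_punctuation=True):
    # Missing values pass through unchanged (see test6_missing).
    if pd.isna(string):
        return string
    if remove_prefixes:
        for prefix in common_prefixes:
            # test5_prefixes implies "<prefix>:" is stripped as a unit.
            if string.startswith(prefix + ":"):
                string = string[len(prefix) + 1:]
                break
    if to_lowercase:
        string = string.lower()
    if remove_punctuation:
        # Keep word characters and whitespace, drop the rest (":" etc.);
        # the exact punctuation set is an assumption.
        string = re.sub(r"[^\w\s]", "", string)
    return string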
def label_schema_matching(df,
                          endpoint=DBpedia,
                          uri_data_model=False,
                          to_lowercase=True,
                          remove_prefixes=True,
                          remove_punctuation=True,
                          prefix_threshold=1,
                          progress=True,
                          caching=True):
    """A schema matching method that checks whether attributes share the
    same rdfs:label.

    Args:
        df (pd.DataFrame): The dataframe where matching attributes are
            supposed to be found.
        endpoint (Endpoint, optional): SPARQL endpoint to be queried.
            Defaults to DBpedia.
        uri_data_model (bool, optional): If enabled, the URI is directly
            queried instead of a SPARQL endpoint. Defaults to False.
        to_lowercase (bool, optional): Converts queried strings to
            lowercase. Defaults to True.
        remove_prefixes (bool, optional): Removes prefixes of queried
            strings. Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from
            queried strings. Defaults to True.
        prefix_threshold (int, optional): The number of occurrences after
            which a prefix is considered "common". Defaults to 1.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process (if
            "uri_data_model" = True). Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column
        indicating whether they share a label (1) or not (0).
    """

    matches = pd.DataFrame(columns=["uri_1", "uri_2", "same_label"])

    # Get URIs from the column names
    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # Transform the attributes into a SPARQL VALUES list
    values = "(<" + pd.Series(cat_cols_stripped).str.cat(sep=">) (<") + ">) "

    if uri_data_model:
        # Query these URIs for their labels
        query = ("SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} "
                 "?value rdfs:label ?o. FILTER (lang(?o) = 'en') }")
        labels = uri_querier(
            pd.DataFrame(cat_cols_stripped),
            0,
            query,
            progress=progress,
            caching=caching).drop_duplicates().set_index("value")
    else:
        query = "SELECT ?value ?o WHERE {VALUES (?value) {" + values + \
            "} ?value rdfs:label ?o. FILTER (lang(?o) = 'en') }"
        # Query the endpoint for the labels
        labels = endpoint_wrapper(query, endpoint,
                                  caching=caching).reset_index(drop=True)

    if labels.empty:
        return matches

    # Get common prefixes
    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)
    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes,
        remove_punctuation))

    # Create a dictionary that maps each URI to its label(s)
    if labels.index.name == "value":
        labels.reset_index(inplace=True)
    labels_dict = labels.set_index("value").T.to_dict("list")

    # Check whether any two URIs share a label; if every label is unique
    # there are no matches
    tmp = set()
    for v in labels_dict.values():
        tmp.update(v)

    if len(labels_dict) == len(tmp):
        combinations = list(itertools.combinations(cat_cols_stripped, 2))
        combinations_sorted = [sorted(x) for x in combinations]
        matches = pd.DataFrame(combinations_sorted,
                               columns=["uri_1", "uri_2"])
        matches["same_label"] = 0
        return matches
    else:
        # Combine the URIs that share a label into a DataFrame (this
        # assumes a label is shared by at most two URIs)
        new_labels_dict = collections.defaultdict(list)
        for uri, uri_labels in labels_dict.items():
            for label in uri_labels:
                new_labels_dict[label].append(uri)
        df_labels = pd.DataFrame(list(new_labels_dict.values()),
                                 columns=["uri_1", "uri_2"])
        df_labels.dropna(inplace=True)

        # Fix the order of the URIs within a row (alphabetical)
        matched_rows = [{
            "uri_1": min(row["uri_1"], row["uri_2"]),
            "uri_2": max(row["uri_1"], row["uri_2"]),
            "same_label": 1
        } for _, row in df_labels.iterrows()]
        matches = pd.concat([matches, pd.DataFrame(matched_rows)],
                            ignore_index=True)

        # Add back the URIs that returned no rdfs:label and turn the
        # DataFrame into a dict
        no_label = pd.DataFrame({
            "value": [
                x for x in cat_cols_stripped
                if x not in list(labels["value"])
            ],
            "o": np.nan
        })
        labels = pd.concat([labels, no_label], ignore_index=True)
        full_labels_dict = labels.set_index("value").T.to_dict("list")

        # Create all unique combinations of the URIs, order them
        # alphabetically and turn them into a DataFrame
        combinations = list(
            itertools.combinations(full_labels_dict.keys(), 2))
        combinations_sorted = [sorted(x) for x in combinations]
        result = pd.DataFrame(combinations_sorted,
                              columns=["uri_1", "uri_2"])

        # Merge in the non-matched combinations and drop duplicates; the
        # matched pairs were appended first, so they are the ones kept
        unmatched_rows = [{
            "uri_1": min(row["uri_1"], row["uri_2"]),
            "uri_2": max(row["uri_1"], row["uri_2"]),
            "same_label": 0
        } for _, row in result.iterrows()]
        matches = pd.concat([matches, pd.DataFrame(unmatched_rows)],
                            ignore_index=True)
        matches.drop_duplicates(subset=["uri_1", "uri_2"],
                                inplace=True,
                                ignore_index=True)

        return matches
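# Hypothetical usage sketch for label_schema_matching; the DBpedia attribute
# columns and values below are invented for illustration. Wrapped in a
# helper so that importing this module does not trigger live SPARQL queries.
def _label_schema_matching_example():
    example_df = pd.DataFrame({
        "city": ["Berlin", "Paris"],
        "http://dbpedia.org/ontology/populationTotal": [3644826, 2175601],
        "http://dbpedia.org/property/populationTotal": [3644826, 2175601],
    })
    # Returns one row per unordered pair of URI columns, with
    # same_label == 1 where both attributes share an English rdfs:label
    # after cleaning.
    return label_schema_matching(example_df, progress=False)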
def string_similarity_matching(df,
                               predicate="rdfs:label",
                               to_lowercase=True,
                               remove_prefixes=True,
                               remove_punctuation=True,
                               similarity_metric="norm_levenshtein",
                               prefix_threshold=1,
                               n=2,
                               progress=True,
                               caching=True):
    """Calculates the string similarity of the text fields obtained by
    querying the attributes for the given predicate, by default rdfs:label.

    Args:
        df (pd.DataFrame): Dataframe where matching attributes are supposed
            to be found.
        predicate (str, optional): Predicate to query the attributes for.
            Defaults to "rdfs:label".
        to_lowercase (bool, optional): Converts queried strings to
            lowercase. Defaults to True.
        remove_prefixes (bool, optional): Removes prefixes of queried
            strings. Defaults to True.
        remove_punctuation (bool, optional): Removes punctuation from
            queried strings. Defaults to True.
        similarity_metric (str, optional): Norm by which strings are
            compared. Defaults to "norm_levenshtein".
        prefix_threshold (int, optional): The number of occurrences after
            which a prefix is considered "common". Defaults to 1.
        n (int, optional): Parameter for n-gram and Jaccard similarities.
            Defaults to 2.
        progress (bool, optional): If True, progress bars will be shown to
            inform the user about the progress made by the process.
            Defaults to True.
        caching (bool, optional): Turn result-caching for queries issued
            during the execution on or off. Defaults to True.

    Returns:
        pd.DataFrame: Two columns with matching links and a third column
        with the string similarity score.
    """

    # Get URIs from the column names
    cat_cols = [col for col in df.columns if re.findall("https*:", col)]
    cat_cols_stripped = [
        re.sub(r"^.*http://", "http://", col) for col in cat_cols
    ]

    # Query these URIs for the predicate (usually the label)
    query = "SELECT ?value ?o WHERE {VALUES (?value) {(<**URI**>)} ?value "
    query += predicate + " ?o. FILTER (lang(?o) = 'en') }"
    labels = uri_querier(pd.DataFrame(cat_cols_stripped),
                         0,
                         query,
                         progress=progress,
                         caching=caching).set_index("value")

    # Get common prefixes
    common_prefixes = get_common_prefixes(labels, prefix_threshold)

    # Clean the results (i.e. the labels)
    labels["o"] = labels["o"].apply(lambda x: clean_string(
        x, common_prefixes, to_lowercase, remove_prefixes,
        remove_punctuation))

    # Create a dictionary that maps the URIs to their result (i.e. label);
    # URIs that returned no result are added with NaN
    labels.reset_index(inplace=True)
    no_label = pd.DataFrame({
        "value": [
            x for x in cat_cols_stripped if x not in list(labels["value"])
        ],
        "o": np.nan
    })
    labels = pd.concat([labels, no_label], ignore_index=True)
    labels_dict = labels.set_index("value").T.to_dict("list")

    # Create all unique combinations of the URIs, order them alphabetically
    # and turn them into a DataFrame
    combinations = list(itertools.combinations(labels_dict.keys(), 2))
    combinations_sorted = [sorted(x) for x in combinations]
    result = pd.DataFrame(combinations_sorted, columns=["uri_1", "uri_2"])

    # For each combination in this DataFrame, calculate the string
    # similarity of their results (i.e. labels)
    if progress:
        tqdm.pandas(
            desc="String Similarity Matching: Calculate String Similarities")
        result["value_string"] = result.progress_apply(
            lambda x: calc_string_similarity(x["uri_1"],
                                             x["uri_2"],
                                             labels_dict,
                                             metric=similarity_metric,
                                             n=n),
            axis=1)
    else:
        result["value_string"] = result.apply(
            lambda x: calc_string_similarity(x["uri_1"],
                                             x["uri_2"],
                                             labels_dict,
                                             metric=similarity_metric,
                                             n=n),
            axis=1)

    return result
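# A minimal sketch of the calc_string_similarity helper used above, limited
# to the "norm_levenshtein" metric. The packaged helper lives elsewhere (and
# also supports n-gram/Jaccard metrics via the n parameter), so treat this
# as an illustration of the expected contract, not its actual code.
import numpy as np
import pandas as pd


def _levenshtein(a, b):
    # Classic dynamic-programming edit distance with a rolling row.
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, start=1):
        curr = [i]
        for j, cb in enumerate(b, start=1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]


def calc_string_similarity(uri_1, uri_2, labels_dict,
                           metric="norm_levenshtein", n=2):
    label_1 = labels_dict[uri_1][0]
    label_2 = labels_dict[uri_2][0]
    # URIs without a label were stored as NaN; no score can be computed.
    if pd.isna(label_1) or pd.isna(label_2):
        return np.nan
    if metric == "norm_levenshtein":
        longest = max(len(label_1), len(label_2))
        if longest == 0:
            return 1.0
        # Normalise the edit distance into a similarity in [0, 1].
        return 1 - _levenshtein(label_1, label_2) / longest
    raise ValueError(f"Unsupported metric in this sketch: {metric}")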