def test_list_like_extract(self):
    """We should be able to use a list-like object for choices."""
    def generate_choices():
        choices = ['a', 'Bb', 'CcC']
        for choice in choices:
            yield choice

    search = 'aaa'
    result = [(value, confidence) for value, confidence in
              process.extract(search, generate_choices())]
    self.assertTrue(len(result) > 0)
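# A minimal standalone sketch (not part of the original test) of what
# extract() returns when choices is a generator, assuming thefuzz is
# installed: a list of (choice, score) tuples.
from thefuzz import process

matches = process.extract('aaa', (c for c in ['a', 'Bb', 'CcC']))
print(matches)  # each element is a (choice, score) tuple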
def test_dict_like_extract(self):
    """We should be able to use a dict-like object for choices,
    not only a dict, and still get dict-like output.
    """
    try:
        from UserDict import UserDict
    except ImportError:
        from collections import UserDict
    choices = UserDict({'aa': 'bb', 'a1': None})
    search = 'aaa'
    result = process.extract(search, choices)
    self.assertTrue(len(result) > 0)
    for value, confidence, key in result:
        self.assertTrue(value in choices.values())
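# A minimal sketch (not part of the original test): when choices is a
# mapping, each result is a (value, score, key) triple rather than a pair,
# which is what the loop above unpacks.
from thefuzz import process

choices = {'id1': 'apple', 'id2': 'apply'}
for value, score, key in process.extract('apple', choices):
    print(key, value, score)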
def correct_spelling(self, nl_string):
    """
    Perform basic spelling correction. Input is tokenized by whitespace,
    and an edit distance is computed for each token that is not an integer
    or a hexadecimal value corresponding to an RGB color encoding.
    """
    # NOTE: Currently, we're doing spelling-correction before any other step
    # (e.g. tokenization, lemmatization). TODO: This will likely change.
    corr_words = []
    # NOTE: Spaces will impact this whitespace-based tokenization + correction.
    for word in simple_tokens.findall(nl_string):
        # NOTE: (Incorrectly-entered) numbers will throw this off.
        if digit_or_rgbhex.match(word):
            # For, e.g., "Undo previous 5" or colors in RGB hexadecimal.
            corr_words.append(word)
        else:
            # NOTE: score_cutoff is a parameter that could be tweaked.
            corr_word = process.extract(word.lower(), self.na_unigrams,
                                        scorer=fuzz.ratio)
            # Group the candidates by score and keep the top-scoring group.
            corr_word = sorted([
                (score, [w for w, s in corr_word if s == score])
                for score in set([s[1] for s in corr_word])
            ])[-1]
            # Break ties: prefer the original word if it is among the
            # top-scoring candidates, otherwise take the first candidate.
            if len(corr_word[1]) > 1:
                if word in corr_word[1]:
                    corr_word = word
                else:
                    corr_word = corr_word[1][0]
            else:
                corr_word = corr_word[1][0]
            if corr_word:
                corr_words.append(corr_word)
            # NOTE: The original implementation dropped words that are not
            # in our "dictionary".
            else:
                corr_words.append(word.lower())
    return ' '.join(corr_words)
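# The method above assumes two module-level regexes that this snippet does
# not show. Hypothetical definitions, consistent with how they are used
# (findall() for whitespace tokenization, match() for ints / RGB hex):
import re

simple_tokens = re.compile(r'\S+')                         # whitespace tokens
digit_or_rgbhex = re.compile(r'^(\d+|#?[0-9a-fA-F]{6})$')  # int or RGB hex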
bazel_queries = {
    "test": [test_query],
    "run": [test_query, binary_query],
    "build": [library_query, test_query, binary_query],
}
# Run the appropriate bazel queries and ask thefuzz to find the best matching
# target, guaranteed to return exactly one result because we set limit=1.
# Combine the results of multiple queries with itertools.chain.
targets = list(
    itertools.chain.from_iterable([
        run(query, stdout=PIPE).stdout.split(b"\n")
        for query in bazel_queries[args.action]
    ]))
target, confidence = process.extract(args.search_query, targets, limit=1)[0]
target = str(target, encoding="utf-8")
print("Found target {} with confidence {}".format(target, confidence))
if args.interactive or confidence < THEFUZZ_MATCH_RATIO_THRESHOLD:
    filtered_targets = process.extract(args.search_query, targets,
                                       limit=NUM_FILTERED_MATCHES_TO_SHOW)
    targets = [filtered_target[0] for filtered_target in filtered_targets]
    target = str(iterfzf.iterfzf(iter(targets)), encoding="utf-8")
    print("User selected {}".format(target))
command = ["bazel", args.action, target]
# Trigger a debug build
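# Hypothetical argument parsing this snippet assumes; the attribute names
# match the ones used above (args.action, args.search_query,
# args.interactive), but the original definitions are not shown.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("action", choices=["build", "test", "run"])
parser.add_argument("search_query")
parser.add_argument("--interactive", action="store_true")
args = parser.parse_args()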
# + tags=["parameters"] upstream = ["retrieve"] folder_output = None # - import os,os.path import pandas as pd params = pd.read_csv(os.path.join(folder_output,"params.txt"),sep="\t",encoding="utf-8") from thefuzz import process prms =params["field_clean"].unique() tmp = [] for prm in prms: _m = process.extract(prm, prms, limit=5) for m in _m: tmp.append([prm,m[0],m[1]]) df = pd.DataFrame(tmp,columns=["field_clean","match","score"]) df.to_csv(os.path.join(folder_output,"terms","matched_params.txt"),sep="\t",index=False)
# -*- coding:utf-8 -*-
"""
Reference: https://github.com/seatgeek/thefuzz
"""
__author__ = "aaron.qiu"

from pprint import pprint

from thefuzz import fuzz
from thefuzz import process

if __name__ == '__main__':
    pprint(fuzz.ratio("this is a test", "this is a test!"))
    pprint(fuzz.partial_ratio("this is a test", "this is a test!"))
    pprint(fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"))
    pprint(fuzz.token_sort_ratio("fuzzy wuzzy was a bear",
                                 "wuzzy fuzzy was a bear"))
    pprint(fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))
    pprint(fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))

    choices = [
        "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"
    ]
    pprint(process.extract("new york jets", choices, limit=2))
    pprint(process.extractOne("cowboys", choices))

    songs = "/data/soft"
    pprint(process.extractOne("System of a down - Hypnotize - apache", songs))
    pprint(process.extractOne("System of a down - Hypnotize - Heroin", songs,
                              scorer=fuzz.token_sort_ratio))
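# extractOne() also accepts a score_cutoff and returns None when no choice
# reaches it, which is handy for rejecting weak matches; a minimal sketch:
match = process.extractOne("red sox", choices, score_cutoff=95)
if match is None:
    print("no sufficiently close match")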
def map_list_to_catalog(data, catalog, output_format="dataframe",
                        thr_accept=90, thr_reject=75, reject_value=None,
                        max_options=3, warnings=False, simplify=True):
    """
    Create an equivalence dictionary between data and a catalog.

    Take a 'data' list of dirty strings that will be mapped to a 'catalog'
    and return the equivalence dictionary with that mapping. Obtains a list
    of options with similarity scores. Uses fuzzywuzzy.process.extract().

    Parameters
    ----------
    data: list of str
        List of strings with the data to be mapped to the catalog
    catalog: list of str
        Catalog
    output_format: "dataframe" or "dictionary"
        If "dataframe", returns a pandas DataFrame where the first column is
        the data and the following columns are the options and their scores.
        If "dictionary", returns a dictionary { data_item : catalog_item }
    thr_accept: int, default '90'
        Threshold value for acceptance. If the top score is higher than this,
        the top value will be assigned and no options shown
    thr_reject: int, default '75'
        Threshold value for rejection. If the top score is lower than this
        value, no equivalence to the catalog will be created and the
        equivalence will be replaced by reject_value
    reject_value: str, default 'None'
        Value that will be returned in "dictionary" if the score is lower
        than thr_reject
    max_options: int, default 3
        Number of options to display
    warnings: bool
        If True, raises warnings for values between thr_accept and thr_reject
    simplify: bool
        Simplify every string before comparison to improve the score

    Returns
    -------
    dataframe
        pandas DataFrame where the first column is the data and the following
        columns are the options and their scores
    dictionary
        Dictionary that allows mapping options to the catalog

    Examples
    --------
    >>> catalog = ["Mouse", "Cat", "Dog", "Human"]
    >>> data = ["mice", "CAT ", "doggo", "PERSON", 999]
    >>> ww.map_list_to_catalog(data, catalog, thr_accept=95, thr_reject=40)
         Data Option1  Score1 Option2  Score2 Option3  Score3
    0    CAT      Cat     100    None     NaN    None     NaN
    1   doggo     Dog      90   Mouse    20.0   Human     0.0
    2    mice   Mouse      44     Cat    29.0   Human    22.0
    3  PERSON  PERSON       0    None     NaN    None     NaN
    4     999     999       0    None     NaN    None     NaN
    >>> ww.map_list_to_catalog(data, catalog, output_format="dictionary",
    ...                        reject_value='Other')
    {'mice': 'Other', 999: 999, 'doggo': 'Dog', 'PERSON': 'Other', 'CAT ': 'Cat'}
    """
    if thr_reject >= thr_accept:
        raise ValueError("Invalid threshold values")
    if output_format == "dictionary":
        res = {}
    elif output_format == "dataframe":
        res = []
    else:
        raise NameError("output_format should be 'dictionary' or 'dataframe'")

    # Deduplicate; if simplifying, map simplified strings back to their
    # original catalog values.
    data_ = list(set(data))
    catalog_ = list(set(catalog))
    if simplify:
        catalog_ = {simplify_string(s): s for s in catalog_}

    for item in data_:
        # Non-string items are passed through with a score of 0.
        if type(item) != str:
            options = [(item, 0)]
        else:
            if simplify:
                options = process.extract(simplify_string(item),
                                          catalog_.keys(), limit=max_options)
                options = [(catalog_[i[0]], i[1]) for i in options]
            else:
                options = process.extract(item, catalog_, limit=max_options)
            top_score = options[0][1]
            if top_score >= thr_accept:
                options = [options[0]]
            elif top_score < thr_reject:
                if warnings:
                    warn_review("REJECT", item, options)
                if reject_value is None:
                    options = [(item, 0)]
                else:
                    options = [(reject_value, 0)]
            elif warnings:
                warn_review("WOBBLY", item, options)
        logging.info("%s %s", item, options)
        if output_format == "dataframe":
            res.append([item] + unnest(options))
        if output_format == "dictionary":
            res[item] = options[0][0]

    if output_format == "dataframe":
        res = DataFrame(res)
        # Create column names: Data, Option1, Score1, Option2, Score2, ...
        n_cols = int((len(res.columns) + 1) / 2)
        cols = [str(i) for i in range(1, n_cols)]
        cols = [["Option" + i, "Score" + i] for i in cols]
        res.columns = ['Data'] + unnest(cols)
        # Sort by top score, then by data value.
        res = res.sort_values(["Score1", "Data"], ascending=[False, False]) \
                 .reset_index(drop=True)
    return res
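# map_list_to_catalog() relies on helpers defined elsewhere in the package
# (simplify_string, unnest, warn_review). Hypothetical minimal versions,
# consistent with how they are called above:
def simplify_string(s):
    """Lowercase and strip whitespace before comparison (an assumption)."""
    return s.strip().lower()

def unnest(nested):
    """Flatten one level of nesting: [[a, b], (c, d)] -> [a, b, c, d]."""
    return [item for sub in nested for item in sub]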
def cluster_strings(data, thr_accept=90, max_options=None, simplify=True,
                    randomize=True):
    """
    Cluster similar strings.

    Take a 'data' list of strings and create clusters of those whose
    Levenshtein similarity score is higher than thr_accept. This is a rough
    algorithm: once a cluster has been formed, none of its elements can
    belong to another cluster.

    Parameters
    ----------
    data: list of str
        List of strings to be clustered
    thr_accept: int, default '90'
        Threshold value for acceptance. Strings scoring at least this value
        against a target join its cluster
    max_options: int, default None
        Number of elements of each cluster to return. If None, returns all
        options, else it returns N options.
    simplify: bool
        Simplify every string before comparison to improve the score
    randomize: bool
        Shuffle the data, since the order of the list affects cluster
        formation.

    Returns
    -------
    list of lists
        List of lists where each inner list contains the strings that
        clustered together by Levenshtein distance.
    """
    if simplify:
        # Map each simplified string to every original that reduces to it.
        dic_simple = {}
        for old in set(data):
            if type(old) == str:
                new = simplify_string(old)
                if new in dic_simple.keys():
                    dic_simple[new].append(old)
                else:
                    dic_simple[new] = [old]
        logging.info("Save string simplification {}".format(dic_simple))
        data_ = list(dic_simple.keys())
    else:
        data_ = list(set(data))

    # Because list order affects cluster formation.
    if randomize:
        shuffle(data_)

    # Group if similar enough.
    # NOTE: With this loop condition, a single leftover element is not
    # returned as its own cluster.
    groups = []
    while len(data_) > 1:
        target = data_.pop()
        if type(target) == str:
            group = process.extract(target, data_)
            group = [t for t, s in group if s >= thr_accept]
            for s in group:
                # Don't match twice.
                data_.remove(s)
        else:
            group = []
        groups.append([target] + group)
    logging.info("Group by fuzz distance {}".format(groups))

    if simplify:
        groups = [[dic_simple[g] for g in group] for group in groups]
        groups = [unnest(group) for group in groups]
        logging.info("Return to original string {}".format(groups))

    if max_options is not None:
        groups = [group[0:max_options] for group in groups]
        if max_options == 1:
            groups = [group[0] for group in groups]
        logging.info("Return only top N options {}".format(groups))

    return groups
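# A minimal usage sketch for cluster_strings() (not from the original
# source); cluster membership can vary because randomize=True shuffles
# the data before grouping.
names = ["Mouse", "mouse ", "MOUSE", "Cat", "Dog"]
clusters = cluster_strings(names, thr_accept=90)
# clusters is a list of lists: e.g. the strings that simplify to "mouse"
# end up grouped together, while dissimilar strings form their own groups.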