Example #1
 def test_list_like_extract(self):
     """We should be able to use a list-like object for choices."""
     def generate_choices():
         choices = ['a', 'Bb', 'CcC']
         for choice in choices:
             yield choice
     search = 'aaa'
     result = [(value, confidence) for value, confidence in
               process.extract(search, generate_choices())]
     self.assertTrue(len(result) > 0)
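
For context, a minimal standalone sketch of the same call (the scores in the comment are illustrative; exact values depend on thefuzz's default scorer):

from thefuzz import process

def generate_choices():
    # any iterable works as choices, including a one-shot generator
    yield from ['a', 'Bb', 'CcC']

# extract() returns a list of (choice, score) pairs,
# e.g. something like [('a', 90), ('Bb', 45), ('CcC', 45)]
print(process.extract('aaa', generate_choices()))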
Example #2
 def test_dict_like_extract(self):
     """We should be able to use a dict-like object for choices, not only a
     dict, and still get dict-like output.
     """
     try:
         from UserDict import UserDict
     except ImportError:
         from collections import UserDict
     choices = UserDict({'aa': 'bb', 'a1': None})
     search = 'aaa'
     result = process.extract(search, choices)
     self.assertTrue(len(result) > 0)
     for value, confidence, key in result:
         self.assertTrue(value in choices.values())
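
The three-item unpacking in the loop above reflects how thefuzz treats mapping choices: the values are scored and (value, score, key) triples come back. A minimal sketch of that behaviour (string values substituted for the test's None so every entry prints cleanly):

from thefuzz import process

choices = {'aa': 'bb', 'a1': 'cc'}
# with a dict-like choices object, each result is (value, score, key)
for value, score, key in process.extract('aaa', choices):
    print(key, value, score)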
Example #3
 def correct_spelling(self, nl_string):
     """ Perform basic spelling correction. Input is tokenized by whitespace, and an edit distance
         is computed for each token that is not an integer or hexidecimal value corresponding to
         a RGB color encoding.:w
     """
     # NOTE: Currently, we're doing spelling-correction before any other step
     #       (e.g. tokenization, lemmatization). TODO: This will likely change.
     corr_words = []
     # NOTE: Spaces will impact this whitespace-based tokenization + correction.
     for word in simple_tokens.findall(nl_string):
         # NOTE: (Incorrectly-entered) numbers will throw this off.
         if digit_or_rgbhex.match(
                 word
         ):  # e.g., "Undo previous 5" or colors in RGB hexadecimal
             corr_words.append(word)
         else:
             # NOTE: score_cutoff is a parameter that could be tweaked.
             corr_word = process.extract(word.lower(),
                                         self.na_unigrams,
                                         scorer=fuzz.ratio)
             # Group candidate words by score and keep the highest-scoring group.
             corr_word = sorted([
                 (score, [w for w, s in corr_word if s == score])
                 for score in set(s[1] for s in corr_word)
             ])[-1]
             if len(corr_word[1]) > 1:
                 if word in corr_word[1]:
                     corr_word = word
                 else:
                     corr_word = corr_word[1][0]
             else:
                 corr_word = corr_word[1][0]
             if corr_word:
                 corr_words.append(corr_word)  # corr_word is a plain string by now
             # NOTE: Original implementation dropped words that are not in our "dictionary"
             else:
                 corr_words.append(word.lower())
     return ' '.join(corr_words)
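
The score-grouping block above can be written more compactly with process.extractOne, which returns the single best (match, score) pair and accepts a score_cutoff keyword. A sketch under those assumptions (correct_word and the cutoff of 80 are illustrative, and this version drops the original's tie-breaking that prefers the input word on equal scores):

from thefuzz import fuzz, process

def correct_word(word, vocabulary, score_cutoff=80):
    # 'vocabulary' stands in for self.na_unigrams above; score_cutoff is
    # the tunable threshold the NOTE comment mentions (80 is an assumption)
    best = process.extractOne(word.lower(), vocabulary,
                              scorer=fuzz.ratio, score_cutoff=score_cutoff)
    # fall back to the lowercased input when nothing clears the cutoff
    return best[0] if best else word.lower()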
Example #4
    bazel_queries = {
        "test": [test_query],
        "run": [test_query, binary_query],
        "build": [library_query, test_query, binary_query],
    }

    # Run the appropriate bazel query and ask thefuzz to find the best matching
    # target, guaranteed to return 1 result because we set limit=1
    # Combine results of multiple queries with itertools.chain
    targets = list(
        itertools.chain.from_iterable([
            run(query, stdout=PIPE).stdout.split(b"\n")
            for query in bazel_queries[args.action]
        ]))
    target, confidence = process.extract(args.search_query, targets,
                                         limit=1)[0]
    target = str(target, encoding="utf-8")

    print("Found target {} with confidence {}".format(target, confidence))

    if args.interactive or confidence < THEFUZZ_MATCH_RATIO_THRESHOLD:
        filtered_targets = process.extract(args.search_query,
                                           targets,
                                           limit=NUM_FILTERED_MATCHES_TO_SHOW)
        targets = [filtered_target[0] for filtered_target in filtered_targets]
        target = str(iterfzf.iterfzf(iter(targets)), encoding="utf-8")
        print("User selected {}".format(target))

    command = ["bazel", args.action, target]

    # Trigger a debug build
Example #5
# + tags=["parameters"]
upstream = ["retrieve"]
folder_output = None
# -


import os
import pandas as pd


params = pd.read_csv(os.path.join(folder_output, "params.txt"), sep="\t", encoding="utf-8")

from thefuzz import process

prms = params["field_clean"].unique()
tmp = []
for prm in prms:
    for match, score in process.extract(prm, prms, limit=5):
        tmp.append([prm, match, score])

df = pd.DataFrame(tmp, columns=["field_clean", "match", "score"])
df.to_csv(os.path.join(folder_output, "terms", "matched_params.txt"), sep="\t", index=False)
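
One caveat with the loop above: every prm matches itself with a score of 100, so the output table is dominated by trivial self-matches. A small variant of the same loop that keeps only cross-term candidates:

tmp = []
for prm in prms:
    for match, score in process.extract(prm, prms, limit=5):
        if match != prm:  # skip the trivial self-match at score 100
            tmp.append([prm, match, score])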
Example #6
# -*- coding:utf-8 -*-
"""
Reference: https://github.com/seatgeek/thefuzz
"""
__author__ = "aaron.qiu"

import os
from pprint import pprint
from thefuzz import fuzz
from thefuzz import process

if __name__ == '__main__':
    pprint(fuzz.ratio("this is a test", "this is a test!"))
    pprint(fuzz.partial_ratio("this is a test", "this is a test!"))
    pprint(fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear"))
    pprint(
        fuzz.token_sort_ratio("fuzzy wuzzy was a bear",
                              "wuzzy fuzzy was a bear"))
    pprint(fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))
    pprint(fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear"))
    choices = [
        "Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"
    ]
    pprint(process.extract("new york jets", choices, limit=2))
    pprint(process.extractOne("cowboys", choices))
    # a directory listing gives a list of candidate strings; matching against
    # the bare path string would compare character by character
    songs = os.listdir("/data/soft")
    pprint(process.extractOne("System of a down - Hypnotize - apache", songs))
    pprint(
        process.extractOne("System of a down - Hypnotize - Heroin",
                           songs,
                           scorer=fuzz.token_sort_ratio))
Example #7
def map_list_to_catalog(data, catalog, output_format="dataframe", 
                        thr_accept=90, thr_reject=75, reject_value=None,
                        max_options=3, warnings=False, simplify=True):
    """
    Create an equivalence dictionary between data and a catalog.

    Take a 'data' list of dirty strings, map each entry to a 'catalog', and return the equivalence dictionary with that mapping. Candidate options and their similarity scores come from process.extract().
    
    Parameters
    ----------
    data: list of str
        List of strings with the data to be mapped to the catalog
    catalog: list of str
        Catalog
    output_format: "dataframe" or "dictionary"
        If "dataframe", returns a pandas DataFrame where the first column is the data and the following columns are the options and their scores
        If "dictionary", returns a dictionary { data_item : catalog_item }
    thr_accept: int, default 90
        Threshold for acceptance. If the top score is at least this value, the top option is assigned and no other options are shown
    thr_reject: int, default 75
        Threshold for rejection. If the top score is below this value, no equivalence to the catalog is created and the entry is replaced by reject_value
    reject_value: str, default None
        Value returned in "dictionary" output when the top score is below thr_reject
    max_options: int, default 3
        Number of options to display
    warnings: bool
        If True raises warnings for values between thr_accept and thr_reject
    simplify: bool
        Simplify every string before comparison to improve score


    Returns
    -------
    dataframe 
        pandas DataFrame where the first column is the data and the following columns are the options and their scores
    dictionary
        Dictionary that allows mapping options to the catalog

    Examples
    --------
    >>> catalog = ["Mouse", "Cat", "Dog", "Human"]
    >>> data = ["mice",  "CAT ", "doggo", "PERSON", 999]
    >>> ww.map_list_to_catalog(data, catalog, thr_accept=95, thr_reject=40)
        Data    Option1     Score1  Option2     Score2  Option3     Score3
    0   CAT     Cat     100     None    NaN     None    NaN
    1   doggo   Dog     90  Mouse   20.0    Human   0.0
    2   mice    Mouse   44  Cat     29.0    Human   22.0
    3   PERSON  PERSON  0   None    NaN     None    NaN
    4   999     999     0   None    NaN     None    NaN

    >>> ww.map_list_to_catalog(data, catalog, output_format="dictionary", reject_value='Other')
    {'mice':'Other', 999:999, 'doggo':'Dog', 'PERSON':'Other', 'CAT ':'Cat'}

    """

    
    if thr_reject >= thr_accept:
        raise ValueError("Invalid threshold values")
    if output_format == "dictionary":
        res = {}
    elif output_format == "dataframe":
        res = []
    else:
        raise NameError("output_format should be 'dictionary' or 'dataframe'")

    # Deduplicate; when simplifying, keep a map from simplified to original catalog values
    data_ = list(set(data))
    catalog_ = list(set(catalog))
    if simplify:
        catalog_ = {simplify_string(s): s for s in catalog_}

    # iterate over unique data entries
    for item in data_:
        # non-string items cannot be matched against the catalog
        if not isinstance(item, str):
            options = [(item, 0)]
        else:
            if simplify:
                options = process.extract(simplify_string(item), catalog_.keys(), limit=max_options)
                options = [(catalog_[o[0]], o[1]) for o in options]
            else:
                options = process.extract(item, catalog_, limit=max_options)
            top_score = options[0][1]
            if top_score >= thr_accept:
                options = [options[0]]
            elif top_score < thr_reject:
                if warnings:
                    warn_review("REJECT", item, options)
                if reject_value is None:
                    options = [(item, 0)]
                else:
                    options = [(reject_value, 0)]
            elif warnings:
                warn_review("WOBBLY", item, options)
        logging.info("%s %s", item, options)
        if output_format == "dataframe":
            res.append([item] + unnest(options))
        if output_format == "dictionary":
            res[item] = options[0][0]

    # convert to dataframe
    if output_format == "dataframe":
        res = DataFrame(res)
        # create column names: Data, Option1, Score1, Option2, Score2, ...
        n_cols = int((len(res.columns) + 1) / 2)
        cols = [str(i) for i in range(1, n_cols)]
        cols = [["Option" + i, "Score" + i] for i in cols]
        res.columns = ['Data'] + unnest(cols)
        # sort by top score, then by data value
        res = res.sort_values(["Score1", "Data"], ascending=[False, False]) \
                 .reset_index(drop=True)
    return res
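
Both this function and cluster_strings below call simplify_string and unnest, which these snippets never define. Purely as an assumption about their intent, minimal stand-ins might look like this:

import unicodedata

def simplify_string(s):
    # hypothetical helper (an assumption, not the module's real code):
    # strip accents, lowercase, and collapse whitespace before comparison
    s = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode()
    return " ".join(s.lower().split())

def unnest(nested):
    # hypothetical helper: flatten one level of nesting,
    # e.g. [("Dog", 90), ("Cat", 20)] -> ["Dog", 90, "Cat", 20]
    return [x for inner in nested for x in inner]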
Example #8
def cluster_strings(data, thr_accept=90, max_options=None, simplify=True, randomize=True):
    """
    Cluster similar strings.

    Take a 'data' list of strings and create clusters of those whose fuzzy similarity score is at least thr_accept. This is a rough algorithm: once a cluster has been formed, none of its elements can belong to another cluster.
    
    Parameters
    ----------
    data: list of str
        List of strings to be clustered
    thr_accept: int, default 90
        Threshold for acceptance. Strings scoring at least this value against a cluster's seed string join that cluster
    max_options: int, default None
        Number of elements of each cluster to return. If None, all elements are returned; otherwise at most max_options
    simplify: bool
        Simplify every string before comparison to improve scores
    randomize: bool
        Shuffle the data first; the order of the list affects cluster formation


    Returns
    -------
    list of lists
        List of lists where each inner list contains the strings that clustered together by fuzzy similarity.

    """
    if simplify:
        # map each simplified form to the original strings that produced it
        dic_simple = {}
        for old in set(data):
            if isinstance(old, str):
                new = simplify_string(old)
                if new in dic_simple:
                    dic_simple[new].append(old)
                else:
                    dic_simple[new] = [old]
        logging.info("Save string simplification {}".format(dic_simple))
        data_ = list(dic_simple.keys())
    else:
        data_ = list(set(data))

    # shuffle because list order affects cluster formation
    if randomize:
        shuffle(data_)
    
    # group if similar enough
    groups = []
    while data_:  # note: 'len(data_) > 1' here would silently drop the last string
        target = data_.pop()
        if isinstance(target, str):
            group = process.extract(target, data_)
            group = [t for t, s in group if s >= thr_accept]
            for s in group:  # don't match twice
                data_.remove(s)
        else:
            group = []
        groups.append([target] + group)
    logging.info("Group by fuzz distance {}".format(groups))
    
    if simplify:
        groups = [[dic_simple[g] for g in group] for group in groups]
        groups = [unnest(group) for group in groups]
        logging.info("Return to original string {}".format(groups))
    
    if max_options is not None:
        groups = [group[0:max_options] for group in groups]
        if max_options == 1:
            groups = [group[0] for group in groups]
        logging.info("Return only top N options {}".format(groups))

    return groups
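
A hedged usage sketch: the exact clusters depend on thefuzz's default scorer, so the grouping in the comment is only indicative, and randomize=False keeps the run repeatable:

data = ["Cat", "CAT ", "cat!", "Dog", "doggo"]
clusters = cluster_strings(data, thr_accept=90, randomize=False)
# plausibly something like [['doggo', 'Dog'], ['cat!', 'CAT ', 'Cat']]
print(clusters)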