def execute(self, command):
    ReturnedJson = json.loads(
        self._nlp.annotate(command, properties=self._nlp_properties))['sentences'][0]
    dependencies = []
    for dependency in ReturnedJson['enhancedPlusPlusDependencies']:
        dependencies.append(
            (dependency['governorGloss'],
             dependency['dependentGloss'],
             dependency['dep']))
    verb_tuple = [x for x in dependencies if x[2] == u'dobj'][0]
    functions = FuzzySet(self.function_names)
    query = functions.get(str(verb_tuple[0]))
    if query:
        query = query[0]
        print(query)
        if query[0] < .5:
            print("learning")
            self._learn(command)
        else:
            dyn = self.functions.get(query[1])
            obj = verb_tuple[1]
            obj_adj = str(" ".join([
                x[1] for x in dependencies
                if x[0] == obj and not x[2] == u'det'
            ]) + " " + obj)
            objects = FuzzySet(world.objects)
            object_query = objects.get(obj_adj)
            if object_query:
                dyn(world.attributes.get(object_query[0][1]))
            else:
                print("no objects found!")
    else:
        self._learn(command)
def matchedIds(postId, threshold):
    keywords = Model.getKeywords()
    postKeywords = list(filter(lambda x: x["id"] == postId, keywords))[0]["keywords"]
    matches = []
    for keyword in keywords:
        fs = FuzzySet(keyword['keywords'])
        for pk in postKeywords:
            if postId != keyword["id"]:
                m = fs.get(pk)
                if m:
                    for score, val in m:
                        if score > threshold:
                            matches.append((keyword["id"], score, val))
    return matches
class LocalParallelFuzzyCnpjMatcher(BaseParallelFuzzyCnpjMatcher):

    def __init__(self, cpu_count="autodetect"):
        super(LocalParallelFuzzyCnpjMatcher, self).__init__()
        self.__job_server = pp.Server(ncpus=cpu_count)

    def match_cnpj(self, cnpj, debug=False):
        best_matches = []

        # temp variables
        start_time = time.time()

        jobs = [(cnpj_base_str,
                 self.__job_server.submit(fuzzy_cnpj_search,
                                          (cnpj_base_str, cnpj, debug,),
                                          (log,),
                                          ("from fuzzyset import FuzzySet", "time")))
                for cnpj_base_str in self.cnpj_bases]

        # Collect the per-file matches returned by each parallel job
        for cnpj_base_str, job in jobs:
            result = job()
            print "Results", cnpj_base_str, "is", result
            best_matches.extend(result)

        elapsed_time = time.time() - start_time
        log('Parallel processes took %d seconds to finish' % elapsed_time, debug)

        # Performing fuzzy string match on the best results of each cnpj base file
        self.fuzzy_matcher = FuzzySet(best_matches)
        return self.fuzzy_matcher.get(cnpj)[0]
class FuzzyBaseIndex(object):

    def __init__(self, field=None, similarity=None, base=None, **kw):
        super(FuzzyBaseIndex, self).__init__(**kw)
        self.fuzz = FuzzySet(rel_sim_cutoff=1., use_levenshtein=False)
        self.content = {}
        self.field = field
        self.similarity = similarity
        self.base = base

    def add(self, x, i):
        self.fuzz.add(x)
        if x not in self.content:
            self.content[x] = set()
        self.content[x].add(i)

    def finalize(self):
        pass

    def search(self, x, top=25, debug=True):
        results = self.fuzz.get(x)
        ret = []
        for r in results:
            for i in self.content[r[1]]:
                sim = self.similarity(x, r[1])
                ret.append((i, r[0], sim))
        ret = sorted(ret, key=lambda x: x[2], reverse=True)
        ret = ret[:top]
        return ret
def get_oov_vocabulary_map(vocabulary_words_weights, wordset):
    oov = wordset - set(vocabulary_words_weights.keys())
    vocabulary_words_set = FuzzySet(sorted(vocabulary_words_weights.keys()))
    mapping = {}
    for word in tqdm(oov):
        word_matches = vocabulary_words_set.get(word)
        if word_matches is None or len(word_matches) == 0:
            continue
        word_scores = {vocabulary_word: score * vocabulary_words_weights[vocabulary_word]
                       for score, vocabulary_word in word_matches}
        vocabulary_words_scored = sorted(word_scores.keys(),
                                         key=lambda vocabulary_word: -word_scores[vocabulary_word])
        mapping[word] = vocabulary_words_scored[0]
    return mapping
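# Hypothetical usage sketch for get_oov_vocabulary_map defined above: the weights
# and word set are toy values chosen only to illustrate the call shape.
example_weights = {'apple': 2.0, 'apples': 1.0, 'banana': 1.5}
oov_map = get_oov_vocabulary_map(example_weights, {'appel', 'banana', 'bananna'})
print(oov_map)  # e.g. maps each out-of-vocabulary word to its closest weighted vocabulary word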
def fuzzy_cnpj_search(cnpj_base_str, cnpj, debug=False):
    best_matches = []
    with open(cnpj_base_str) as f:
        # temp variables
        start_time = time.time()

        # Searching
        log('Searching for %s on %s' % (cnpj, cnpj_base_str), debug)
        fuzzy_matcher = FuzzySet(f.read().splitlines())
        match = fuzzy_matcher.get(cnpj)
        elapsed_time = time.time() - start_time
        log('Best match for this file is %s and it took %d seconds' % (match, elapsed_time), debug)

        # Appending to the best matches so far
        if match is not None:
            for m in match:
                best_matches.append(m[1])
    return best_matches
class TestFuzzyMatcher(unittest.TestCase):

    def setUp(self):
        with open('../bulk/cnpjs.txt') as f:
            self.fuzzy_set = f.read().splitlines()
            self.fuzzy_matcher = FuzzySet(self.fuzzy_set)

    def test_validate(self):
        self.assertEqual(self.fuzzy_matcher.get('06389497000195')[0][1], '04389697000195')
        self.assertEqual(self.fuzzy_matcher.get('15574828000190')[0][1], '15575829000190')
        self.assertEqual(self.fuzzy_matcher.get('15911974000144')[0][1], '15922975000144')
        self.assertEqual(self.fuzzy_matcher.get('12919223000129')[0][1], '12291923000129')
        self.assertEqual(self.fuzzy_matcher.get('557135900011')[0][1], '55713579000121')
        self.assertEqual(self.fuzzy_matcher.get('40194766000116')[0][1], '49794166000116')  # '49794166000116'
        print self.fuzzy_matcher.get('40194766000116')[0][1]
def get_nutrition_data(image_class):
    url = ("https://api.nal.usda.gov/ndb/search/?format=json&q=" + image_class +
           "&sort=n&max=25&offset=0&api_key=FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu")
    r = requests.get(url).json()
    max_dist_ratio = 0
    ndbno = 0
    for item in r["list"]["item"]:
        fs = FuzzySet()
        fs.add(image_class)
        ratio = fs.get(item["name"])[0][0]
        if ratio > max_dist_ratio:
            max_dist_ratio = ratio
            ndbno = item["ndbno"]
    print(ndbno)
    nutrition_url = ("https://api.nal.usda.gov/ndb/V2/reports?ndbno=" + ndbno +
                     "&type=f&format=json&api_key=FLKBoKOh7C1apAA4bPL0jH4GAW6f2wS9Lw0a2iFu")
    nutrition_data = requests.get(nutrition_url).json()
    nutrition_facts = {}
    nutrients = nutrition_data["foods"][0]["food"]["nutrients"]
    nutrition_facts["serve_size"] = str(nutrients[0]["measures"][0]["qty"]) + " ounces"
    nutrition_facts["kcal"] = str(nutrients[0]["measures"][0]["value"]) + " calories"
    nutrition_facts["fat"] = str(nutrients[2]["measures"][0]["value"]) + " grams"
    nutrition_facts["carbs"] = str(nutrients[3]["measures"][0]["value"]) + " grams"
    nutrition_facts["protein"] = str(nutrients[1]["measures"][0]["value"]) + " grams"
    nutrition_facts["sugar"] = str(nutrients[4]["measures"][0]["value"]) + " grams"
    nutrition_facts["sodium"] = str(nutrients[5]["measures"][0]["value"]) + " milligrams"
    return nutrition_facts
def _learn(self, command):
    functions = FuzzySet(self.function_names)
    rospy.logerr("No command found! Please input commands")
    self.function_names.append(command)
    self.functions[command] = []
    commands = raw_input()
    sentences = json.loads(
        self._nlp.annotate(commands, properties=self._nlp_properties))['sentences']
    for sentence in sentences:
        dependencies = []
        for dependency in sentence['enhancedPlusPlusDependencies']:
            dependencies.append(
                (dependency['governorGloss'],
                 dependency['dependentGloss'],
                 dependency['dep']))
        verb_tuple = [x for x in dependencies if x[2] == u'dobj'][0]
        query = functions.get(str(verb_tuple[0]))
        if query:
            query = query[0]
            func = self.functions.get(query[1])
            self.functions[command].extend(func)
    rospy.logerr(self.functions)
class WordFixer:

    def __init__(self, word2vec: Word2VecKeyedVectors):
        self.__word2vec = word2vec
        self.__fixed_word_dict: Dict[str, str] = dict()
        self.__approximate_matcher = FuzzySet(word2vec.vocab)

    def is_word_correct(self, word: str):
        if word in self.__word2vec:
            return True
        return False

    def fix(self, word: str):
        if word in self.__fixed_word_dict:
            return self.__fixed_word_dict[word]
        candidate = self.__approximate_matcher.get(word)
        if candidate is not None and len(candidate) > 0:
            fixed_word = candidate[0][1]
            self.__fixed_word_dict[word] = fixed_word
            return fixed_word
        raise Exception("Cannot be fixed")
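# Hypothetical usage sketch for WordFixer above: word2vec_model is assumed to be a
# preloaded gensim KeyedVectors instance; the misspelled word is illustrative only.
fixer = WordFixer(word2vec_model)
if not fixer.is_word_correct("helo"):
    corrected = fixer.fix("helo")  # returns the closest in-vocabulary word via FuzzySet
    print(corrected)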
class SequentialFuzzyCnpjMatcher:
    """
    Class that performs fuzzy string matching on CNPJs sequentially.
    For small fuzzy sets this class is the easiest way to get started. However, if you are
    going for a large fuzzy set, we strongly recommend using LocalParallelFuzzyCnpjMatcher instead.
    """

    def __init__(self):
        """
        Default constructor
        :return: a SequentialFuzzyCnpjMatcher instance
        """
        self.__cnpj_bases = []
        for x in xrange(0, 100):
            idx = x * 1000000
            self.__cnpj_bases.append('../bulk/cnpjs_base_' + str(idx).zfill(7) + '.txt')
        self.__fuzzy_matcher = None

    def match_cnpj(self, cnpj, debug=False):
        """
        Search the closest valid CNPJ given an invalid one
        :param cnpj: an invalid CNPJ
        :param debug: whether you want to see debugging logs or not
        :return: a list of the most similar valid CNPJs to the one you've provided
        """
        best_matches = []
        for cnpj_base_str in self.__cnpj_bases:
            with open(cnpj_base_str) as f:
                # temp variables
                start_time = time.time()

                # Searching
                self.__log('Searching for %s on %s' % (cnpj, cnpj_base_str), debug)
                self.__fuzzy_matcher = FuzzySet(f.read().splitlines())
                match = self.__fuzzy_matcher.get(cnpj)
                elapsed_time = time.time() - start_time
                self.__log('Best match for this file is %s and it took %d seconds' % (match, elapsed_time), debug)

                # Appending to the best matches so far
                if match is not None:
                    for m in match:
                        best_matches.append(m[1])

        # Performing fuzzy string match on the best results of each cnpj base file
        self.__fuzzy_matcher = FuzzySet(best_matches)
        return self.__fuzzy_matcher.get(cnpj)[0]

    def __log(self, msg, debug=False):
        """
        Prints a message to console depending on the debug flag
        :param msg: a message string
        :param debug: a boolean value
        :return:
        """
        if debug:
            print msg
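# Hypothetical usage sketch (Python 2, matching the class above): the CNPJ value is
# illustrative and the bulk files under ../bulk/ are assumed to exist.
matcher = SequentialFuzzyCnpjMatcher()
print matcher.match_cnpj('06389497000195', debug=True)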
class TFIDFmatcher:

    def __init__(self, choices_corpus, ngram_range=(1, 2), use_cleaner=True, preprocess_func=None):
        """
        :param choices_corpus: should be a list of texts
        :param preprocess_func: is a str->str function
        """
        self.ngram_range = ngram_range
        self.use_cleaner = use_cleaner
        self.preprocess_func = preprocess_func
        self.initial_choices_corpus = choices_corpus
        if self.use_cleaner:
            choices_corpus = self.cleaner(choices_corpus)
        if self.preprocess_func:
            choices_corpus = [self.preprocess_func(k) for k in choices_corpus]
        self.tfidf = TfidfVectorizer(analyzer='word',
                                     sublinear_tf=True,
                                     # strip_accents='ascii',
                                     lowercase=True,
                                     ngram_range=self.ngram_range,
                                     min_df=0).fit(choices_corpus)
        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k in range(len(choices_corpus)):
            self.initial_corpus_tf_idf_dict[choices_corpus[k]] = self.initial_corpus_tf_idf[k]
        self.vocabulary = self.tfidf.vocabulary_.keys()
        self.fset_vocabulary = FuzzySet()
        for brnd in self.vocabulary:
            self.fset_vocabulary.add(brnd)

    def cleaner(self, x, verbose=False):
        if verbose:
            print("Before cleaning", type(x), x)

        def cleaning_function(x):
            return clean_string(x).lower()

        if type(x) == list:
            x = [cleaning_function(el) for el in x]
        if type(x) in [str]:
            x = cleaning_function(x)
        if verbose:
            print("After cleaning", type(x), x)
        return x

    def extract(self, query, choices=None, limit=5, verbose=False):
        """
        :param choices: should be a list of texts
        :param query: TODO add an input type checker
        :param processor: TODO : add a cleaning process
        :param scorer: TODO : Add other distances
        :return:
        """
        # print("---------------------------\n"
        # Get rid of this case
        if choices == []:
            return []
        if choices:
            choices = list(set(choices))
            # Clean the choices corpus
            initial_choices = choices
            if self.use_cleaner:
                choices = self.cleaner(choices)
            if self.preprocess_func:
                choices = [self.preprocess_func(elk) for elk in choices]
            choices_corpus = choices
            corpus_tf_idf = self.tfidf.transform(choices_corpus)
        else:
            initial_choices = self.initial_choices_corpus
            choices_corpus = self.initial_choices_corpus
            corpus_tf_idf = self.initial_corpus_tf_idf
            # print("Defaulting"

        if self.use_cleaner:
            query = self.cleaner(query)
        if self.preprocess_func:
            query = self.preprocess_func(query)

        # building fuzzy query
        new_query = []
        # print("Vocabulary", vocabulary)
        for q in query.split():
            if q in self.vocabulary:
                new_query.append(q)
            else:
                fset_get = self.fset_vocabulary.get(q)
                if fset_get:
                    tmp_score, new_q = fset_get[0]
                    if verbose:
                        print("Modified", q, new_q, tmp_score)
                    if tmp_score >= 0.80:
                        new_query.append(new_q)
        query = " ".join(new_query)
        if verbose:
            print("NEW QUERY", query)

        x = self.tfidf.transform([query])
        cosine_similarities = linear_kernel(x, corpus_tf_idf).flatten()
        related_docs_indices = cosine_similarities.argsort().flatten()
        if choices:
            result = [(choices_corpus[k], cosine_similarities[k].flatten()[0])
                      for k in related_docs_indices if choices_corpus[k]]
        else:
            result = [(choices_corpus[k], cosine_similarities[k].flatten()[0])
                      for k in related_docs_indices]
        result.sort(key=lambda tup: tup[1], reverse=True)  # sorts in place
        # print("Query", query, "\nChoices", choices, "\nResult", result
        result = [(initial_choices[choices_corpus.index(k[0])], k[1]) for k in result]
        # print("Query", query, "\nChoices", choices, "\nResult", result
        if limit:
            return result[0:limit]
        return result

    def export_vocabulary(self, vocabulary_csv_destination, choices_corpus=None):
        if not choices_corpus:
            choices_corpus = self.initial_choices_corpus
        if self.use_cleaner:
            choices_corpus = [clean_string(x).lower() for x in choices_corpus]
        cnt_vec = CountVectorizer(ngram_range=self.ngram_range)
        transformed_data = cnt_vec.fit_transform(choices_corpus)
        l = [{'word': k, 'freq': v}
             for k, v in zip(cnt_vec.get_feature_names(), np.ravel(transformed_data.sum(axis=0)))]
        df = pd.DataFrame(l)
        df = df[['word', 'freq']]
        df.sort_values('freq', ascending=False, inplace=True)
        df.to_csv(vocabulary_csv_destination, encoding='utf-8', index=False, sep=";",
                  doublequote=True, quoting=csv.QUOTE_ALL)
        print('The vocabulary was exported at : ', vocabulary_csv_destination)
    'Eswatini': 'Swaziland',
    'Timor-Leste': 'East Timor',
    'Taiwan*': 'Taiwan',
    'Tanzania': 'United Republic of Tanzania',
    'US': 'United States of America',
    'West Bank and Gaza': 'West Bank'
}
c19.rename(columns=c19partm, inplace=True)
c19.drop(columns=c19notfound, inplace=True)

# if nothing is printed by this loop,
# then every country in c19 matches a country in geo data
for c in c19.columns.tolist():
    if c not in geoctrs:
        print(c, fzs.get(c))


# In[9]:


# let's fix country names in population data
fzs = FuzzySet()
for c in popsctrs:
    fzs.add(c)

popnotfound = ['Kosovo', 'West Bank']
poppartm = {
    'Bahamas': 'The Bahamas',
    'Brunei ': 'Brunei',
    'DR Congo': 'Democratic Republic of the Congo',
class MiniBaseIndex(object):

    def __init__(self, field=None, tokenizer=None, similarity=None, base=None, idf_limit=0.05, **kw):
        super(MiniBaseIndex, self).__init__(**kw)
        self.content = {}
        self.field = field
        self.tokenizer = tokenizer
        self.similarity = similarity
        self.base = base
        self.counts = {}
        self.fuzzwords = FuzzySet(rel_sim_cutoff=0.7, use_levenshtein=False)
        self.blacklist = set()
        self.idf_limit = idf_limit

    def add(self, tok, i):
        if tok not in self.content:
            if tok not in self.blacklist:
                self.content[tok] = set()
                self.counts[tok] = 0
        self.content[tok].add(i)
        self.counts[tok] += 1
        # if self.counts[tok]/len(self.base.entries) > self.idf_limit:
        #     self.blacklist.add(tok)
        #     del self.counts[tok]
        #     del self.content[tok]
        self.fuzzwords.add(tok)

    def finalize(self):
        for tok in self.content:
            pass
            # self.fuzzwords.add(tok)

    def search(self, x, expl=5000, top=25, maxtok=250, debug=False):
        tokenizer = self.tokenizer
        xtoks = tokenizer(x)
        # maxtok = maxtok * len(xtoks)
        results = {}

        # collect all toks
        alltoks = []
        alltoks_set = set()
        for xtok in xtoks:
            for xtok_fuzz_score, xtok_fuzz_tok \
                    in self.fuzzwords.get(xtok):
                xtok_fuzz_sim = self.similarity(xtok, xtok_fuzz_tok)
                if xtok_fuzz_tok not in alltoks_set:
                    alltoks.append(
                        (xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim))
                    alltoks_set.add(xtok_fuzz_tok)
        # alltoks = list(alltoks)

        # sort together by fuzziness
        alltoks = sorted(alltoks,
                         key=lambda x: x[2] * 100 + 1 / self.counts[x[1]],
                         reverse=True)

        # take maxtok only
        if debug:
            print(len(alltoks), maxtok)
            for tok in alltoks:
                print(tok, self.counts[tok[1]])
        alltoks = alltoks[:maxtok]

        # sort by inverse frequency
        # alltoks = sorted(alltoks, key=lambda x: self.counts[x[1]])
        # alltoksset = set(alltoks)

        for xtok_fuzz_score, xtok_fuzz_tok, xtok_fuzz_sim in alltoks:
            for _id in self.content[xtok_fuzz_tok]:
                if _id not in results:
                    results[_id] = 0
                results[_id] += xtok_fuzz_score
                if len(results) > expl:
                    break
            if len(results) > expl:
                break

        if debug:
            print(len(results))

        results = [(res[0], res[1],
                    self.similarity(x, self.base.entries[res[0]][self.field]))
                   for res in results.items()]

        def sortkey(x):
            entid = x[0]
            pop = self.base.entries[entid]["pop"]
            sim = x[2]
            return sim * 1e2 + pop * 1e-3

        results = sorted(results, key=sortkey, reverse=True)
        results = results[:top]
        return results
class BrandMatcher:

    def __init__(self, ngram_range=(1, 3)):
        """
        :param choices_corpus: should be a list of texts
        :param preprocess_func: is a str->str function
        """
        self.ngram_range = ngram_range
        choices_corpus = [str(x) for x in list(brands['brnd'].dropna().unique())]
        l = brands[['brnd', 'equivalents']].dropna().to_dict('records')
        self.equivalents = {}
        for el in l:
            for eq in el['equivalents'].split(';'):
                self.equivalents[eq.strip()] = el['brnd']
        choices_corpus.extend(self.equivalents.keys())
        self.initial_choices_corpus = choices_corpus
        self.cleaned_choices_corpus = self.cleaner(choices_corpus)
        self.tfidf = TfidfVectorizer(analyzer='word',
                                     sublinear_tf=True,
                                     # strip_accents='ascii',
                                     lowercase=True,
                                     ngram_range=self.ngram_range,
                                     min_df=0).fit(self.cleaned_choices_corpus)
        self.initial_corpus_tf_idf = self.tfidf.transform(choices_corpus)
        self.initial_corpus_tf_idf_dict = {}
        for k in range(len(choices_corpus)):
            self.initial_corpus_tf_idf_dict[choices_corpus[k]] = self.initial_corpus_tf_idf[k]

        # Creating fuzzy set
        self.fset_brands = FuzzySet()
        for token in [str(x) for x in list(brands['brnd'].dropna().unique())]:
            self.fset_brands.add(token)
        self.fset_tokens = FuzzySet()
        for token in list(self.tfidf.vocabulary_):
            self.fset_tokens.add(token)

        # Prepare the japanese matching
        jp_brands = brands[['brnd', 'brnd_jp_clean']]
        jp_brands = jp_brands[jp_brands.brnd_jp_clean.notnull()]
        jp_brands['brnd_jp_clean'] = jp_brands['brnd_jp_clean'].apply(
            lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '')))
        jp_brands['brnd_jp_size'] = jp_brands['brnd_jp_clean'].apply(lambda x: len(x))
        jp_brands.sort_values(['brnd_jp_size', 'brnd'], ascending=[False, False], inplace=True)
        self.jp_brands = jp_brands
        # jp_brands.to_excel('/tmp/jp_brands.xlsx')

    def cleaner(self, x, verbose=False):
        if verbose:
            print("Before cleaning", type(x), x)

        def cleaning_function(x):
            return clean_string(x).lower()

        if type(x) == list:
            x = [cleaning_function(str(el)) for el in x]
        if type(x) in [str]:
            x = cleaning_function(x)
        if verbose:
            print("After cleaning", type(x), x)
        return x

    def extract(self, query, verbose=False):
        """
        :param choices should be a list of texts
        :param query: TODO add an input type checker
        :param processor: TODO : add a cleaning process
        :param scorer: TODO : Add other distances
        :return:
        """
        initial_choices = self.initial_choices_corpus
        choices_corpus = self.initial_choices_corpus
        corpus_tf_idf = self.initial_corpus_tf_idf
        query = self.cleaner(query)

        # building fuzzy query
        new_query = []
        for q in query.split():
            if verbose:
                print(q)
            fset_get = self.fset_tokens.get(q)
            if fset_get:
                tmp_score, new_q = fset_get[0]
                if verbose:
                    print("Modified", q, new_q, tmp_score)
                if tmp_score > 0.80:
                    new_query.append(new_q)
        query = " ".join(new_query)
        if verbose:
            print("NEW QUERY", query)

        x = self.tfidf.transform([query])
        cosine_similarities = linear_kernel(x, corpus_tf_idf).flatten()
        related_docs_indices = cosine_similarities.argsort().flatten()
        result = [(choices_corpus[k], cosine_similarities[k].flatten()[0]) for k in related_docs_indices]
        result = [(initial_choices[choices_corpus.index(k[0])], k[1]) for k in result]
        # correcting with fuzzyratio score between result and query
        # result = [(k[0], k[1] * 0.01 * 0.5 * (fuzz.token_set_ratio(k[0], query) + fuzz.ratio(k[0], query))) for k in result]
        # result = [(k[0], k[1]) for k in result]
        result.sort(key=lambda tup: tup[1], reverse=True)  # sorts in place
        if verbose:
            print("Query", query, "\nResult", result)
        max_score = max(result, key=itemgetter(1))[1]
        result = [k for k in result if k[1] == max_score]
        return result

    def find_brand(self, pdct_name_on_eretailer, special_country=None, verbose=False):
        if not pdct_name_on_eretailer:
            return {'brand': None, 'score': 0}
        assert special_country in ['JP', None]
        if bool(pattern_japanese_chinese_caracters.search(pdct_name_on_eretailer)) or special_country == 'JP':
            clean_jp_str = lambda x: unicodedata.normalize('NFKC', x.replace('・', '').replace(' ', '').replace('・', ''))
            clean_jp_name = clean_jp_str(pdct_name_on_eretailer)

            # Forbidden words:
            japanese_forbidden_words = [" shoulder ", ' bag ', '【CD】', "【SHM-CD】", 'dvd', 'helmet',
                                        'rucksack', 'daypack', 'daiken', 'ダイケン', "スリープスパ",
                                        'リンゴビール', 'パターソン', 'ヘネシー澄子', ]
            clean_japanese_forbidden_words = [clean_jp_str(x).lower() for x in japanese_forbidden_words]
            # print(clean_jp_name, clean_japanese_forbidden_words)
            if any(x in clean_jp_name.lower() for x in clean_japanese_forbidden_words):
                return {'brand': None, 'score': 0}
            for br in self.jp_brands.to_dict(orient='records'):
                if br['brnd_jp_clean'] in clean_jp_name:
                    # print("clean_jp_name :", clean_jp_name, "candidate", br['brnd_jp_clean'])
                    return {'brand': br['brnd'], "score": 98.765}
            if "モエ " in pdct_name_on_eretailer and any(x in clean_jp_name for x in ["750", 'ml', 'cl']):
                return {'brand': "Moët & Chandon", "score": 98.765}

        # Ad-hoc rules
        if any([x in pdct_name_on_eretailer.lower() for x in ["moet ", "moët"]]) and 'dom p' in pdct_name_on_eretailer.lower():
            return {'brand': 'Dom Pérignon', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["moet ", "moët"]]):
            return {'brand': 'Moët & Chandon', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["clicquot"]]):
            return {'brand': 'Veuve Clicquot', 'score': 99}
        if any([x in pdct_name_on_eretailer.lower() for x in ["ruinart"]]):
            return {'brand': 'Ruinart', 'score': 99}

        # # Forbidden words:
        # forbidden_words = ['leinwand', "hamper ", ' hamper', ' poster', 'poster ', 'chocolates ', ' chocolates',
        #                    'truffle ', ' truffle', 'birthday cake', ' cake', 'candle', 'poplin', ' sheet ', ' bed ',
        #                    ' cover ', ' kimono', 'towel', 'dvd']
        # if any(x in pdct_name_on_eretailer.lower() for x in forbidden_words):
        #     return {'brand': None, 'score': 0}

        # Cleaning
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('–', ' ')
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('-', ' ')
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace('_', ' ')
        pdct_name_on_eretailer = ' '.join(w for w in pdct_name_on_eretailer.split() if w)
        pdct_name_on_eretailer = pdct_name_on_eretailer.replace("'", "").replace('é', 'e').replace('Â', '').replace('ë', 'e')
        # print(pdct_name_on_eretailer)

        candidates = self.extract(pdct_name_on_eretailer, verbose=verbose)
        if not candidates:
            return {'brand': None, 'score': 0}
        # print(candidates)
        # print("FIRST SCORE :", brand, score)

        # Post treatment
        clean_tokens = clean_string(pdct_name_on_eretailer).split()
        # s = FuzzySet()
        # s.add(candidate)
        # l = [deepcopy(s.get(ngram, candidate)) for ngram in ngrams]
        # l = [x[0][0] for x in l if type(x) == list]
        brand, score = candidates[0], 0
        for candidate in candidates:
            candidate_str = self.cleaner(candidate[0])
            candidate_str = " ".join(candidate_str.split()[:9])
            nb_token_candidate = len(candidate_str.split())
            ngrams = [" ".join(clean_tokens[start:start + length])
                      for start in range(len(clean_tokens))
                      for length in range(max(nb_token_candidate, min(4, len(clean_tokens) - start + 1)))]
            # print([("'" + ngram + "'", "'" + candidate + "'", fuzz.ratio(ngram, candidate)) for ngram in ngrams])
            l = [fuzz.ratio(ngram, candidate_str) for ngram in list(set(ngrams))]
            max_score = (max(l + [0]) * 0.01) ** 2
            if max_score > score:
                score = max_score
                brand = candidate[0]
        if brand in self.equivalents:
            brand = self.equivalents[brand]
        score = round(100 * score, 2)
        # print("SECOND SCORE :", brand, score)

        # Forbidden words
        if any([x in pdct_name_on_eretailer.lower() for x in ["poster", 'dvd']]):
            return {'brand': None, 'score': 0}
        if score >= 80:
            if brand in ['Mercier']:  # Add Krug ???
                if 'hampagne' in pdct_name_on_eretailer.lower():
                    return {'brand': brand, 'score': score}
            if brand in ["Krug"] and any([x.lower() in pdct_name_on_eretailer.lower() for x in ['butler']]):
                return {'brand': None, 'score': 0}
            elif brand == "Belvedere":
                if not any([x in pdct_name_on_eretailer.lower() for x in
                            ['zinfandel', 'chardonnay', 'sauvignon', 'pinot', 'merlot', 'syrah']]):
                    return {'brand': brand, 'score': score}
            else:
                return {'brand': brand, 'score': score}
        elif verbose:
            print("Score is too low for: ", pdct_name_on_eretailer, {'brand': brand, 'score': score})
        return {'brand': None, 'score': 0}
class MovedBlocksDetector(object):

    def __init__(self, removed_lines_dicts, added_lines_dicts):
        self.removed_lines = []
        self.trim_text_to_array_of_added_lines = defaultdict(list)
        self.added_file_name_to_line_no_to_line = defaultdict(dict)
        self.removed_file_name_to_line_no_to_line = defaultdict(dict)
        self.added_lines_fuzzy_set = FuzzySet()

        for added_line_dict in added_lines_dicts:
            line = Line.from_dict(added_line_dict)
            self.trim_text_to_array_of_added_lines[line.trim_text].append(line)
            self.added_lines_fuzzy_set.add(line.trim_text)
            self.added_file_name_to_line_no_to_line[line.file][line.line_no] = line

        for removed_line_dict in removed_lines_dicts:
            line = Line.from_dict(removed_line_dict)
            self.removed_lines.append(line)
            self.removed_file_name_to_line_no_to_line[line.file][line.line_no] = line

    @staticmethod
    def from_diff(diff_text):
        parsed = diff_to_added_and_removed_lines(diff_text)
        return MovedBlocksDetector(parsed['removed_lines'], parsed['added_lines'])

    @measure_fun_time()
    def filter_out_block_inside_other_blocks(self, filtered_blocks: List[MatchingBlock]):
        filtered_blocks.sort(key=lambda fb: fb.get_filter_sort_tuple_for_remove())
        last_matching_block = None
        for matching_block in filtered_blocks:
            if last_matching_block is None:
                last_matching_block = matching_block
                continue
            if matching_block.last_removed_line.file == last_matching_block.last_removed_line.file \
                    and matching_block.first_removed_line.line_no >= last_matching_block.first_removed_line.line_no \
                    and matching_block.last_removed_line.line_no <= last_matching_block.last_removed_line.line_no:
                if matching_block.weighted_lines_count < last_matching_block.weighted_lines_count \
                        and matching_block.removed_lines_numbers.issubset(last_matching_block.removed_lines_numbers):
                    matching_block.remove_part_is_inside_larger_block = True
            else:
                last_matching_block = matching_block

        filtered_blocks.sort(key=lambda fb: fb.get_filter_sort_tuple_for_add())
        ok_blocks = []
        last_matching_block = None
        for matching_block in filtered_blocks:
            if getattr(matching_block, "remove_part_is_inside_larger_block", False):
                # TODO getattr was used to act like in javascript - rewrite it without getattr
                continue
            if last_matching_block is None:
                last_matching_block = matching_block
                ok_blocks.append(matching_block)
                continue
            if matching_block.last_added_line.file == last_matching_block.last_added_line.file \
                    and matching_block.first_added_line.line_no >= last_matching_block.first_added_line.line_no \
                    and matching_block.last_added_line.line_no <= last_matching_block.last_added_line.line_no \
                    and matching_block.weighted_lines_count < last_matching_block.weighted_lines_count \
                    and not matching_block.added_lines_numbers.issubset(last_matching_block.added_lines_numbers):
                pass
            else:
                last_matching_block = matching_block
                ok_blocks.append(matching_block)
        return ok_blocks

    def _filter_out_small_blocks(self, matching_blocks, min_lines_count):
        return [block for block in matching_blocks
                if block.weighted_lines_count >= min_lines_count and block.char_count >= 20]

    def _clear_not_matching_lines_at_end_and_filter_out_empty_blocks(self, matching_blocks):
        filtered_blocks = []
        for matching_block in matching_blocks:
            block_without_empty_end = matching_block.clear_empty_lines_at_end()
            if block_without_empty_end is not None:
                filtered_blocks.append(matching_block)
        return filtered_blocks

    def merge_blocks(self, block1, block2):
        new_block = MatchingBlock()
        new_block.lines.extend(block1.lines)
        new_block.lines.extend(block2.lines)  # TODO what about lines between those 2 blocks?
        new_block.first_added_line = block1.first_added_line or block2.first_added_line
        new_block.first_removed_line = block1.first_removed_line or block2.first_removed_line
        new_block.last_added_line = block2.last_added_line or block1.last_added_line
        new_block.last_removed_line = block2.last_removed_line or block1.last_removed_line
        new_block.weighted_lines_count = block1.weighted_lines_count + block2.weighted_lines_count
        new_block.not_empty_lines = block1.not_empty_lines + block2.not_empty_lines
        new_block.char_count = block1.char_count + block2.char_count
        new_block.weighted_chars_count = block1.weighted_chars_count + block2.weighted_chars_count
        new_block.match_density = new_block.weighted_chars_count / new_block.char_count
        new_block.added_lines_numbers = block1.added_lines_numbers | block2.added_lines_numbers
        new_block.removed_lines_numbers = block1.removed_lines_numbers | block2.removed_lines_numbers
        return new_block

    @measure_fun_time()
    def join_nearby_blocks(self, matching_blocks: List[MatchingBlock], max_space_between=2):
        max_space_between += 1  # if we want to allow 2 lines between blocks, difference between line numbers is 3
        blocks_grouped_by_files: Dict[tuple, List[MatchingBlock]] = defaultdict(list)
        for block in matching_blocks:
            blocks_grouped_by_files[(block.file_removed, block.file_added)].append(block)

        blocks_after_merge: List[MatchingBlock] = []
        merged_blocks = 0
        for block_list in blocks_grouped_by_files.values():
            loops_made = 0
            block_list.sort(key=lambda block: (block.first_removed_line.line_no, -block.match_density))
            indexes_of_merged_blocks = set()
            merged_blocks_list = []
            for i in range(len(block_list)):
                block = block_list[i]
                for j in range(i + 1, len(block_list)):
                    loops_made += 1
                    next_block = block_list[j]
                    if next_block.first_removed_line.line_no - block.last_removed_line.line_no > max_space_between:
                        break
                    elif (next_block.first_removed_line.line_no > block.last_removed_line.line_no
                          and next_block.first_added_line.line_no - block.last_added_line.line_no <= max_space_between
                          and next_block.first_added_line.line_no > block.last_added_line.line_no):
                        block = self.merge_blocks(block, next_block)
                        merged_blocks += 1
                        indexes_of_merged_blocks.add(i)
                        indexes_of_merged_blocks.add(j)
                if i in indexes_of_merged_blocks:
                    merged_blocks_list.append(block)
            for i in range(len(block_list)):
                if i not in indexes_of_merged_blocks:
                    blocks_after_merge.append(block_list[i])
            blocks_after_merge.extend(merged_blocks_list)
        return blocks_after_merge

    @measure_fun_time()
    def filter_blocks(self, matching_blocks, min_lines_count=None):
        if min_lines_count is None:
            min_lines_count = 2
        filtered_blocks = self._filter_out_small_blocks(matching_blocks, min_lines_count)
        filtered_blocks = self._clear_not_matching_lines_at_end_and_filter_out_empty_blocks(filtered_blocks)
        return self.filter_out_block_inside_other_blocks(filtered_blocks)

    def extend_matching_blocks_with_empty_added_lines_if_possible(self, currently_matching_blocks):
        for matching_block in currently_matching_blocks:
            while True:
                last_line = matching_block.last_added_line
                next_added_line = self.added_file_name_to_line_no_to_line[last_line.file].get(last_line.line_no + 1)
                if next_added_line and next_added_line.trim_text == '':
                    matching_block.extend_with_empty_added_line(next_added_line)
                else:
                    break

    def extend_matching_blocks_with_empty_removed_lines_if_possible(self, currently_matching_blocks: List[MatchingBlock]):
        extended_blocks = []
        not_extended_blocks = []
        for matching_block in currently_matching_blocks:
            last_line = matching_block.last_removed_line
            next_removed_line = self.removed_file_name_to_line_no_to_line[last_line.file].get(last_line.line_no + 1)
            if next_removed_line and next_removed_line.trim_text == '':
                matching_block.extend_with_empty_removed_line(next_removed_line)
                extended_blocks.append(matching_block)
            else:
                not_extended_blocks.append(matching_block)
        return extended_blocks, not_extended_blocks

    @measure_fun_time()
    def detect_moved_blocks(self, min_lines_count=None) -> List[MatchingBlock]:
        detected_blocks: List[MatchingBlock] = []
        currently_matching_blocks = []
        new_matching_blocks = []

        for removed_line in self.removed_lines:
            if removed_line.trim_text:
                min_match_score = 0.5 if len(removed_line.trim_text) > 2 else 0.35
                fuzzy_matching_pairs = self.added_lines_fuzzy_set.get(
                    removed_line.trim_text, default=None, exact_match_only=False, min_match_score=min_match_score)
                # iterate over currently_matching_blocks and try to extend them with empty lines
                self.extend_matching_blocks_with_empty_added_lines_if_possible(currently_matching_blocks)
            else:
                fuzzy_matching_pairs = [[1, '']]

            if not fuzzy_matching_pairs:
                continue

            for fuzz_pair in fuzzy_matching_pairs:
                match_probability, text = fuzz_pair
                added_lines = self.trim_text_to_array_of_added_lines[text]
                for added_line in added_lines:
                    line_extended_any_block = False
                    already_added = set()
                    for i, matching_block in enumerate(currently_matching_blocks):
                        if i in already_added:
                            continue
                        extended = matching_block.try_extend_with_line(removed_line, added_line, match_probability)
                        if extended:
                            new_matching_blocks.append(matching_block)
                            line_extended_any_block = True
                            already_added.add(i)
                    if not line_extended_any_block and removed_line.trim_text != '':
                        new_matching_blocks.append(MatchingBlock.from_line(removed_line, added_line, match_probability))
                    currently_matching_blocks = [matching_block
                                                 for i, matching_block in enumerate(currently_matching_blocks)
                                                 if i not in already_added]

            if removed_line.trim_text == '':
                extended_blocks, not_extended_blocks = \
                    self.extend_matching_blocks_with_empty_removed_lines_if_possible(currently_matching_blocks)
                new_matching_blocks.extend(extended_blocks)
                currently_matching_blocks = not_extended_blocks

            for matching_block in currently_matching_blocks:
                detected_blocks.append(matching_block)
            currently_matching_blocks = new_matching_blocks
            new_matching_blocks = []

        for matching_block in currently_matching_blocks:
            detected_blocks.append(matching_block)

        detected_blocks = self.join_nearby_blocks(detected_blocks)
        filtered_blocks = self.filter_blocks(detected_blocks, min_lines_count)
        logger.info(f'Detected {len(filtered_blocks)} blocks ({len(detected_blocks) - len(filtered_blocks)} filtered)')
        return filtered_blocks
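# Hypothetical usage sketch for MovedBlocksDetector above: diff_text is assumed to
# hold a unified diff string; from_diff() and detect_moved_blocks() are defined in the class.
detector = MovedBlocksDetector.from_diff(diff_text)
for block in detector.detect_moved_blocks(min_lines_count=3):
    print(block)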