def fuzzyMatching(control, full_names):
    """Average the fuzzy similarity of each known full name to `control`.

    :param control: Control full-name string against which the other full
        names are compared.
    :param full_names: Dict mapping platform -> full name for one user.
    :return: Average WRatio similarity (0-100 scale, values below the
        60 cutoff count as 0). Returns 0 if any username does not exist,
        or if no comparable names remain (previously this raised
        ZeroDivisionError on an empty score list).
    """
    scores = []
    for name in full_names.values():
        # A missing account invalidates the whole comparison.
        if name == username_does_not_exist:
            return 0
        # Skip platforms where no username was provided at all.
        if name != no_username_provided:
            scores.append(fuzz.WRatio(control, name, score_cutoff=60))
    if not scores:
        return 0  # nothing to average — avoid division by zero
    return sum(scores) / len(scores)
def keyword_in_search(search_item, keywords=()):
    """Return True when any keyword appears in the search item's link,
    title or snippet — first by case-insensitive substring, then by a
    fuzzy WRatio match (cutoff 90)."""
    fields = (
        search_item['link'],
        search_item['title'],
        search_item['snippet'],
    )
    for text in fields:
        lowered = text.lower()
        # exact (case-insensitive substring) search
        exact_hit = any(kw.lower() in lowered for kw in keywords)
        if exact_hit:
            return True
        # fuzzy search: WRatio returns 0 (falsy) below the cutoff
        fuzzy_hit = any(
            fuzz.WRatio(kw, text, score_cutoff=90) for kw in keywords
        )
        if fuzzy_hit:
            return True
    return False
def keyword_in_search(search_item, keywords=(), must_contain_all=False):
    """Return True when keyword(s) match the item's link, title or snippet.

    With must_contain_all=True every keyword must match a given field;
    otherwise a single match suffices. Each field is tried with an exact
    case-insensitive substring test first, then a fuzzy WRatio test
    (cutoff 90).
    """
    combine = all if must_contain_all else any
    for text in (
        search_item['link'],
        search_item['title'],
        search_item['snippet'],
    ):
        lowered = text.lower()
        # exact (case-insensitive substring) search
        if combine(kw.lower() in lowered for kw in keywords):
            return True
        # fuzzy search: scores below the cutoff come back as 0 (falsy)
        if combine(
            fuzz.WRatio(kw, text, score_cutoff=90) for kw in keywords
        ):
            return True
    return False
def testWRatioUnicodeString(self):
    """A lone accented character shares nothing with 'ABCD' -> score 0."""
    accented = "Á"
    plain = "ABCD"
    self.assertEqual(fuzz.WRatio(accented, plain), 0)
def testQRatioUnicode(self):
    """Unicode fixtures s1/s1a are equivalent -> full score of 100."""
    score = fuzz.WRatio(self.s1, self.s1a)
    self.assertEqual(score, 100)
def testWRatioMisorderedMatch(self):
    """Full matches whose tokens are misordered are scaled by .95 -> 95."""
    score = fuzz.WRatio(self.s4, self.s5)
    self.assertEqual(score, 95)
def testWRatioPartialMatch(self):
    """A partial match is scaled by .9 -> expected score of 90."""
    score = fuzz.WRatio(self.s1, self.s3)
    self.assertEqual(score, 90)
def testWRatioCaseInsensitive(self):
    """Fixtures differing only in case still score a full 100."""
    score = fuzz.WRatio(self.s1, self.s2)
    self.assertEqual(score, 100)
def testWRatioEqual(self):
    """Equivalent fixtures s1/s1a score a full 100."""
    score = fuzz.WRatio(self.s1, self.s1a)
    self.assertEqual(score, 100)
def search(query_str, n=3, fuzzy_weight=default_fuzzy_weight,
           text_weight=default_text_weight,
           image_weight=default_image_weight):
    """Return the indices of the top-`n` items matching `query_str`.

    Scores each candidate by a weighted blend of embedding similarity
    (text-only in BERT mode; text + image otherwise) and, when
    `fuzzy_weight > 0`, a fuzzy string match of the query against the
    candidate names.

    :param query_str: Free-text query.
    :param n: Number of result indices to return.
    :param fuzzy_weight: Weight for the fuzzy-name score (skipped if <= 0).
    :param text_weight: Weight for the text-embedding score.
    :param image_weight: Weight for the image-embedding score (non-BERT mode).
    :return: List of up to `n` candidate indices, best first.
    """
    if use_bert:
        query_embedding = model.encode(query_str)
    else:
        query_embedding = embed_text(query_str)

    if use_bert:
        scores, indices = query(query_embedding, normalized_sentence_embeddings, n)
        results1 = Counter({i: text_weight * scores[i] for i in indices})
        if fuzzy_weight <= 0:
            return [key for key, value in results1.most_common(n)]

        fuzzy_hits = process.extract(query_str, names, scorer=fuzz.WRatio, limit=n)
        # process.extract yields (match, score, index) triples.
        results2 = Counter(
            {x[2]: (fuzzy_weight * x[1] / 100) for x in fuzzy_hits})
        # Make both counters cover the union of keys so that `+` blends
        # text and fuzzy scores instead of dropping a missing component.
        for key, value in list(results1.most_common()):
            results2[key] = fuzzy_weight * fuzz.WRatio(
                query_str, names[key]) / 100
        for key, value in list(results2.most_common()):
            results1[key] = text_weight * scores[key]
        # NOTE(review): Counter addition keeps only positive totals —
        # confirm negative similarity scores cannot occur here.
        results = results1 + results2
        return [key for key, value in results.most_common(n)]

    # Non-BERT mode: blend text and image embedding similarities.
    scores_text, indices_text = query(
        query_embedding, normalized_sentence_embeddings, n)
    scores_images, indices_images = query(
        query_embedding, normalized_image_embeddings, n)
    results_text = Counter(
        {i: text_weight * scores_text[i] for i in indices_text})
    results_images = Counter(
        {i: image_weight * scores_images[i] for i in indices_images})

    if fuzzy_weight <= 0:
        # BUG FIX: this branch previously returned `results1`, which is
        # never defined here (NameError); combine the two counters instead.
        results = results_text + results_images
        return [key for key, value in results.most_common(n)]

    fuzzy_hits = process.extract(query_str, names, scorer=fuzz.WRatio, limit=n)
    results2 = Counter(
        {x[2]: (fuzzy_weight * x[1] / 100) for x in fuzzy_hits})
    # Extend each counter over the union of candidate keys before adding.
    for key, value in list(results_text.most_common()):
        results2[key] = fuzzy_weight * fuzz.WRatio(
            query_str, names[key]) / 100
    for key, value in list(results_images.most_common()):
        results2[key] = fuzzy_weight * fuzz.WRatio(
            query_str, names[key]) / 100
    for key, value in list(results2.most_common()):
        results_text[key] = text_weight * scores_text[key]
        results_images[key] = image_weight * scores_images[key]
    results = results_text + results_images + results2
    return [key for key, value in results.most_common(n)]