def calcRelativity(csv_file, keyWord, data):
    # Normalize the frame to NaN-free strings
    # (note: csv_file_tmp is not used further in this version of the function)
    csv_file_tmp = csv_file.fillna("").astype(str)
    # extractWithoutOrder yields (choice, score) pairs in input order; keep only the scores
    scores = [x[1] for x in process.extractWithoutOrder(keyWord, data)]
    return pd.DataFrame({'Relativity': scores})
def getConfidence(self, string, stringSet):
    """Print the values of extractOne, extract, extractBests and
    extractWithoutOrder from the fuzzywuzzy process module, for
    visualizing the output."""
    try:
        print('extractone confidence: ', process.extractOne(string, stringSet))
        print('extract confidence: ', process.extract(string, stringSet))
        print('extractbests: ', process.extractBests(string, stringSet))
        # extractWithoutOrder returns a lazy generator, so materialize it before printing
        print('extractwithoutorder: ', list(process.extractWithoutOrder(string, stringSet)))
    except Exception as e:
        print('Error in getConfidence in StringHandling', e)
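A minimal standalone sketch (invented strings, not from any snippet above) of what the four fuzzywuzzy process helpers return; extractWithoutOrder is the only one that yields a lazy generator:

from fuzzywuzzy import process

choices = ['new york jets', 'new york giants', 'dallas cowboys']
print(process.extractOne('new york', choices))                      # best (choice, score) pair
print(process.extract('new york', choices, limit=2))                # top matches, best first
print(process.extractBests('new york', choices, score_cutoff=50))  # like extract, with a score floor
print(list(process.extractWithoutOrder('new york', choices)))      # all pairs, in input order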
def match_tester2(test_user, list_db, name_param, doc_param, site_param, name_function):
    res_df = DataFrame()
    df1_ = DataFrame(
        process.extractWithoutOrder(str(test_user.iloc[0].name), list_db.name))
    res_df['name_score'] = df1_[1]
    df2_ = DataFrame(
        process.extractWithoutOrder(str(test_user.iloc[0].user_doc_number),
                                    list_db.doc, scorer=fuzz.ratio))
    res_df['doc_score'] = df2_[1]
    df3_ = DataFrame(
        process.extractWithoutOrder(str(test_user.iloc[0].country), list_db.country))
    res_df['country_score'] = df3_[1]
    res_df['counter'] = list_db.counter
    res_df['final_score'] = ((res_df.name_score * name_param * name_function(res_df.counter))
                             + (res_df.doc_score * doc_param)
                             + (res_df.country_score * site_param))
    return res_df
def set_city_matches(self, cutoff):
    """Find fuzzy matches between city and cities in streets_table."""
    if (not self.city) or (self.city == 'N/A') or ('No Match' in self.city):
        self.city_matches = []
    else:
        valid_cities = self.streets_table['City'].unique()
        matches = process.extractWithoutOrder(self.city, valid_cities,
                                              scorer=fuzz.partial_ratio,
                                              score_cutoff=cutoff)
        self.city_matches = [city for city, score in matches]
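A standalone illustration of the cutoff behaviour used above, with invented city names: extractWithoutOrder silently skips choices scoring below score_cutoff, and fuzz.partial_ratio rates substring-style matches highly.

from fuzzywuzzy import fuzz, process

valid_cities = ['Tel Aviv', 'Tel Aviv-Yafo', 'Haifa', 'Jerusalem']
matches = process.extractWithoutOrder('Tel Aviv', valid_cities,
                                      scorer=fuzz.partial_ratio, score_cutoff=90)
print([city for city, score in matches])  # expected: ['Tel Aviv', 'Tel Aviv-Yafo']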
async def qsearch(self, ctx, search_term: str):
    """
    Use fuzzy search to allow users to search the mp3quran.net reciter list.
    """
    reciter_list = await get_surah_reciters()
    reciters = [reciter.name for reciter in reciter_list]
    results = process.extractWithoutOrder(search_term, reciters, score_cutoff=65)

    formatted_results = ''
    for i, result in enumerate(results, start=1):
        # Title-case hyphenated names without capitalizing the letter after the
        # hyphen incorrectly, e.g. 'abdul-basit' -> 'Abdul-Basit'.
        formatted_result = result[0].replace('-', ' - ').title().replace(' - ', '-')
        formatted_results += f'\n{i}. {formatted_result}'

    if formatted_results == '':
        await ctx.send('**No results.**')
    else:
        em = discord.Embed(title='Search Results', colour=0x006400,
                           description=formatted_results)
        await ctx.send(embed=em)
def get_fuzzy_matches(search_term: str, term_counts: Dict[str, int],
                      cutoff_score: int) -> List[Tuple[str, int]]:
    """
    Return all terms that are similar to the search term, using the cutoff
    score as a threshold for the similarity. This expects a dictionary like
    the one returned from get_vocab_counts(), but can use any dictionary
    where the keys should be matched against the search term.

    :param search_term: Term to use for comparing similarity of dictionary keys
    :param term_counts: Dictionary where the keys should be used to match the search term
    :param cutoff_score: Threshold for minimum similarity to possible match terms
    :return: All terms that meet the minimum similarity threshold
    """
    return sorted(extractWithoutOrder(search_term, term_counts.keys(),
                                      score_cutoff=cutoff_score),
                  key=lambda match: match[1], reverse=True)
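A hypothetical call to get_fuzzy_matches; the vocabulary counts below are invented, standing in for what get_vocab_counts() might return:

# assuming the snippet's module did: from fuzzywuzzy.process import extractWithoutOrder
from fuzzywuzzy.process import extractWithoutOrder

term_counts = {'apple': 12, 'apples': 3, 'applet': 1, 'banana': 7}
print(get_fuzzy_matches('apple', term_counts, cutoff_score=80))
# roughly [('apple', 100), ('apples', 91), ('applet', 91)], best first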
def match_string(string, documents):
    """
    Fuzzy matching of a string against every token in a list of documents.

    :param string: query string to match
    :param documents: iterable of documents exposing a .tokens attribute
    :return: generator of (token, score) matches above the cutoff
    """
    def custom_full_process(token, **kwargs):
        # Tokens may be spaCy-like objects with a .text attribute or plain
        # strings; normalize either case with fuzzywuzzy's full_process.
        try:
            s = token.text
        except AttributeError:
            s = str(token)
        return full_process(s, **kwargs)

    corpus_tokens = [tok for doc in documents for tok in doc.tokens]
    match_generator = fuzzprocess.extractWithoutOrder(
        string,
        corpus_tokens,
        processor=custom_full_process,
        score_cutoff=95)
    return match_generator
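A minimal sketch of the same custom-processor idea, with a stand-in Doc class invented for illustration; note the result is a lazy generator and has to be consumed:

from fuzzywuzzy import process as fuzzprocess  # names assumed by match_string
from fuzzywuzzy.utils import full_process

class Doc:
    def __init__(self, tokens):
        self.tokens = tokens

docs = [Doc(['Alpha', 'beta']), Doc(['alphas', 'gamma'])]
print(list(match_string('alpha', docs)))  # expected: [('Alpha', 100)]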
def mash_list(self, buyers, short_threshold, long_threshold, short_size, stemming):
    match_list = []
    for i, buyer in enumerate(buyers):
        if buyer not in match_list:
            match_list.append(buyer)
            # excl_buyers is the list of buyers excluding the current one
            excl_buyers = buyers[:i] + buyers[i + 1:]
            # stem if stemming > 0
            if stemming > 0:
                excl_buyers = filter(
                    lambda x: x.startswith(buyer[:stemming]), excl_buyers)
            # threshold depends on length of buyer name
            threshold = long_threshold if len(buyer) > short_size else short_threshold
            sl = process.extractWithoutOrder(buyer, excl_buyers,
                                             scorer=fuzz.token_sort_ratio,
                                             score_cutoff=threshold)
            matches = sorted(sl, key=lambda m: m[1], reverse=True)
            for match in matches:
                match_list.append(match[0])
                print('\t'.join([buyer, match[0], str(match[1])]))
def calcRelativity(keyWord, data):
    # Keep only the scores; extractWithoutOrder preserves the input order,
    # so row i of the result corresponds to data[i].
    scores = [x[1] for x in process.extractWithoutOrder(keyWord, data)]
    return pd.DataFrame({'Relativity': scores})
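A hypothetical call to calcRelativity, assuming pd and process are pandas and fuzzywuzzy.process as elsewhere in these snippets:

import pandas as pd
from fuzzywuzzy import process

phrases = ['deep learning', 'deep sea diving', 'machine learning']
print(calcRelativity('learning', phrases))  # one Relativity score per phrase, row-aligned with the input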
def find_matches(self, word, lst):
    generator = process.extractWithoutOrder(
        word, lst, score_cutoff=self.confidence_threshold)
    return [item[0] for item in generator]
def process_name_matching(df, desc):
    # Setup
    #############
    chkoutid = None
    pmatch_name, pmatch_time, pmatch_max_score, pmatch_simple_score, pmatch_partial_score, pmatch_sort_score, pmatch_set_score = None, None, None, None, None, None, None
    candidate_chkout, candidate_names, p0_scores, p1_scores, p2_scores, p3_scores = None, None, None, None, None, None

    if len(df.index) == 0:
        ###################
        ### Situation 1 ### -- No candidate checkouts
        ###################
        chkoutid = u'Amount / Bank wrong'
    else:
        # Exact match: the cleaned name appears verbatim in the description
        # and is long enough relative to it.
        df_exact = df[df[u'name_clean'].map(
            lambda x: x in desc
            if x != u'' and (len(x) / len(desc)) > exact_name_match__min_ratio
            else False)]

        if len(df_exact.index) == 0:
            ###################
            ### Situation 2 ### -- No exact match
            ###################
            # Get 4 ratios for candidate checkouts
            # (positional pd.Series assignment assumes df has a default RangeIndex)
            names = df[u'name_clean'].tolist()
            score1 = list(process.extractWithoutOrder(desc, names, scorer=fuzz.ratio))
            df[u'simple_ratio'] = pd.Series([s[1] for s in score1])
            score2 = list(process.extractWithoutOrder(desc, names, scorer=fuzz.partial_ratio))
            df[u'partial_ratio'] = pd.Series([s[1] for s in score2])
            score3 = list(process.extractWithoutOrder(desc, names, scorer=fuzz.token_sort_ratio))
            df[u'sort_ratio'] = pd.Series([s[1] for s in score3])
            score4 = list(process.extractWithoutOrder(desc, names, scorer=fuzz.token_set_ratio))
            df[u'set_ratio'] = pd.Series([s[1] for s in score4])

            # Get max score
            df[u'max_ratio'] = df[[u'simple_ratio', u'partial_ratio', u'sort_ratio', u'set_ratio']].max(axis=1)

            # Get max of max
            df_approx = df[(df[u'max_ratio'] > max_ratio_cutoff) & (df[u'simple_ratio'] > simple_ratio_cutoff)]
            best_ratio = df_approx[u'max_ratio'].max()
            df_best = df_approx[df_approx[u'max_ratio'] == best_ratio]

            if len(df_best.index) == 0:
                #####################
                ### Situation 2.1 ### -- No exact match + No suitable candidates
                #####################
                chkoutid = None
            else:
                df_best.drop_duplicates(subset=u'checkout_id', inplace=True)
                if len(df_best.index) == 1:
                    #####################
                    ### Situation 2.2 ### -- No exact match + 1 BEST found
                    #####################
                    chkoutid = df_best[u'checkout_id'].item()
                    pmatch_name = df_best[u'name_clean'].item()
                    pmatch_time = df_best[u'date_of_transfer'].item()  # proof upload time
                    pmatch_max_score = df_best[u'max_ratio'].item()
                    pmatch_simple_score = df_best[u'simple_ratio'].item()
                    pmatch_partial_score = df_best[u'partial_ratio'].item()
                    pmatch_sort_score = df_best[u'sort_ratio'].item()
                    pmatch_set_score = df_best[u'set_ratio'].item()
                else:
                    #####################
                    ### Situation 2.3 ### -- No exact match + MANY BEST found
                    #####################
                    chkoutid = df_best[u'checkout_id'].tolist()
                    pmatch_name = df_best[u'name_clean'].tolist()
                    pmatch_time = df_best[u'date_of_transfer'].tolist()  # proof upload time
                    pmatch_max_score = df_best[u'max_ratio'].tolist()
                    pmatch_simple_score = df_best[u'simple_ratio'].tolist()
                    pmatch_partial_score = df_best[u'partial_ratio'].tolist()
                    pmatch_sort_score = df_best[u'sort_ratio'].tolist()
                    pmatch_set_score = df_best[u'set_ratio'].tolist()
            candidate_chkout = df[u'checkout_id'].tolist()  # check subset of amount & bank
            candidate_names = df[u'name_clean'].tolist()
            p0_scores = df[u'simple_ratio'].tolist()
            p1_scores = df[u'partial_ratio'].tolist()
            p2_scores = df[u'sort_ratio'].tolist()
            p3_scores = df[u'set_ratio'].tolist()
        elif len(df_exact.index) == 1:
            ###################
            ### Situation 3 ### -- 1 exact match
            ###################
            chkoutid = df_exact[u'checkout_id'].item()  # use subset of amount, bank & name
            pmatch_name = df_exact[u'name_clean'].item()
            pmatch_time = df_exact[u'date_of_transfer'].item()
            pmatch_max_score = u'[EXACT MATCH: ARE YOU SURE]'
        else:
            ###################
            ### Situation 4 ### -- MANY exact matches
            ###################
            chkoutid = u'Many names found'
            candidate_chkout = df_exact[u'checkout_id'].tolist()  # check subset of amount & bank
            candidate_names = df_exact[u'name_clean'].tolist()

    return (chkoutid, pmatch_name, pmatch_time, pmatch_max_score,
            pmatch_simple_score, pmatch_partial_score, pmatch_sort_score,
            pmatch_set_score, candidate_chkout, candidate_names,
            p0_scores, p1_scores, p2_scores, p3_scores)
def get_annotations(self, doc: Doc) -> (pd.DataFrame, pd.DataFrame):
    """
    get_annotations will find the given named entities of the dictionary in the doc text

    :param doc: document to annotate
    :returns: a tuple of dataframes; the first contains the annotations,
        the second contains relations between annotations
    """
    old_annotations = doc.get_annotations()
    doc_text = doc.get_text()

    # prelabel all words that occur in the dictionary
    matches = pd.DataFrame(columns=[Annotation.BEGIN, Annotation.END, self.QUERY])
    for match in self._regex.finditer(doc_text):
        matches = matches.append(
            {
                Annotation.BEGIN: match.span()[0],
                Annotation.END: match.span()[1],
                self.QUERY: match.group()
            },
            ignore_index=True)

    # initialize the new annotations and relations tables
    new_annotations = pd.DataFrame(columns=Annotation.COLS)
    new_relations = pd.DataFrame(columns=Relation.COLS)

    # get all sentences from the document
    sentences = old_annotations[old_annotations[Annotation.LAYER] == Layer.SENTENCE]

    # if no sentences are available stop here, because we want to label
    # dictionary entries on sentence level
    if sentences.empty:
        raise Exception('No sentences available')

    # iterate through the sentences of the document to search for entities in the dictionary
    for index, sentence in sentences.iterrows():
        # get the beginning and the end of each sentence
        begin = sentence[Annotation.BEGIN]
        end = sentence[Annotation.END]

        # find all prelabeled words that are within the current sentence boundaries
        sentence_matches = matches[(matches[Annotation.BEGIN] >= begin)
                                   & (matches[Annotation.END] <= end)]
        matched_words_list = list(sentence_matches[self.QUERY])
        matched_word_string = self.WHITESPACE.join(matched_words_list)

        # find all entries in the dictionary that have at most as many words
        # as the number of prelabeled words in the sentence
        filtered_data = self._data[self._data[self.LENGTH] <= len(matched_words_list)]

        # create a dict to search in with the fuzzywuzzy library; with a dict,
        # extractWithoutOrder yields (value, score, key) triples
        data_index = list(filtered_data.index)
        data_queries = list(filtered_data[self.QUERY])
        queries = dict(zip(data_index, data_queries))
        fuzzy_matches = process.extractWithoutOrder(
            matched_word_string,
            queries,
            scorer=fuzz.token_set_ratio,
            score_cutoff=self._min_matching_score)

        # create a dict that holds all found entities in a sentence
        sentence_index = (sentence_matches[Annotation.BEGIN].map(str) + ':'
                          + sentence_matches[Annotation.END].map(str))
        sentence_queries = dict(zip(sentence_index, matched_words_list))

        for span, score, idx in fuzzy_matches:
            words = span.split(self.WHITESPACE)
            # initialize the old index for assigning relations correctly
            old_index = 0
            # iterate over all words in the match
            for word_index, word in enumerate(words):
                # for each word in the match find the corresponding word in the sentence
                word_match = process.extractOne(word, sentence_queries)
                # the key of the best match encodes the span as 'begin:end'
                begin = int(word_match[2].split(self.SEP)[0])
                end = int(word_match[2].split(self.SEP)[1])
                # append the new annotation
                new_annotations = new_annotations.append(
                    {
                        Annotation.BEGIN: begin,
                        Annotation.END: end,
                        Annotation.LAYER: self._layer_name,
                        Annotation.FEATURE: filtered_data.loc[idx][Annotation.FEATURE],
                        Annotation.FEATURE_VAL: filtered_data.loc[idx][Annotation.FEATURE_VAL]
                    },
                    ignore_index=True)
                # set the current annotation id for the relation
                current_idx = max(list(new_annotations.index.values))
                # if the dictionary entry has more than one word, connect the words via relations
                if len(words) > 1 and word_index > 0:
                    new_relations = new_relations.append(
                        {
                            Relation.GOV_ID: old_index,
                            Relation.DEP_ID: current_idx,
                            Relation.LAYER: self._layer_name,
                            Relation.BEGIN: begin,
                            Relation.END: end,
                            Relation.FEATURE: filtered_data.loc[idx][Annotation.FEATURE],
                            Relation.FEATURE_VAL: filtered_data.loc[idx][Annotation.FEATURE_VAL]
                        },
                        ignore_index=True)
                old_index = current_idx

    return new_annotations, new_relations
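A small sketch (invented data) of the dict behaviour relied on above: when choices is a dict, extractWithoutOrder yields (value, score, key) triples, so the dictionary index survives the fuzzy matching.

from fuzzywuzzy import fuzz, process

queries = {101: 'heart attack', 102: 'stroke'}
for value, score, key in process.extractWithoutOrder(
        'acute heart attack', queries,
        scorer=fuzz.token_set_ratio, score_cutoff=80):
    print(key, value, score)  # expected: 101 heart attack 100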
def find_potential_checkouts_v2(df_chkout, stmt_amt, stmt_bank, stmt_desc):
    # Definitions:
    ################
    # _ab : subset of amt & bank
    # _abn: subset of amt, bank & exact name
    chkoutid, pmax_name, pmax_score, chkout_candidates, p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores = None, None, None, None, None, None, None, None, None, None

    # Step 1: Filter potential checkouts by proof amount & bank
    ################
    potential_chkouts_ab = df_chkout[
        (df_chkout['proof_amount'] == stmt_amt)
        & (df_chkout['[A] script_bank_cat'] == stmt_bank)]

    # Step 2: Further filter potential checkouts if proof cust name is in description
    if len(potential_chkouts_ab.index) == 0:
        # Situation 1: No Amt Bank match
        chkoutid = 'Amount / Bank wrong'
    else:
        # Situation 2: Amt & Bank match, proceed to confirm using name (proof name ~ stmt desc)
        # .copy() avoids pandas' SettingWithCopyWarning when mutating the slice below
        potential_chkouts_ab = potential_chkouts_ab.copy()
        # (1) fill NaN with an un-matchable name, (2) clean it
        potential_chkouts_ab['[B] proof_cust_name_clean'] = (
            potential_chkouts_ab['[B] proof_cust_name_clean']
            .fillna('').str.lower().str.replace('\"', ''))
        potential_chkouts_abn = potential_chkouts_ab[
            potential_chkouts_ab['[B] proof_cust_name_clean'].map(lambda x: x in stmt_desc)]

        if len(potential_chkouts_abn.index) == 1:
            # Situation 2a: Single match using Amt, Bank & exact Name
            chkoutid = potential_chkouts_abn['checkoutid'].item()  # use subset of amount, bank & name
        elif len(potential_chkouts_abn.index) == 0:
            ########################
            ### WORK IN PROGRESS ###
            ########################
            # Situation 2b: amt & bank give some candidates but no exact name match;
            # two options: possibility of an approximate match / no match at all
            chkout_candidates = potential_chkouts_ab['checkoutid'].tolist()  # check subset of amount & bank
            names = potential_chkouts_ab['[B] proof_cust_name_clean'].tolist()
            pmax = process.extractOne(stmt_desc, names,
                                      scorer=fuzz.token_set_ratio, score_cutoff=50)
            if pmax is None:
                pmax_name, pmax_score = None, None
            else:
                pmax_name = pmax[0]
                pmax_score = pmax[1]
                try:
                    chkoutid = potential_chkouts_ab[
                        potential_chkouts_ab['[B] proof_cust_name_clean']
                        == str(pmax_name)]['checkoutid'].item()
                except ValueError:
                    chkoutid = None

            p0 = list(process.extractWithoutOrder(stmt_desc, names))
            p0_names = [x[0] for x in p0]
            p0_scores = [x[1] for x in p0]
            p1 = list(process.extractWithoutOrder(stmt_desc, names,
                                                  scorer=fuzz.token_sort_ratio))  # Note: this is using token_sort_ratio
            p1_names = [x[0] for x in p1]
            p1_scores = [x[1] for x in p1]
            p2 = list(process.extractWithoutOrder(stmt_desc, names,
                                                  scorer=fuzz.token_set_ratio))  # Note: this is using token_set_ratio
            p2_names = [x[0] for x in p2]
            p2_scores = [x[1] for x in p2]
            ########################
            ### WORK IN PROGRESS ###
            ########################
        else:
            # Situation 2c: Many exact name matches
            chkoutid = 'Many names found'

    return (chkoutid, pmax_name, pmax_score, chkout_candidates,
            p0_names, p0_scores, p1_names, p1_scores, p2_names, p2_scores)