def match_indication(indication, hpo_dict, fuzzy=False, min_similarity=0.85,
                     min_fuzzy_len=10):
    """
    Look up one indication in the HPO dictionary.

    Returns a list, without duplicates, of every HPO term stored under a
    matching dictionary key. Non-informative indications yield an empty list.

    With ``fuzzy=False`` a key matches only if it equals the cleaned
    indication. With ``fuzzy=True`` a key matches when its token-sort ratio
    against the cleaned indication, divided by 100, is at least
    ``min_similarity``.

    Note: ``min_fuzzy_len`` is accepted but not read by this implementation.
    """

    # Placeholder indications that carry no phenotype information
    uninformative = ('NO_CLINICAL_INFORMATION_PROVIDED UNKNOWN NOT_INDICATED '
                     'NOT_SPECIFIED NOT_PROVIDED NONE_PROVIDED').split()
    if indication in uninformative:
        return []

    query = clean_pheno(indication)

    # Pick the key predicate once instead of branching per key
    if fuzzy:
        def is_hit(key):
            return fuzz.token_sort_ratio(query, key) / 100 >= min_similarity
    else:
        def is_hit(key):
            return query == key

    # Each dictionary value is a collection of terms; flatten, then dedupe
    flattened = []
    for key, terms in hpo_dict.items():
        if is_hit(key):
            flattened.extend(terms)

    return list(set(flattened))
def fast_score(self, target_place: Loc, result_place: Loc) -> float:
    """
    Rough, fast dissimilarity between a target place and a result place.

    Both places are reduced to their five-part titles, which are compared
    with a token-sort ratio. The value returned is ``100 - ratio``, so 0 is
    the best score and 100 the worst.
    """
    title_of_result = result_place.get_five_part_title()
    title_of_target = target_place.get_five_part_title()
    similarity = fuzz.token_sort_ratio(title_of_result, title_of_target)
    return 100 - similarity
def fuzzy_string_match(str_ref, str_hyp):
    """Scores how alike two strings are using a token-sort ratio.

    Args:
      str_ref: reference string
      str_hyp: hypothesis string

    Returns:
      The token-sort ratio of the two strings divided by 100.0; larger
      values mean the strings are more similar.
    """
    percent_score = fuzz.token_sort_ratio(str_ref, str_hyp)
    return percent_score / 100.0
def query_tex_string(self, tex_string_1, threshold=65, limit=10):
    """
    Recommend formula concepts whose TeX strings resemble ``tex_string_1``.

    A concept is recommended once if any entry of its ``'TeXStrings'`` list
    reaches a token-sort ratio of at least ``threshold`` against
    ``tex_string_1``. Concepts are visited in the iteration order of
    ``self.formula_concepts``.

    :param tex_string_1: TeX string to look up.
    :param threshold: minimum token-sort ratio (compared with ``>=``).
    :param limit: maximum number of recommendations to return. Defaults to
        10, the previously hard-coded cap. ``None`` disables the cap; zero
        or a negative value yields an empty list.
    :returns: list of ``{'name': <concept name>}`` dicts, at most ``limit``.
    """
    recommendations = []
    if limit is not None and limit <= 0:
        return recommendations

    for fc in self.formula_concepts:
        tex_strings = self.formula_concepts[fc]['TeXStrings']
        # any() short-circuits on the first TeX string that clears the bar
        if any(fuzz.token_sort_ratio(tex_string_1, tex_string_2) >= threshold
               for tex_string_2 in tex_strings):
            recommendations.append({'name': fc})
            # Stop scanning as soon as the cap is reached; the result is the
            # same as collecting everything and slicing afterwards.
            if limit is not None and len(recommendations) >= limit:
                break

    return recommendations
    # Legacy code that was disabled inside a string literal began here:
    # formula_concept_names = list(formula_concepts.keys())
def partial_match(x_fact, y_fact, x_index, y_index):
    """
    Decide whether two facts are probably the same fact.

    Only the first element of each fact is compared, using a token-sort
    ratio. A ratio strictly above 80 counts as a match.

    Returns ``((x_fact, y_fact), (x_index, y_index), ratio)`` for a match,
    or ``None`` when the indices are equal (a fact is never compared with
    itself) or the ratio is 80 or lower.
    """
    # Same position means the same fact: nothing to compare
    if x_index == y_index:
        return None

    ratio = fuzz.token_sort_ratio(x_fact[0], y_fact[0])
    if ratio <= 80:
        # most likely unrelated facts
        return None

    return (x_fact, y_fact), (x_index, y_index), ratio
def partial_string_based(str1, str2):
    """Scores the similarity of two strings with rapidfuzz's token-sort ratio.

    The raw ratio is written to the debug log and then divided by 100.0
    before being returned.

    Args:
        str1: A string value to check.
        str2: A string value to check.

    Returns:
        float: The token-sort ratio of the two strings divided by 100.0.
    """
    from rapidfuzz.fuzz import token_sort_ratio

    score = token_sort_ratio(str1, str2)
    logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, score)
    return score / 100.0
def text_in_note(note, query_string):
    """Fuzzy-searches the text of a note for a query string.

    The note's text fragments are lower-cased and joined with single spaces,
    then compared to the lower-cased query with a token-sort ratio. The
    confidence threshold is handed to the scorer as ``score_cutoff`` and the
    truthiness of the returned score decides the outcome (presumably the
    scorer reports 0 below the cutoff; confirm against the fuzz library in
    use).

    :param dict note: an ArchivesSpace note.
    :param str query_string: a string to match against.

    :returns: True if a match is found for `query_string`, False if no match is
            found.
    :rtype: bool
    """
    # int: minimum confidence ratio to match against.
    CONFIDENCE_RATIO = 97

    haystack = " ".join(fragment.lower() for fragment in get_note_text(note))
    needle = query_string.lower()
    score = fuzz.token_sort_ratio(haystack, needle, score_cutoff=CONFIDENCE_RATIO)
    return bool(score)
def featurize(df):
    """
    Add string-similarity feature columns for a frame of string pairs.

    The first two columns are named ``a`` and ``b`` (a third column, when the
    frame has exactly three, is named ``target``). Frames with exactly two or
    three columns are relabelled in place; any other width is renamed into a
    new frame. Both strings are transliterated with ``unidecode``,
    lower-cased and stripped of everything except ASCII letters before the
    features are computed.

    Columns added, in order: ``TM_A``, ``TM_B``, ``partial``, ``tkn_sort``,
    ``tkn_set``, ``sum_ipa``, ``levenshtein`` (rescaled with
    ``MinMaxScaler``), ``metaphone``, ``nysiis``, ``mtch_rtng_cdx``,
    ``pshp_soundex_first``, then one column per entry of ``algos`` named by
    the entry of ``algo_names`` at the same position.

    Returns the frame carrying the new columns.
    """
    column_count = len(df.columns)
    if column_count == 3:
        df.columns = ['a', 'b', 'target']
    elif column_count == 2:
        df.columns = ['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b'})

    def _letters_only(text):
        # transliterate -> lower-case -> drop every run of non-letters
        return re.sub('[^a-zA-Z]+', '', unidecode.unidecode(text).lower())

    def _pairwise(score):
        # evaluate a two-string scorer on the cleaned pair of every row
        return df.apply(lambda row: score(row.TM_A, row.TM_B), axis=1)

    def _same_code(encode):
        # 1 when both cleaned strings encode identically, otherwise 0
        return df.apply(
            lambda row: 1 if encode(row.TM_A) == encode(row.TM_B) else 0,
            axis=1)

    df['TM_A'] = df.apply(lambda row: _letters_only(row['a']), axis=1)
    df['TM_B'] = df.apply(lambda row: _letters_only(row['b']), axis=1)

    # Fuzzy ratios
    df['partial'] = _pairwise(fuzz.partial_ratio)
    df['tkn_sort'] = _pairwise(fuzz.token_sort_ratio)
    df['tkn_set'] = _pairwise(fuzz.token_set_ratio)

    df['sum_ipa'] = _pairwise(sum_ipa)

    # Jellyfish Levenshtein distance, then rescaled
    df['levenshtein'] = _pairwise(jellyfish.levenshtein_distance)
    scaler = MinMaxScaler()
    distances = df['levenshtein'].values.reshape(-1, 1)
    df['levenshtein'] = scaler.fit_transform(distances)

    # Phonetic-code agreement flags
    df['metaphone'] = _same_code(jellyfish.metaphone)
    df['nysiis'] = _same_code(jellyfish.nysiis)
    df['mtch_rtng_cdx'] = _same_code(jellyfish.match_rating_codex)
    df['pshp_soundex_first'] = _same_code(pshp_soundex_first.encode)

    # Similarity from each configured algorithm object
    for position, algo in enumerate(algos):
        df[algo_names[position]] = _pairwise(algo.sim)

    return df
def contains_match(content, search_string):
    """Tells whether note content fuzzily matches a user-provided string.

    Both strings are lower-cased and compared with a token-sort ratio; the
    result is True only when that ratio is strictly greater than
    CONFIDENCE_RATIO.
    """
    haystack = content.lower()
    needle = search_string.lower()
    if fuzz.token_sort_ratio(haystack, needle) > CONFIDENCE_RATIO:
        return True
    return False
def test_token_ratio(s1, s2):
    """
    token_ratio must equal the larger of token_sort_ratio and token_set_ratio
    """
    combined = fuzz.token_ratio(s1, s2)
    by_sort = fuzz.token_sort_ratio(s1, s2)
    by_set = fuzz.token_set_ratio(s1, s2)
    assert combined == max(by_sort, by_set)
def testTokenSortRatio(self):
    """The token-sort ratio of fixtures ``s1`` and ``s1a`` must be 100."""
    score = fuzz.token_sort_ratio(self.s1, self.s1a)
    self.assertEqual(score, 100)
def check_formulae(self, formula_string, annotations, threshold_string=65, threshold_identifers=1):
    """
    Rank repository formulae against ``formula_string`` in two ways.

    * String ranking: formulae whose TeX string reaches a token-sort ratio
      of at least ``threshold_string`` against ``formula_string``.
    * Identifier ranking: formulae sharing at least ``threshold_identifers``
      distinct identifiers with those extracted from ``formula_string`` and
      ``annotations``. A formula's identifiers are its
      ``identifiers['strings']`` followed by its ``identifiers['names']``.

    The identifiers obtained from ``CustomMathEnvParser`` are only logged;
    they do not influence either ranking.

    :returns: tuple ``(string_matches, identifier_matches)``; each is a list
        of ``{'name': ..., 'qid': ...}`` dicts with the highest score first.
    """

    def count_shared(left, right):
        # number of distinct identifiers present in both collections
        return len(set(left).intersection(right))

    def ranked(scored_hits):
        # Sort ascending and then reverse (rather than sorting descending)
        # so that equal scores keep the same relative order as before.
        ascending = sorted(scored_hits, key=itemgetter(1))
        return [entry for entry, _ in reversed(ascending)]

    string_hits = []
    identifier_hits = []

    formula_dict = self.get_formulae_from_repo()
    identifiers = self.extract_identifiers_from_formula(
        annotations, formula_string)

    parser = CustomMathEnvParser(formula_string)
    identifiers_from_wikidata_formula, _ = parser.get_split_math_env()
    static_wikidata_handler_logger.info(
        'identifiers_from_wikidata_formula: {}'.format(
            identifiers_from_wikidata_formula))

    for formula_name in formula_dict:
        formula = formula_dict[formula_name]
        qid = formula['qid']
        tex_string = formula['formula']

        # --- similarity of the raw TeX strings ---
        score_string = fuzz.token_sort_ratio(formula_string, tex_string)
        if score_string >= threshold_string:
            string_hits.append(
                ({'name': formula_name, 'qid': qid}, score_string))

        # --- overlap of identifiers ---
        formula_identifiers = formula['identifiers']['names']
        formula_quantity_symbols = formula['identifiers']['strings']
        score_identifers = count_shared(
            identifiers, formula_quantity_symbols + formula_identifiers)
        if score_identifers >= threshold_identifers:
            identifier_hits.append(
                ({'name': formula_name, 'qid': qid}, score_identifers))

    return ranked(string_hits), ranked(identifier_hits)
def _find_device(self, device, allowed_types, room=""):
    """
    Find the FHEM device that best matches a spoken device name.

    Candidates are fetched through ``self.fhem.get`` restricted to
    ``self.allowed_devices_room`` and filtered on ``genericDeviceType``
    (plus the normalized room, when one is given). If that query yields
    exactly one candidate it is returned immediately. Otherwise the room
    filter is dropped (if it was set), the query is repeated, and every
    candidate is scored with a token-sort ratio of ``device`` against the
    candidate's normalized alias and against its normalized name extended
    by its room names. A room bonus is added when both ``room`` and the
    candidate's rooms are non-empty.

    :param device: device name to look for.
    :param allowed_types: value used for the ``genericDeviceType`` filter.
    :param room: optional room name; empty string means no room filter.
    :returns: dict with keys ``id``, ``dev_name``, ``state`` and
        ``best_score``, or ``None`` when no candidate scores above 50.

    NOTE(review): this source arrived with its line structure collapsed;
    the nesting below was reconstructed from the syntax and should be
    confirmed against the original file.
    """
    LOG.debug("device: {} allowed_types: {} room: {}".format(
        device, allowed_types, room))
    filter_dict = {'genericDeviceType': allowed_types}
    # new search strategy: first check if there is a fit in specified room
    if room:
        room = self._normalize(self._clean_common_words(room))
        # LOG.debug("normalized room: {}".format(room))
        filter_dict['room'] = room
    device_candidates = self.fhem.get(room=self.allowed_devices_room,
                                      filters=filter_dict)
    if len(device_candidates) == 1:
        # TODO can we do anything if len(...) > 1 ?
        LOG.debug("perfect match")
        # we have a perfect match:
        # there is only one device of the allowed type in the room
        dc = device_candidates[0]
        # 999 is a fixed score used only for this single-candidate shortcut
        best_device = {
            "id": dc['Name'],
            "dev_name": self._get_aliasname(dc),
            "state": dc['Readings']['state'],
            "best_score": 999
        }
        return best_device
    # try again without filter on room
    if 'room' in filter_dict.keys():
        LOG.debug("try again without filter on room")
        del filter_dict['room']
        device_candidates = self.fhem.get(room=self.allowed_devices_room,
                                          filters=filter_dict)
    # LOG.debug(device_candidates)
    # require a score above 50%
    best_score = 50
    best_device = None
    if device_candidates:
        for dc in device_candidates:
            # LOG.debug("==================================================")
            norm_name = self._normalize(dc['Name'])
            norm_name_list = norm_name.split(" ")
            # LOG.debug("norm_name_list = %s" % norm_name_list)
            dev_room = self._get_normalized_room_list(dc)
            # append every room name that is not already a word of the
            # device name, so the room takes part in the name comparison
            for r in dev_room:
                if (r not in norm_name_list):
                    norm_name += (" " + self._normalize(r))
            # LOG.debug("dev_room: {}".format(dev_room))
            # LOG.debug("norm_name = %s" % norm_name)
            alias = self._get_aliasname(dc)
            norm_alias = self._normalize(alias)
            # A KeyError anywhere in this block abandons the rest of the
            # scoring for this candidate and moves on to the next one.
            # NOTE(review): best_score is assigned before the result dict
            # is built, so a KeyError raised while building the dict leaves
            # best_score raised without updating best_device.
            try:
                # score the alias only when it differs from the name and
                # the candidate carries an 'alias' attribute
                if (norm_name != norm_alias) and ('alias' in dc['Attributes']):
                    score = fuzz.token_sort_ratio(device, norm_alias)
                    # add bonus if room name match
                    if room and dev_room:
                        score += self._get_bonus_for_room(
                            room, dev_room[0])
                    if score > best_score:
                        best_score = score
                        best_device = {
                            "id": dc['Name'],
                            "dev_name": alias,
                            "state": dc['Readings']['state'],
                            "best_score": best_score
                        }
                # always score the (room-extended) device name as well
                score = fuzz.token_sort_ratio(device, norm_name)
                # add bonus if room name match
                if room and dev_room:
                    score += self._get_bonus_for_room(room, dev_room[0])
                # LOG.debug("%s %s" % (norm_name, score))
                if score > best_score:
                    best_score = score
                    best_device = {
                        "id": dc['Name'],
                        "dev_name": alias,
                        "state": dc['Readings']['state'],
                        "best_score": best_score
                    }
            except KeyError:
                pass  # print("KeyError")
    LOG.debug("best device = %s" % best_device)
    return best_device
def get_ratio(row):
    """Token-sort ratio between ``clean_text`` and the row's ``'wordmark'``."""
    return fuzz.token_sort_ratio(clean_text, row['wordmark'])
def _query_matches(self, query: str, place: Place) -> bool:
    '''Report whether a query matches a place.

    The query matches when its partial ratio against the lower-cased place
    name exceeds 80, or when its token-sort ratio against any of the
    place's types exceeds 80.
    '''
    # Cheapest check first: the place name itself
    if fuzz.partial_ratio(query, place.name.lower()) > 80:
        return True

    # Otherwise any sufficiently similar type is enough
    for type_ in place.types:
        if fuzz.token_sort_ratio(query, type_) > 80:
            return True
    return False
def __score_result(tweet, search_criteria):
    """Token-sort ratio between the tweet text and the search content."""
    return fuzz.token_sort_ratio(tweet.text, search_criteria.content)
def fuzzy_string_match(str_ref, str_hyp):
    """Returns the token-sort ratio of two strings divided by 100.0."""
    # A larger value means the reference and hypothesis are more alike.
    ratio = fuzz.token_sort_ratio(str_ref, str_hyp)
    return ratio / 100.0
def prefit_compute(queried_movie_type, standard_query_data):
    """
    Build the feature matrix used to compare one queried title against the
    stored titles of the same kind.

    The stored frame is taken from ``dict_data`` under the lower-cased
    ``queried_movie_type``. The query is appended as the last row, missing
    values are filled, fuzzy-similarity columns against the query row are
    added, and the numeric feature columns are standardized.

    Parameters
    ----------
    queried_movie_type : str
        Key into ``dict_data`` (case-insensitive). ``"tv-show"`` and
        ``"korean drama"`` are handled as series; anything else as a movie.
    standard_query_data : sequence
        Values of the queried title, aligned with the columns of the stored
        frame. Index 0 is compared with the ``imdbID`` column.
        NOTE(review): on the movie path element 8 is overwritten in place,
        so the caller's object is mutated.

    Returns
    -------
    list
        ``[train_data_2d, query_data_1d, train_data_movie_titles_1d,
        distance_template, labels]``: the values of every row except the
        last, the values of the last (query) row, the ``title`` values of
        the non-query rows, the distance template for the movie/series
        case, and the column labels.

    NOTE(review): this source arrived with its line structure collapsed;
    the nesting below was reconstructed from the syntax and should be
    confirmed against the original file.
    """
    standard_train_data = dict_data[queried_movie_type.lower()]
    # Collapse the requested type into one of two modes: 'series' or 'movie'
    if queried_movie_type.lower() in ["tv-show","korean drama"]:
        queried_movie_type = "series"
    else:
        queried_movie_type = 'movie'
    train_data = standard_train_data.copy()
    if queried_movie_type == 'movie':
        # Keep only the part before an en dash and convert it to int
        # (presumably a year range such as "2010–2012"; TODO confirm).
        # NOTE(review): the bare except hides every failure here.
        try:
            standard_query_data[8] = int(standard_query_data[8].split('–')[0])
        except:
            pass
    # Remove the queried title from the stored rows, then add it back as
    # the last row so the query always sits at position -1.
    train_data = train_data[train_data['imdbID'] != standard_query_data[0]]
    # NOTE(review): DataFrame.append was removed in pandas 2.0; this line
    # needs pd.concat on newer pandas.
    train_data_with_query = train_data.append(pd.Series(index=train_data.columns, data=standard_query_data),ignore_index=True)
    if queried_movie_type != "series":
        train_data_with_query.drop('BoxOffice',axis=1,inplace=True)
    # Text-like columns: missing values become the literal 'Unknown'.
    # Failures (e.g. a missing column) are silently ignored.
    column_unknown = ['directors','writers', 'actors', 'production', 'country', 'language','Plot','Rated','Type','Genre']
    for c in column_unknown:
        try:
            train_data_with_query[c].fillna('Unknown',inplace=True)
        except:
            pass
    # Columns whose missing values are filled with the column median
    if queried_movie_type == 'movie':
        column_median = ['imdbRating', 'rottenTomatoRating','metacriticRating','duration','year']
    else:
        column_median = ['imdbRating','duration', 'totalSeasons']
    for c in column_median:
        try:
            train_data_with_query[c].fillna(train_data_with_query[c].median(),inplace=True)
        except Exception as err:
            raise err
    # Columns whose missing values are filled with 0 (failures ignored)
    if queried_movie_type == 'movie':
        column_zero = ['oscarNominations', 'noOfAwards', 'noOfNominations','imdbVotes']
    else:
        column_zero = ['noOfNominations','imdbVotes','noOfAwards']
    for c in column_zero:
        try:
            train_data_with_query[c].fillna(0,inplace=True)
        except:
            pass
    # Columns whose missing values are filled with the most frequent value
    if queried_movie_type == 'movie':
        column_mode = []
    else:
        column_mode = ['year']
    for c in column_mode:
        try:
            train_data_with_query[c].fillna(train_data_with_query[c].mode()[0],inplace=True)
        except Exception as err:
            raise err

    def eucld_dist(a,b):
        # absolute difference of two values
        return (abs(a-b))

    def extract_unique_nominations(x):
        # Appends every award name found in a "<count>:<name>;" string to
        # list_special_awards. Not called anywhere in this function (its
        # only use is in a commented-out line further down).
        try:
            for (value, nomination) in re.findall(r'(\d+?):([a-zA-Z\s]+);', x, re.IGNORECASE):
                list_special_awards.append(nomination)
        except:
            pass
        return x

    def extract_nom_val(x,nom):
        # Returns the count that precedes ":<nom>" in x, or 0 when absent.
        # NOTE(review): if re.findall raises (e.g. x is not a string) the
        # bare except swallows it and the function returns None.
        try:
            res = re.findall(fr'(\d+?):{nom}', x, re.IGNORECASE)
            if res:
                return int(res[0])
            else:
                return 0
        except:
            pass

    if queried_movie_type.lower() == 'series':
        # One count column per special award, parsed from 'specialNominations'
        list_special_awards = ['Golden Globe', 'Primetime Emmy']
        # movies_data_am_show['specialNominations'].apply(extract_unique_nominations)
        list_special_awards = list(set(list_special_awards))
        for spec_nom in list_special_awards:
            train_data_with_query[spec_nom] = train_data_with_query['specialNominations'].apply(lambda x: extract_nom_val(x,spec_nom))
            column_zero.append(spec_nom)
            train_data_with_query[spec_nom].fillna(0,inplace=True)
    if queried_movie_type == 'series':
        # Keep only the part of 'year' before an en dash, as int
        train_data_with_query['year'] = train_data_with_query['year'].apply(lambda x: int(x.split('–')[0]))
        column_zero.append('year');
    # Column label -> position, taken before the fuzzy columns are added
    dict_cols_init = dict((j,i) for i,j in enumerate(train_data_with_query.columns))
    # Coerce the query row's value in column position 3 to float64
    # (which column that is depends on the stored frame; TODO confirm)
    train_data_with_query.iloc[-1,3] = np.float64(train_data_with_query.iloc[-1,3])
    train_data_with_query['imdbRating']*=10
    # From here on standard_query_data is the query ROW of the frame,
    # no longer the caller's argument.
    standard_query_data = train_data_with_query.iloc[-1,:]
    # Fuzzy-similarity columns: token-sort ratio of each row's text against
    # the query row's text for the same field. Every new column is also
    # registered in dict_cols_init and queued for standardization.
    # NOTE(review): the query row is indexed with an integer position taken
    # from dict_cols_init; confirm this resolves as intended on the pandas
    # version in use.
    train_data_with_query['actor_fuzz'] = train_data_with_query['actors'].apply(lambda x:fuzz.token_sort_ratio(standard_query_data[dict_cols_init['actors']],x))
    dict_cols_init['actor_fuzz'] = len(dict_cols_init.values())
    column_median.append('actor_fuzz')
    train_data_with_query['language_fuzz'] = train_data_with_query['language'].apply(lambda x:fuzz.token_sort_ratio(standard_query_data[dict_cols_init['language']],x))
    dict_cols_init['language_fuzz'] = len(dict_cols_init.values())
    column_median.append('language_fuzz')
    train_data_with_query['rated_fuzz'] = train_data_with_query['Rated'].apply(lambda x:fuzz.token_sort_ratio(standard_query_data[dict_cols_init['Rated']],x))
    dict_cols_init['rated_fuzz'] = len(dict_cols_init.values())
    column_median.append('rated_fuzz')
    train_data_with_query['type_fuzz'] = train_data_with_query['Type'].apply(lambda x:fuzz.token_sort_ratio(standard_query_data[dict_cols_init['Type']],x))
    dict_cols_init['type_fuzz'] = len(dict_cols_init.values())
    column_median.append('type_fuzz')
    train_data_with_query['plot_fuzz'] = train_data_with_query['Plot'].apply(lambda x:fuzz.token_sort_ratio(standard_query_data[dict_cols_init['Plot']],x))
    dict_cols_init['plot_fuzz'] = len(dict_cols_init.values())
    column_median.append('plot_fuzz')
    train_data_with_query['genre_fuzz'] = train_data_with_query['Genre'].apply(lambda x:fuzz.token_sort_ratio(standard_query_data[dict_cols_init['Genre']],x))
    dict_cols_init['genre_fuzz'] = len(dict_cols_init.values())
    column_median.append('genre_fuzz')
    train_data_with_query['country_fuzz'] = train_data_with_query['country'].apply(lambda x:fuzz.token_sort_ratio(standard_query_data[dict_cols_init['country']],x))
    dict_cols_init['country_fuzz'] = len(dict_cols_init.values())
    column_median.append('country_fuzz')
    train_data_with_query['title_fuzz'] = train_data_with_query['title'].apply(lambda x:fuzz.token_sort_ratio(standard_query_data[dict_cols_init['title']],x))
    dict_cols_init['title_fuzz'] = len(dict_cols_init.values())
    column_median.append('title_fuzz')

    def scale_data(x, col):
        # x as a percentage of the current maximum of column `col`
        return ((x/train_data_with_query[col].max()) * 100)

    if queried_movie_type == 'series':
        # Zero out fuzzy scores that fall below a per-field cutoff
        train_data_with_query['plot_fuzz'] = train_data_with_query['plot_fuzz'].apply(lambda x: 0 if (x <70) else x)
        train_data_with_query['genre_fuzz'] = train_data_with_query['genre_fuzz'].apply(lambda x: 0 if (x <70) else x)
        train_data_with_query['rated_fuzz'] = train_data_with_query['rated_fuzz'].apply(lambda x: 0 if (x <100) else x)
        train_data_with_query['actor_fuzz'] = train_data_with_query['actor_fuzz'].apply(lambda x: 0 if (x <80) else x)
        train_data_with_query['title_fuzz'] = train_data_with_query['title_fuzz'].apply(lambda x: 0 if (x <80) else x)
        train_data_with_query['noOfAwards'] = train_data_with_query['noOfAwards'].apply(lambda x: scale_data(x, 'noOfAwards'))
        # NOTE(review): the next line reads 'noOfAwards' (already rescaled
        # above) but divides by the maximum of 'noOfNominations' and stores
        # the result in 'noOfNominations'. This looks like a copy-paste
        # slip; confirm whether 'noOfNominations' was meant as the source.
        train_data_with_query['noOfNominations'] = train_data_with_query['noOfAwards'].apply(lambda x: scale_data(x, 'noOfNominations'))
        train_data_with_query['imdbVotes'] = train_data_with_query['imdbVotes'].apply(lambda x: scale_data(x, 'imdbVotes'))
        # train_data_with_query['Primetime Emmy'] = train_data_with_query['Primetime Emmy'].apply(lambda x: scale_data(x, 'Primetime Emmy'))
        # train_data_with_query['Golden Globe'] = train_data_with_query['Primetime Emmy'].apply(lambda x: scale_data(x, 'Golden Globe'))
    else:
        train_data_with_query['title_fuzz'] = train_data_with_query['title_fuzz'].apply(lambda x: 0 if (x <80) else x)
    # NOTE(review): the result of this expression is discarded.
    train_data_with_query.sort_values(by="title_fuzz", ascending=False).head(10)
    if queried_movie_type == 'series':
        # Absolute difference in season count between each row and the query
        train_data_with_query['total_season_dist'] = train_data_with_query['totalSeasons'].apply(lambda x:eucld_dist(x, standard_query_data[dict_cols_init['totalSeasons']]))
        column_median.append('total_season_dist')
    column_genres = ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'FilmNoir', 'GameShow', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'RealityTV', 'Romance', 'SciFi', 'Short', 'Sport', 'TalkShow', 'Thriller', 'War', 'Western']

    def parse_genre(x):
        # Not called anywhere in this function.
        # NOTE(review): `int[x]` subscripts the int type and raises
        # TypeError, which the bare except turns into 0; `int(x)` was
        # probably intended.
        try:
            if int(x) in [0,1]:
                return int[x]
            else:
                return 0;
        except:
            return 0

    # Genre columns: missing values become 0
    for col in column_genres:
        train_data_with_query[col] = train_data_with_query[col].fillna(0)
    # `a` is not referenced again in this function
    a = train_data_with_query.copy()
    # Standardize every column collected in column_median and column_zero,
    # writing the scaled values back into the frame column by column.
    scaler = StandardScaler()
    to_be_standardized_features = column_median+column_zero
    to_be_standardized_data = train_data_with_query[to_be_standardized_features]
    train_data_with_query_standardized = scaler.fit_transform(to_be_standardized_data)
    for id_, c in enumerate(to_be_standardized_features):
        train_data_with_query[c] = train_data_with_query_standardized[:,id_]
    # dict_cols_final is built but not used below
    dict_cols_final = dict((j,i) for i,j in enumerate(train_data_with_query.columns))
    # The template is returned untouched. Its integer lists look like
    # positional column indices into the final frame, so they depend on the
    # exact column order produced above; TODO confirm with the consumer.
    if queried_movie_type == 'movie':
        distance_template = [
            [1, [12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39], 'c','d1',-1],
            [2, [50,52,53,54,55,3,4,5,6,7,8,10,11,51], 'e','d1',1]
        ]
    else:
        distance_template = [
            # [1, [*list(range(10,38))], 'c','d2',-1],
            [2, [54,55,52,50,8,9,4,3], 'e','d1',1]
        ]
    # Split the frame: every row but the last is stored data, the last row
    # is the query.
    train_data_2d = train_data_with_query.iloc[:-1,:].values
    labels = train_data_with_query.columns
    train_data_movie_titles_1d = train_data_with_query.iloc[:-1,:]['title'].values
    query_data_1d = train_data_with_query.iloc[-1,:].values
    return [train_data_2d, query_data_1d,train_data_movie_titles_1d,distance_template, labels]