def get_similarities(string, strings, method='jaro_winkler', similarity_function=None,
                     case_sensitivity=1.0, first_char_weight=0.0, first_word_weight=0.0, echo=0):
    """
    Score one reference string against every entry of ``strings``.

    The reference is coerced with ``str()`` and ``get_similarity`` is called
    once per entry, driven through ``ProgressBar.map``. The scoring options
    are forwarded to ``get_similarity`` unchanged.

    :param string: reference value; converted to ``str`` before comparison
    :type strings: list[str]
    :param str method: can be one of 'jaro_winkler', 'levenshtein',
        'sentence_jaro_winkler', 'sentence_levenshtein'
    :param similarity_function: forwarded as-is to ``get_similarity``
    :type case_sensitivity: float
    :type first_char_weight: float
    :type first_word_weight: float
    :param echo: progress verbosity; negative values are treated as 0
    :return: one ``get_similarity`` result per entry of ``strings``, in order
    :rtype: list
    """
    echo = max(0, echo)
    reference = str(string)
    # Label shown by the progress bar: the reference string followed by ' ? '.
    progress_label = f'{reference} ? '

    def _score(other):
        # Compare the fixed reference with a single entry of ``strings``.
        return get_similarity(
            s1=reference,
            s2=other,
            method=method,
            similarity_function=similarity_function,
            first_char_weight=first_char_weight,
            case_sensitivity=case_sensitivity,
            first_word_weight=first_word_weight,
        )

    scores = ProgressBar.map(
        function=_score,
        iterable=strings,
        iterable_text=strings,
        text=progress_label,
        echo=echo,
    )
    return list(scores)
def find_most_similar(strings, candidates, candidate_ids=None, string_ids=None, method='jaro_winkler',
                      similarity_function=None, case_sensitivity=1.0, first_char_weight=0.0,
                      first_word_weight=0.0, num_results=1, echo=0):
    """
    Find the most similar candidates for one or several query strings.

    ``find_most_similar_for_one_string`` is called once per query string and
    the per-string results are concatenated into a single data frame.

    :param strings: a single query string or a list of query strings; a bare
        ``str`` is treated as a one-element list
    :type strings: str or list[str]
    :type candidates: list[str]
    :type candidate_ids: list or NoneType
    :param string_ids: ids paired positionally with ``strings``; defaults to
        ``range(len(strings))``
    :type string_ids: list or NoneType
    :param str method: can be one of 'jaro_winkler', 'levenshtein',
        'sentence_jaro_winkler', 'sentence_levenshtein'
    :param similarity_function: forwarded as-is to
        ``find_most_similar_for_one_string``
    :type case_sensitivity: float
    :type first_char_weight: float
    :type first_word_weight: float
    :param num_results: forwarded as-is to ``find_most_similar_for_one_string``
    :param echo: progress verbosity; negative values are treated as 0 and
        ``echo - 1`` is passed down to the per-string search
    :return: frame with columns 'string_id', 'string', 'candidate_id',
        'candidate', 'similarity_rank', 'similarity' and a fresh 0..n-1 index;
        empty (columns only) when there are no query strings
    :rtype: pd.DataFrame
    """
    columns = ['string_id', 'string', 'candidate_id', 'candidate', 'similarity_rank', 'similarity']
    echo = max(0, echo)

    # A bare string would otherwise be zipped character by character, so each
    # character would be matched as its own query.
    if isinstance(strings, str):
        strings = [strings]

    if string_ids is None:
        string_ids = range(len(strings))

    pairs = list(zip(strings, string_ids))
    if not pairs:
        # pd.concat raises ValueError on an empty sequence; return an empty
        # frame with the expected columns instead.
        return pd.DataFrame(columns=columns)

    def _match_one(pair):
        # pair is (query string, query id)
        return find_most_similar_for_one_string(
            string=pair[0],
            string_id=pair[1],
            candidates=candidates,
            candidate_ids=candidate_ids,
            method=method,
            similarity_function=similarity_function,
            case_sensitivity=case_sensitivity,
            first_char_weight=first_char_weight,
            first_word_weight=first_word_weight,
            num_results=num_results,
            echo=echo - 1,
        )

    frames = list(
        ProgressBar.map(
            function=_match_one,
            iterable=pairs,
            iterable_text=strings,
            echo=echo,
        )
    )
    return pd.concat(frames).reset_index(drop=True)[columns]