from strsimpy.jaro_winkler import JaroWinkler


async def similarity(username: str, password: str) -> bool:
    """Return True when the password is sufficiently dissimilar to the username."""
    jw = JaroWinkler()
    res = jw.similarity(username, password)
    return res <= 0.6
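# Usage sketch: the coroutine returns True only when the candidate password
# is sufficiently dissimilar to the username (example values are hypothetical).
import asyncio

print(asyncio.run(similarity('alice', 'alice123')))  # False: too similar
print(asyncio.run(similarity('alice', 'x9kQ72wz')))  # True: dissimilar enough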
def compare_distance(self, other):
    """Compare two records: return the name similarity only when the
    addresses are close enough (> 0.85) to plausibly be the same place.
    """
    jarowinkler = JaroWinkler()
    dist_1 = jarowinkler.similarity(self.data['name'].lower(),
                                    other.data['name'].lower())
    dist_2 = jarowinkler.similarity(self.data['address'].lower(),
                                    other.data['address'].lower())
    if dist_2 > 0.85:
        return dist_1
    return 0
from strsimpy.jaro_winkler import JaroWinkler


def jaro_winkler(keyword, domain):
    """Compute the Jaro-Winkler similarity between a keyword and a domain.

    Args:
        keyword: string to compare against the domain.
        domain: domain name being checked.

    Returns:
        jarowinkler.similarity: (float) between 0.0 and 1.0
    """
    jarowinkler = JaroWinkler()
    return jarowinkler.similarity(keyword, domain)
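# Usage sketch (hypothetical values): scores close to 1.0 flag lookalike
# domains, e.g. in a typosquatting check.
print(jaro_winkler('paypal', 'paypa1'))   # high similarity
print(jaro_winkler('paypal', 'example'))  # much lower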
def check_for_text_match(self, str1, strlist):
    jarowinkler = JaroWinkler()
    # First pass: case-sensitive comparison.
    similarities = [jarowinkler.similarity(str1, str2) for str2 in strlist]
    index_max = np.argmax(similarities)
    if similarities[index_max] >= 0.70:  # compare the best score, not its index
        return strlist[index_max]
    # Second pass: retry case-insensitively.
    similarities = [jarowinkler.similarity(str1.lower(), str2.lower())
                    for str2 in strlist]
    index_max = np.argmax(similarities)
    if similarities[index_max] >= 0.70:
        return strlist[index_max]
    return None
def build_column(self, data):
    left_col, right_col, algo = (self.cfg.get(p) for p in ["left", "right", "algo"])
    normalized = self.cfg.get("normalized", False)
    if algo == "levenshtein":
        if normalized:
            similarity = strsimpy.normalized_levenshtein.NormalizedLevenshtein()
        else:
            similarity = strsimpy.levenshtein.Levenshtein()
    elif algo == "damerau-levenshtein":
        similarity = strsimpy.damerau.Damerau()
        if normalized:
            similarity = SimilarityNormalizeWrapper(similarity)
    elif algo == "jaro-winkler":
        similarity = JaroWinkler()
    elif algo == "jaccard":
        similarity = strsimpy.jaccard.Jaccard(int(self.cfg.get("k", 3)))
        if normalized:
            similarity = SimilarityNormalizeWrapper(similarity)
    else:
        raise ValueError(f"Unknown algo: {algo}")
    distances = apply(
        data[[left_col, right_col]].fillna(""),
        lambda rec: similarity.distance(*rec),
        axis=1,
    )
    return pd.Series(distances, index=data.index, name=self.name)
def similarity_function(similarity_measure):
    if similarity_measure == 'exact':
        return exact_similarity
    elif similarity_measure == 'mlcs':
        return lambda s1, s2: 1 - MetricLCS().distance(s1, s2)
    elif similarity_measure == 'nlevs':
        return lambda s1, s2: 1 - NormalizedLevenshtein().distance(s1, s2)
    elif similarity_measure == 'jaro':
        return JaroWinkler().similarity
    else:
        raise ValueError('Invalid similarity measure.')
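# Usage sketch: look up a measure by name and apply it to a string pair.
# The non-'jaro' branches assume exact_similarity, MetricLCS, and
# NormalizedLevenshtein are defined or imported elsewhere in the module.
jaro = similarity_function('jaro')
print(jaro('night', 'nacht'))  # float in [0.0, 1.0]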
def create_or_update_matches(obscurities: List[str]) -> None:
    if not os.path.exists('jaro_sim.csv'):
        print('Running experiment similarities...')
        jaro = JaroWinkler().similarity
        obscurity_count: Dict[str, int] = {
            x: obscurities.count(x) for x in obscurities
        }
        with open('jaro_sim.csv', 'a') as file_jaro_sim, \
                open('jaro_sim_orphan.csv', 'a') as file_jaro_sim_orphan:
            jaro_csv_writer = csv.writer(file_jaro_sim)
            orphan_csv_writer = csv.writer(file_jaro_sim_orphan)
            fields: List[str] = [
                'Obscurity', 'Count', 'KGV1', 'Similarity1', 'KGV2',
                'Similarity2', 'KGV3', 'Similarity3'
            ]
            jaro_csv_writer.writerow(fields)
            orphan_csv_writer.writerow(fields)
            current_known_good_values = get_known_good_values()
            for obscurity in sorted(set(obscurities)):  # dedupe, then sort
                sim_val: List[Tuple[str, float]] = []
                for good_value in current_known_good_values:
                    similarity_value: float = jaro(obscurity, good_value)
                    sim_val.append((good_value, similarity_value))
                sim_val.sort(reverse=True, key=operator.itemgetter(1))
                # Flatten the top three (value, score) pairs into one row.
                topthree: List[Union[str, float]] = [
                    x for pair in sim_val[:3] for x in pair
                ]
                output = [obscurity, obscurity_count[obscurity]] + topthree
                if obscurity_count[obscurity] <= 10:
                    orphan_csv_writer.writerow(output)
                else:
                    jaro_csv_writer.writerow(output)
    else:
        print('Simulation file already exists.')
import csv
import json

from strsimpy.jaro_winkler import JaroWinkler

COMPARE_MODE = ['csv', 'json', 'string']
METHOD = JaroWinkler()
SPLITTERS = ','
EXCLUSIONS = '_- '
REPLACEMENTS = {'0': 'Oo'}  # map each of 'O'/'o' to '0'
THRESHOLD = 0.98


def clean_string(string, splitters=SPLITTERS, exclusions=EXCLUSIONS,
                 replacements=REPLACEMENTS):
    s = string
    for splitter in splitters:
        s = s.split(splitter)[0]       # keep only the part before each splitter
    for exclusion in exclusions:
        s = s.replace(exclusion, '')   # drop separator characters
    for new, olds in replacements.items():
        for old in olds:
            s = s.replace(old, new)    # normalize lookalike characters
    s = s.lower()
    s = s.strip()
    return s


def compare_string(s1, s2, method=METHOD, clean=True):
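    # The source cuts off here; what follows is a minimal sketch of a
    # plausible body, assuming the intent is a thresholded similarity
    # check built from clean_string, METHOD, and THRESHOLD above.
    if clean:
        s1, s2 = clean_string(s1), clean_string(s2)
    return method.similarity(s1, s2) >= THRESHOLD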
]  # comment IDs of parent comments in WOdin mastery threads

# thread IDs for individual phys/mag-weak threads, grouped by element
wodinThreadIds = [
    ['k8pd7q'], ['k8petf'],  # lightning-weak
    ['kj1gdp'], ['kj1fcw'],  # water-weak
    ['lc3fe6'], ['lc3fey'],  # fire-weak
]
sbTypes = [
    'LBO', 'LBG', 'ADSB', 'SASB', 'AASB', 'GSB+', 'CSB', 'AOSB', 'USB',
    'OSB', 'GSB', 'BSB', 'SSB', 'Unique'
]  # cleanSbNames() maps to these
heroNameList = getHeroNameList()
strsim = JaroWinkler()  # string similarity module for catching typos/abbreviations

## Run for Dreambreaker
outputLines = []  # buffer to put output strings into
summaryLines = ['#Summary table\n\n\n']
appendTableHeader(summaryLines, sbTypes)
summaryLines[-2] = summaryLines[-2].replace('|Hero|Used', '|Realm')
summaryLines[-1] = summaryLines[-1][4:]
teamTableTextLines = []
for threadId in dbThreadIds:
    submission = reddit.submission(id=threadId[0])
    threadTitle = submission.title
    # Extract the realm name from the parenthesized part of the title.
    realm = threadTitle[threadTitle.find("(") + 1:threadTitle.find(")")]
    print('\n*****************\n{}\n*****************\n'.format(threadTitle))
    commentsList = []
    postUrl = []
class STSWikiReader:
    """Reads data from website, creates a lookup map of item names, and does
    soft string matching to find possible mentions of the item parsed
    """
    strcmp = JaroWinkler()

    def __init__(self, name, links, ignore_list, parse_names):
        self.last_update = datetime.datetime.utcnow()
        self.name = name
        self.links = links
        self.ignore_list = ignore_list
        self.parse_names = parse_names
        self.base_set = set()
        self.real_names = set()
        self.fake_name_map = dict()
        self.cur = None
        self.max_name_word_cnt = 0
        self.max_match = 0
        self.FORCE_IGNORE_NAME = '~~FORCE~IGNORE~~'
        self.update_info()

    def format_name(self, name):
        """Used to get a clean, uniform name with pesky characters removed"""
        return self._rm_double_space(
            self._rm_symbol(self._rm_squote(self._rm_hyph(name.lower()))))

    def _rm_symbol(self, name):
        """removes odd characters that should never be in an obj name"""
        return name.replace('?', ' ').replace(',', ' ').replace('.', ' ') \
            .replace('!', ' ').replace('(', ' ').replace(')', ' ') \
            .replace(':', ' ').replace('"', ' ').replace('+', ' ') \
            .replace('[', ' ').replace(']', ' ')

    def _rm_squote(self, name):
        """removes single quotes"""
        return name.replace("'", '').replace('’', '')

    def _lower(self, name):
        """exists to pass along to alternative names func"""
        return name.lower()

    def _rm_hyph(self, name):
        """swaps typical joining characters with spaces"""
        return name.replace('-', ' ').replace('_', ' ')

    def _rm_beta(self, name):
        """removes beta tag (possible error from wiki)"""
        return name.replace('_beta', '').replace('_Beta', '') \
            .replace('Beta', '').replace('beta', '')

    def _append_s(self, name):
        """makes things plural (simple method prone to error, but will do
        for now)
        """
        return f'{name}s'

    def _rm_double_space(self, name):
        while '  ' in name:
            pos = name.find('  ')
            name = name[:pos] + name[pos + 1:]
        return name

    def _rm_article_at_start(self, name):
        articles = ['the', 'a', 'an']
        test_name = name.lower()
        for article in articles:
            if test_name.startswith(article + ' '):
                return name[len(article) + 1:]
        return name

    def _gen_alternative_names(self, name):
        """creates a massive list of possible mistypes for a specific name,
        used as an aid for matching user input
        """
        names = set()
        actions = [
            self._rm_symbol, self._rm_squote, self._lower,
            self._rm_article_at_start, self._rm_hyph, self._rm_beta,
            self._append_s
        ]
        # Weird edge case for beta tag on wiki vs beta the card
        if name.lower().strip() == 'beta':
            actions.remove(self._rm_beta)
        for outer in range(len(actions)):
            temp_name = name
            for inner in range(len(actions) - outer):
                temp_name = self._rm_double_space(actions[outer + inner](temp_name))
                names.add(temp_name)
        return list(names)

    def update_info(self):
        """goes to the web and finds information provided by the links"""
        log(f'Updating {self.name}s...')
        seen_list = set()
        # fetch data from links and update object with most recent info
        for link in self.links:
            res = requests.get(link, verify=False)
            for cur_name in self.parse_names(
                    soup(res.text, features="html.parser")):
                if cur_name.lower() in self.ignore_list:
                    continue
                seen_list.add(cur_name)
                # if we haven't seen it before, add it to our lookup list.
                if (cur_name not in self.base_set) \
                        and (not cur_name.startswith('Category:')):
                    self.base_set.add(cur_name)
                    self.real_names.add(cur_name)
                    self.fake_name_map[cur_name] = cur_name
                    self.max_name_word_cnt = max(self.max_name_word_cnt,
                                                 len(cur_name.split(' ')))
                    for new_name in self._gen_alternative_names(cur_name):
                        if new_name.strip():
                            self.base_set.add(new_name)
                            self.fake_name_map[new_name] = cur_name

        # handle deleted data from wiki
        recalc_max_name_word_cnt = False
        for cur_name in self.real_names - seen_list:
            for new_name in self._gen_alternative_names(cur_name):
                self.base_set.remove(new_name)
                del self.fake_name_map[new_name]
            if not recalc_max_name_word_cnt \
                    and self.max_name_word_cnt == len(cur_name.split(' ')):
                recalc_max_name_word_cnt = True
            self.base_set.remove(cur_name)
            self.real_names.remove(cur_name)
            del self.fake_name_map[cur_name]
        if recalc_max_name_word_cnt:
            self.max_name_word_cnt = 0
            for cur_name in self.real_names:
                self.max_name_word_cnt = max(self.max_name_word_cnt,
                                             len(cur_name.split(' ')))

        # finalize update
        self.last_update = datetime.datetime.utcnow()
        log(f'Found {len(self.real_names)} {self.name}s')

    def check_if_similar(self, name):
        """uses similarity check to see if the passed in name may match any
        of our found or generated names
        """
        name = self.format_name(name)
        split_name = name.split(' ')
        word_thresh = 0.9 ** len(split_name)
        self.max_match = 0
        self.cur = None
        for item_name in self.base_set:
            split_item_name = item_name.split(' ')
            if len(split_name) == len(split_item_name):
                word_check = 1
                for i in range(len(split_name)):
                    # compare each word pair forwards and backwards, so both
                    # prefixes and suffixes are weighted by Jaro-Winkler
                    word_check *= self.strcmp.similarity(
                        split_name[i], split_item_name[i])
                    word_check *= self.strcmp.similarity(
                        split_name[i][::-1], split_item_name[i][::-1])
                if word_check > self.max_match:
                    self.max_match = word_check
                if word_check >= word_thresh:
                    self.cur = self.fake_name_map[item_name]
        return self.cur is not None

    def check_if_exists(self, name, update=True):
        """Used to check if a name is a perfect match for any found names or
        is close enough to call a match
        """
        if update and datetime.datetime.utcnow() - self.last_update \
                > datetime.timedelta(days=15):
            self.update_info()
        if name.lower() in self.ignore_list:
            self.cur = self.FORCE_IGNORE_NAME
            self.max_match = 1
            return True
        res = name in self.real_names
        if res:
            self.cur = name
            self.max_match = 1
        elif name in self.fake_name_map:
            self.cur = self.fake_name_map[name]
            self.max_match = 1
            res = True
        else:
            res = self.check_if_similar(name)
        return res
from strsimpy.damerau import Damerau
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.ngram import NGram
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.optimal_string_alignment import OptimalStringAlignment
from strsimpy.qgram import QGram

qgram = QGram(2)
print(qgram.distance('ABCD', 'ABCE'))

twogram = NGram(2)
print(twogram.distance('ABCD', 'ABTUIO'))

s1 = 'Adobe CreativeSuite 5 Master Collection from cheap 4zp'
s2 = 'Adobe CreativeSuite 5 Master Collection from cheap d1x'
fourgram = NGram(4)
print(fourgram.distance(s1, s2))

jarowinkler = JaroWinkler()
print(jarowinkler.similarity('My string', 'My tsring'))
print(jarowinkler.similarity('My string', 'My ntrisg'))

optimal_string_alignment = OptimalStringAlignment()
print(optimal_string_alignment.distance('CA', 'ABC'))

damerau = Damerau()
print(damerau.distance('ABCDEF', 'ABDCEF'))
print(damerau.distance('ABCDEF', 'BACDFE'))
print(damerau.distance('ABCDEF', 'ABCDE'))
print(damerau.distance('ABCDEF', 'BCDEF'))
print(damerau.distance('ABCDEF', 'ABCGDEF'))
print(damerau.distance('ABCDEF', 'POIU'))

normalized_levenshtein = NormalizedLevenshtein()
from faktotum.typing import Entities, Pipeline, TaggedTokens
from faktotum.utils import (
    align_index,
    cosine_similarity,
    extract_features,
    get_best_candidate,
    group_mentions,
    pool_tokens,
    predict_labels,
    sentencize,
    vectorize_context,
)

NER_MODELS = NamedEntityRecognition()
NED_MODELS = NamedEntityDisambiguation()
JARO_WINKLER = JaroWinkler()


def nel(text: str, kb: KnowledgeBase, domain: str) -> TaggedTokens:
    """Named Entity Linking.

    Parameters
    ----------
    text : str
        The text to process.
    kb : KnowledgeBase
        The knowledge base to link entities.
    domain : str
        Domain of the text, either `literary-texts` or `press-texts`.

    Returns
last_song = json.loads(f.read())
f.close()
logger.write('  JSON: Last song was %s by %s\n' % (last_song[0], last_song[1]))

############### LIKING Song #######
if liked:
    network.get_track(artist, title).love()
    logger.write('  LOVE: Loved the Song on LastFm\n')
else:
    network.get_track(artist, title).unlove()
    logger.write('  LOVE: Unloved the Song on LastFm\n')

############### String Compare ####
jarowinkler = JaroWinkler()
# Check so that this program doesn't scrobble the song multiple times.
if last_song[0] != title:
    last_scrobble = network.get_user(
        lastFmCreds['username']).get_recent_tracks(limit=1)
    logger.write('  LastFM: Last song was %s by %s\n' %
                 (last_scrobble[0][0].title, last_scrobble[0][0].artist))
    # Check that "nobody else" scrobbled the song.
    if jarowinkler.similarity(str(last_scrobble[0][0].title.lower()),
                              title.lower()) < 0.9:
        unix_timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
        if 'album' in locals():
            network.scrobble(artist=artist, title=title,
f = open("akcigerhastaligi.txt", encoding="utf8")
df = f.read()


def basic_clean(text):
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('turkish')
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]


words = basic_clean(df)
unigrams = nltk.ngrams(words, 1)
unigramsFrequency = Counter(unigrams)
valuesOfUnigrams = list(unigramsFrequency.values())
unigramlist = list(unigramsFrequency)
for x in range(0, len(unigramlist)):
    if valuesOfUnigrams[x] > 4:
        print(unigramlist[x], "is used", valuesOfUnigrams[x], "times")

# Jaro-Winkler similarity between correctly and incorrectly spelled Turkish
# medical terms ('öksürük' = cough, 'akciğer' = lung, 'kanser' = cancer).
jarowinkler = JaroWinkler()
print(jarowinkler.similarity('öksürük', 'öksürk'))
print(jarowinkler.similarity('akciğer', 'akciğr'))
print(jarowinkler.similarity('kanser', 'akciğr'))
print(jarowinkler.similarity('kanser', 'öksürk'))
print(jarowinkler.similarity('akciğer', 'öksürk'))
print('öksürk ' 'öksürük')
print('akciğr ' 'akciğer')
print('kansr ' 'kanser')
from strsimpy.jaro_winkler import JaroWinkler


def string_distance(a, b):
    """Combined score: Jaro-Winkler similarity plus Jaccard similarity.

    Note: despite the name, this returns a similarity-like score, higher
    meaning more alike; jaccard() is assumed to be defined elsewhere.
    """
    jarowinkler = JaroWinkler()
    return jarowinkler.similarity(a, b) + jaccard(a, b)
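# Usage sketch: jaccard() is not shown in the source, so a minimal
# character-set Jaccard similarity stands in here purely for the demo.
def jaccard(a, b):
    sa, sb = set(a), set(b)
    return len(sa & sb) / len(sa | sb)


print(string_distance('night', 'nacht'))  # combined score in [0.0, 2.0]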
# from .ngram import NGram
# from .normalized_levenshtein import NormalizedLevenshtein
# from .optimal_string_alignment import OptimalStringAlignment
# from .qgram import QGram
# from .shingle_based import ShingleBased
# from .sorensen_dice import SorensenDice
# from .string_distance import StringDistance
# from .string_similarity import StringSimilarity
# from .weighted_levenshtein import WeightedLevenshtein
# from .sift4 import SIFT4Options, SIFT4

cosine = Cosine(2)
sorensenDice = SorensenDice(2)
jaccard = Jaccard(2)
qgram = QGram(2)
jaroWinkler = JaroWinkler()
normalizedLevenshtein = NormalizedLevenshtein()
stringSimilarity = StringDistance()

s0 = '烟台大学人文学院'
s1 = '江西农业大学'
# print(cosine.get_profile(s1))
# print(sorensenDice.get_profile(s1))
print(cosine.similarity(s0, s1))
print(sorensenDice.similarity(s0, s1))
print(jaccard.similarity(s0, s1))
print(jaroWinkler.similarity(s0, s1))
print(normalizedLevenshtein.similarity(s0, s1))
# print(qgram.distance(s0, s1))

similarity_list = [jaroWinkler, cosine, jaccard, normalizedLevenshtein]
er_process_with_similarity(path_o1, path_o2, path_t,
    if title.find('(') > -1:
        strA = title[:title.find('(')]
        strB = title[title.find('(') + 1:title.find(',')]
        return [strA, strB]


# stringsAB = [[[a, b] for a, b in getStringsAB(str(title))]
#              for title in df['original_title'].to_list()]
stringsAB = [getStringsAB(str(title)) for title in df['title'].to_list()]
npStringsAB = np.array(stringsAB)

# %%
levenshtein = Levenshtein()
normalized_levenshtein = NormalizedLevenshtein()
damerau = Damerau()
optimal_string_alignment = OptimalStringAlignment()
jarowinkler = JaroWinkler()
lcs = LongestCommonSubsequence()
metric_lcs = MetricLCS()
twogram = NGram(2)
qgram = QGram(2)
cosine = Cosine(2)

strAs = npStringsAB[:, 0].tolist()
strBs = npStringsAB[:, 1].tolist()
results = {
    'str A': strAs,
    'str B': strBs,
    # 'Levenshtein': [
    #     levenshtein.distance(str1a, str1b),
csv_reader = csv.reader(file)
lst_experiments = next(csv_reader)

obscurities: List[str] = []
for obscurity in lst_experiments:
    if obscurity not in known_good_values \
            and obscurity not in pynmrstar.definitions.NULL_VALUES:
        obscurities.append(obscurity)

obscurity_count: Dict[str, int] = {x: obscurities.count(x) for x in obscurities}

jaro = JaroWinkler().similarity
sift = SIFT4().distance
threegram = NGram(3).distance
mapping = {'jaro': jaro, 'sift': sift, 'threegram': threegram}

for algorithm in mapping:
    print(f'Running {algorithm}...')
    with open(f'{algorithm}_sim.csv', 'w') as file_a, \
            open(f'{algorithm}_sim_orphan.csv', 'w') as file_b:
        csv_writer_a = csv.writer(file_a)
        csv_writer_b = csv.writer(file_b)
        fields: List[str] = [
            'Obscurity', 'Count', 'KGV1', 'Similarity1', 'KGV2',
            'Similarity2', 'KGV3', 'Similarity3'