def is_chain(venue_id):
    """Heuristically decide whether a venue belongs to a chain.

    An explicit chain property on the venue wins outright; otherwise the
    function counts venues with a near-identical name (Levenshtein ratio
    > 0.95) both globally and within 5000 m and compares each count
    against the total number of hits.

    NOTE(review): both proportions are computed as total/similar, so they
    are >= 1.0 whenever any similar name exists -- possibly the intent was
    similar/total; confirm before trusting the 0.9 threshold.
    Python 2 code: relies on filter() returning a list.
    """
    vs = VenueSearcher()
    venue_data = vs.get_venue_json(venue_id)
    # explicit chain flag short-circuits the heuristics
    if vs.venue_has_chain_property(venue_data):
        return True
    # global search: count venues whose name is nearly identical
    global_venues = vs.global_search(venue_data['name'])
    global_similar_name_count = len(filter(lambda x: ratio(x, venue_data['name']) > 0.95, [venue['name'] for venue in global_venues]))
    if len(global_venues) > 1 and global_similar_name_count > 0:
        global_proportion = float(len(global_venues))/global_similar_name_count
    else:
        global_proportion = 0
    # local search within a 5000 m radius, same near-duplicate counting
    local_venues = vs.local_search(venue_data, venue_data['name'], 5000)
    local_similar_name_count = len(filter(lambda x: ratio(x, venue_data['name']) > 0.95, [venue['name'] for venue in local_venues]))
    if len(local_venues) > 1 and local_similar_name_count > 0:
        local_proportion = float(len(local_venues))/local_similar_name_count
    else:
        local_proportion = 0
    # either signal above 0.9 marks the venue as a chain
    if global_proportion > 0.9 or local_proportion > 0.9:
        return True
    else:
        return False
def infer_function(question, new, old):
    """Decide whether choice `new` is a renamed version of choice `old`.

    Known renames in `default_map` are auto-confirmed by Levenshtein
    ratio (> 0.97); anything else falls back to an interactive y/n
    prompt, unless the similarity is below the module-level
    `options['ignore']` threshold. Python 2 code (print / raw_input).
    """
    # default map translations that need no manual confirmation
    # (this should go to a separate file later)
    default_map = {
        'Repeated collection(more than once)': 'Repeated collection (specify frequency and/or time interval) ',
        'Subgroup analyzed (eg. Dementia)': 'Subgroup analyzed (eg. Dementia, please specify subgroup)'
    }
    try:
        # auto-accept when `new` is essentially the known replacement text
        if ratio(unicode(default_map[old]), unicode(new)) > 0.97:
            return True
        return False
    except KeyError:
        # `old` has no default mapping -- fall through to manual review
        print "Not default mapping, manual input required"
    input = None  # NOTE(review): shadows the `input` builtin
    # Ignore low scores automatically
    if ratio(old, new) < float(options['ignore']):
        return False
    while not (input == 'y' or input == 'n'):
        print """The number of new choices missing processing for question %s is 1, there could be a non obvious match.\n Is '%s' a change of '%s' ? (y/n) """ % (question, new, old)
        input = raw_input()
    if input == 'y':
        return True
    return False
def cleanCommodities(self, data):
    """Normalize commodity names (column 0) and LOW/MED/HIGH level cells
    (columns 4 and 6) of each OCR'd row in `data`, in place.

    Commodity cells are snapped to the closest entry of self.comm_list
    by Levenshtein distance, with a confidence score and a list of close
    alternatives; level cells are snapped to the best self.levels match.
    Python 2 code (xrange/unicode).
    """
    for i in xrange(len(data)):
        if not data[i][0] is None:
            mindist = 100      # best (smallest) distance seen so far
            topcomm = ""       # commodity achieving that distance
            alternatives = []  # all candidates within distance 7
            for comm in self.comm_list:
                dist = distance(data[i][0].value, unicode(comm))
                if dist < 7:
                    alternatives.append((unicode(comm), dist))
                if dist < mindist:
                    mindist = dist
                    topcomm = comm
                    if dist == 0:
                        # exact match: accept with full confidence
                        data[i][0].value = topcomm
                        data[i][0].confidence = 1.0
                        break
            optional_values = [j[0] for j in alternatives]
            # short cell values get a stricter acceptance threshold
            maxdist = 4
            if len(data[i][0].value) < 5:
                maxdist = 3
            if mindist < maxdist:
                data[i][0].value = topcomm
                if mindist < 2:
                    data[i][0].confidence = 1.0
                else:
                    data[i][0].confidence = 0.7
                if mindist != 0:
                    data[i][0].optional_values = [data[i][0].value] + optional_values
            else:
                # no acceptable match: keep the value, flag zero confidence
                data[i][0].confidence = 0.0
                data[i][0].optional_values = [data[i][0].value] + optional_values
        # LOW MED HIGH
        if not data[i][4] is None:
            topratio = 0.0
            toplev = ""
            for lev in self.levels[self.lang]:
                rat = ratio(data[i][4].value, unicode(lev))
                if rat > topratio:
                    topratio = rat
                    toplev = lev
            data[i][4].value = toplev
        if not data[i][6] is None:
            topratio = 0.0
            toplev = ""
            for lev in self.levels[self.lang]:
                rat = ratio(data[i][6].value, unicode(lev))
                if rat > topratio:
                    topratio = rat
                    toplev = lev
            data[i][6].value = toplev
def levenshtein_ok(fl_title, fl_artist, ls_artist, ls_title):
    """Return True when both title and artist are close matches.

    Two tracks are considered the same when the Levenshtein ratio of
    the titles AND of the artists both exceed 0.80.

    Fix: the original implicitly returned None when the title check
    failed; every path now returns an explicit bool.
    """
    # TODO We should really examine these fields and see which are most unique.
    # We should also examine various threshold values
    if ratio(fl_title, ls_title) <= 0.80:
        return False
    return ratio(fl_artist, ls_artist) > 0.80
def match_platform(self, test_platform):
    """Return the platform dict from platform.full_list whose name,
    shortcode or alias is most similar to `test_platform`
    (case-insensitive Levenshtein ratio)."""
    test_platform = test_platform.lower()
    best_score = 0
    best_platform = {}
    for candidate in platform.full_list:
        # best of the three aliases for this candidate
        score = ratio(test_platform, candidate['name'].lower())
        shortcode_score = ratio(test_platform, candidate['shortcode'].lower())
        if shortcode_score > score:
            score = shortcode_score
        alias_score = ratio(test_platform, candidate['alias'].lower())
        if alias_score > score:
            score = alias_score
        if score > best_score:
            best_score = score
            best_platform = candidate
    return best_platform
def checkForRumorWords(uLang,tLang,text):
    """Return True if `text` fuzzily contains any rumor keyword.

    Only runs when either language is Spanish ("es"). Each keyword is
    compared against single words (ratio >= 0.9) and against bigrams
    and trigrams of the text (ratio >= 0.8). Python 2 code (print
    statement); `keywords` is a module-level list.
    """
    if uLang != "es" and tLang != "es":
        return False
    flag = False
    text = removePunctuations(text)
    text = unidecode(text)  # strip accents down to plain ASCII
    for word1 in keywords:
        if len(word1) >2:
            # 1) compare keyword against each single word
            for word2 in text.split(" "):
                if ratio(word1.lower(),word2.lower()) >= 0.9:
                    flag = True
                    break
            if flag:
                break
            # 2) compare keyword against bigrams
            # NOTE(review): nltk.bigrams runs on the raw string, so `pairs`
            # are character pairs, not word pairs -- confirm intent
            for pairs in nltk.bigrams(text):
                pairWords = pairs[0]+' '+pairs[1]
                if ratio(word1.lower(), pairWords.lower()) >= 0.8:
                    print pairWords
                    flag = True
                    break
            if flag:
                break
            # 3) compare keyword against trigrams (same caveat as above)
            for tris in nltk.trigrams(text):
                trisWords = tris[0]+ ' ' + tris[1] + ' '+tris[2]
                if ratio(word1.lower(), trisWords.lower()) >=0.8:
                    print trisWords
                    flag = True
                    break
            if flag:
                break
    return flag
def computeMeanAveragePrecision(robot, layer, media, shots, qRelevant):
    """Compute mean average precision of a person-naming submission.

    Collects (shot, person_name, confidence) annotations for the given
    media, ranks them by decreasing confidence, then for each query in
    qRelevant keeps the shots whose person name matches the query with
    a Levenshtein ratio >= 0.95 and averages the per-query average
    precisions. Python 2 code (iteritems).
    """
    # load submission
    qReturned = []
    for medium in media:
        for annotation in robot.getAnnotations(layer=layer, medium=medium):
            shot = annotation.fragment
            # ignore annotations outside the evaluated shot set
            if shot not in shots:
                continue
            personName = annotation.data.person_name
            confidence = annotation.data.confidence
            qReturned.append((shot, personName, confidence))
    # sort submitted shot in decreasing confidence order
    qReturned = sorted(qReturned, key=lambda s: s[2], reverse=True)
    # per query average precision
    qAveragePrecision = {}
    for query, relevant in qRelevant.iteritems():
        # filter shots by Levenshtein distance to query
        returned = [s for s, p, _ in qReturned if ratio(query, p) >= 0.95]
        # average precision for this query
        qAveragePrecision[query] = computeAveragePrecision(returned, relevant)
    # mean average precision
    mAP = np.mean(qAveragePrecision.values())
    return mAP
def calculate_similarity(str1, str2):
    """Return the Levenshtein similarity ratio of two strings (0.0-1.0).

    Fix: the previous docstring was a large block of dead, commented-out
    TF-IDF/cosine-similarity code referencing names that do not exist in
    this function (self, ID, terms, q_tfidf, ...); it was misleading and
    has been removed.
    """
    return ratio(str1, str2)
def find_suitable_el(name, collection):
    """Return the element of `collection` most similar to `name`.

    Uses the Levenshtein ratio; the best element is returned only when
    its score is at least 0.65, otherwise (or when the collection is
    empty) None.

    Fixes: garbled docstring ("most string"); max() raised ValueError
    on an empty collection; the generator variable shadowed the result
    variable.
    """
    if not collection:
        return None
    best_score, best_el = max((ratio(name, candidate), candidate) for candidate in collection)
    if best_score >= 0.65:
        return best_el
def find_similar_words(db, config, request):
    """Edit distance function.

    Expands each query word group with every "regular" word whose
    normalized form matches a query word at a Levenshtein ratio of at
    least request.approximate_ratio. Results are cached on disk, keyed
    by a SHA-256 of the query string and the ratio.

    Fix: the cache file written at the end was never closed; it is now
    written inside a `with` block so the data is flushed reliably.
    """
    # Check if lookup is cached
    hashed_query = hashlib.sha256()
    hashed_query.update(request['q'].encode("utf8"))
    hashed_query.update(str(request.approximate_ratio).encode('utf8'))
    approximate_filename = os.path.join(config.db_path, "data/hitlists/%s.approximate_terms" % hashed_query.hexdigest())
    if os.path.isfile(approximate_filename):
        with open(approximate_filename, encoding="utf8") as fh:
            approximate_terms = fh.read().strip()
        return approximate_terms
    query_groups = get_all_words(db, request)
    file_path = os.path.join(config.db_path, "data/frequencies/normalized_word_frequencies")
    new_query_groups = [set([]) for i in query_groups]
    with open(file_path, encoding="utf8") as fh:
        for line in fh:
            line = line.strip()
            try:
                normalized_word, regular_word = line.split('\t')
                for pos, query_group in enumerate(query_groups):
                    for query_word in query_group:
                        if ratio(query_word, normalized_word) >= float(request.approximate_ratio):
                            new_query_groups[pos].add(regular_word)
            except ValueError:
                # malformed frequency line -- skip it
                pass
    new_query_groups = ' '.join([" | ".join(group) for group in new_query_groups])
    # FIX: write the cache through a context manager (was left unclosed)
    with open(approximate_filename, "w", encoding="utf8") as cached_file:
        cached_file.write(new_query_groups)
    return new_query_groups
def number_match(fileparse):
    """Resolve noun phrases that look like the word "number" (or a
    synonym) to the longest digit/whitespace run found in the parses,
    linking the NP to a (possibly new) coreference entry in
    fileparse.nps."""
    synonyms = set({u'number', u'integer', u'figure', u'digit', u'character', u'symbol', u'cardinal', u'ordinal', u'amount', u'quanity', u'total', u'aggregate', u'tally', u'quota', u'limit'})
    pattern = r'[\d\s]+'
    # only consider NPs that are not yet resolved
    for cid in {k: v for k, v in fileparse.nps.items() if not v.get('ref')}:
        np_text = fileparse.nps[cid]['text'].lower()
        if not any(ratio(np_text, syn) > .9 for syn in synonyms):
            continue
        # collect every digit/whitespace run across all parses
        numbers = []
        for parse in fileparse.parses:
            numbers.extend(findall(pattern, parse.text))
        longest = max(numbers, key=len) if numbers else ''
        if longest:
            aid = _get_cid(fileparse.nps, longest, cid)
            if not aid:
                aid = _mk_coref_id()
                fileparse.nps[aid] = {'text': longest, 'ref': None}
            fileparse.nps[cid]['ref'] = aid
def _build_match_list(self, query):
    """Return (similarity, doc_id, text, doc) tuples for every entry in
    self._data, ordered by descending Levenshtein similarity to
    `query` (similarity is in [0, 1])."""
    scored = [(ratio(text, query), doc_id, text, doc) for (doc_id, text, doc) in self._data]
    return sorted(scored, key=itemgetter(0), reverse=True)
def process_mean_distance_labels(parent, current, differ):
    """Mean Levenshtein-based distance between parent and current
    labels across all changed languages; 0.0 when nothing changed."""
    changed = differ.changed()
    if not changed:
        return 0.0
    total = sum(1 - ratio(current.labels[lang], parent.labels[lang]) for lang in changed)
    return total / len(changed)
def find_lst(word, lst, rat=0.8, ld=2, let=0.75):
    """Return words from `lst` spelled similarly to `word`.

    A candidate matches when (1) its Levenshtein ratio with `word` is at
    least `rat`, (2) the length difference is at most `ld`, and (3) at
    least `let` (a fraction) of its letters also occur in `word`.
    Matches are returned best-ratio first. Case-insensitive.

    Raises ValueError on any argument of the wrong type or out of range.

    Fix: the `let` validation error message was garbled ("Inputtud
    'led' ..."); it now names the right parameter.
    """
    from Levenshtein import ratio
    from math import floor
    if not isinstance(lst, list):
        raise ValueError("Inputted 'lst' must be list")
    if not isinstance(word, str):
        raise ValueError("Inputted 'word' must be string.")
    if not isinstance(rat, (int, float)) or (rat > 1) or (rat < 0):
        raise ValueError("Inputted 'rat' must be integer/float and be <= 1 and >= 0")
    if not isinstance(ld, int) or not (0 <= ld):
        raise ValueError("Inputted 'ld' must be integer and be >= 0")
    if not isinstance(let, (int, float)) or (let > 1) or (let < 0):
        raise ValueError("Inputted 'let' must be integer/float and be <= 1 and >= 0")
    for i in lst:
        if not isinstance(i, str):
            raise ValueError("All values in 'lst' must be string")
    word = word.lower()
    lst = map(lambda x: x.lower(), lst)
    sim = []
    for i in lst:
        if ratio(word, i) >= rat:
            if abs(len(word) - len(i)) <= ld:
                # count how many of the candidate's letters appear in word
                num = 0
                for x in i:
                    if x in word:
                        num += 1
                if num >= floor(len(word) * let):
                    sim.append([ratio(word, i), i])
    # ascending sort then reverse: best ratio first
    sim.sort()
    sim = sim[::-1]
    return [pair[1] for pair in sim]
def calc_venue_match_confidence(venue1, venue2): """ calculates distance between two venues by comparing names, social media handles, URLs and categories """ # just need the venue data, not the whole API response if venue1.get('response'): v1 = venue1['response']['venue'] else: v1 = venue1 if venue2.get('response'): v2 = venue2['response']['venue'] else: v2 = venue2 #levenshtein distance of names name_distance = ratio(v1['name'], v2['name']) url_match = 0.0 social_media_match = 0.0 category_match = 0.0 # compare URLs if v1.get('url') and v2.get('url'): if v1['url']: if urlparse(v1['url']).netloc: if urlparse(v2['url']).netloc: if urlparse(v1['url']).netloc == urlparse(v2['url']).netloc: url_match = 1.0 # compare social media if v1.get('contact') and v2.get('contact'): if v1['contact'].get('twitter') and v2['contact'].get('twitter'): if v1['contact']['twitter'] == v2['contact']['twitter'] and v1['contact']['twitter'] and v1['contact']['twitter'] != "none": social_media_match += 1.0 if v1['contact'].get('facebook') and v2['contact'].get('facebook'): if v1['contact']['facebook'] == v2['contact']['facebook'] and v1['contact']['facebook'] and v1['contact']['facebook'] != "none": social_media_match += 1.0 # compare categories if names match - match = +1.0, - no match = -1.0 if name_distance > 0.9: c1 = set() c2 = set() if v1.get('categories') and v2.get('categories'): for category in v1['categories']: c1.add(category) for category in v2['categories']: c2.add(category) common = c1 & c2 if len(common) > 0: category_match = 1.0 else: category_match = -1.0 return name_distance, url_match, social_media_match, category_match
def process_mean_distance_desc(parent, current, differ):
    """Mean Levenshtein-based distance between parent and current
    descriptions across all changed languages; 0.0 when nothing
    changed."""
    changed = differ.changed()
    if not changed:
        return 0.0
    distances = [1 - ratio(current.descriptions[lang], parent.descriptions[lang]) for lang in changed]
    return sum(distances) / len(changed)
def similarity_finder2(line, global_lang, thd=0.9):
    """Group entries of `global_lang` by similarity to `line`.

    Returns {ceil(ratio * 100): [matching strings]} for every entry
    whose Levenshtein ratio with `line` meets the `thd` threshold.
    """
    matches = {}
    for address in global_lang:
        candidate = global_lang[address]
        score = ratio(candidate, line)
        if score < thd:
            continue
        bucket = ceil(score * 100)
        matches.setdefault(bucket, []).append(candidate)
    return matches
def _do_match(self, needle, note_attrib_value):
    """Token-wise fuzzy match.

    Returns True when any whitespace-separated token of the note
    attribute matches any token of `needle` with a Levenshtein ratio
    >= self.match_threshold. Python 2 code (print statement).
    """
    for token_note in note_attrib_value.split():
        for token_needle in needle.split():
            result = ratio(unicode(token_note), unicode(token_needle))
            if self.debug:
                print "Searching match for %s against %s got: %f" % \
                    (token_needle, token_note, result)
            if result >= self.match_threshold:
                return True
    return False
def find_minister(text):
    """Return the entry of the module-level `ministers` list most
    similar to `text` (Levenshtein ratio); '' when the list is empty."""
    best = (0, '')
    for name in ministers:
        score = ratio(text, name)
        if score > best[0]:
            best = (score, name)
    return best[1]
def checkForRetwitterUser(text):
    """Return True when `text` (accent-stripped) fuzzily matches any
    name in the module-level `keyPerson` list (ratio > 0.9)."""
    text = unidecode(text)
    return any(ratio(name.lower(), text.lower()) > 0.9 for name in keyPerson)
def search_similar(self, line, thd=0.5):
    """Find stored lines similar to `line`.

    Returns {round(ratio * 100): [(filename, index), ...]} for every
    entry of self.data whose Levenshtein ratio with `line` is >= thd.
    """
    matches = {}
    for filename in self.data:
        for index in self.data[filename]:
            score = ratio(line, self.data[filename][index])
            if score < thd:
                continue
            bucket = round(score * 100)
            matches.setdefault(bucket, []).append((filename, index))
    return matches
def similarity_to_query(s):
    """Negative similarity of record `s` to the enclosing `query`
    (lower is better, suitable as an ascending sort key). A few joke
    keywords pin one specific record to the top."""
    needle = query.lower()
    # Lolz
    if s.fname == "Franco" and needle in ("gym", "dining", "james bond"):
        return -1
    fields = (s.fname, s.lname, s.full_name(), s.su, s.email, s.apt)
    scores = [-ratio(field.lower(), needle) for field in fields if field]
    return min(scores)
def identify(feature):
    """Try to locate an SSR place-name feature in OSM data.

    Queries an XAPI bounding box around the feature's coordinates,
    looks for nodes whose name tag equals either the main or the
    alternative SSR name, and records matches (with a crude planar
    distance) in the global `status` dict keyed by SSR id. When no
    exact match exists, the closest name by Levenshtein ratio is kept
    as 'bestmatch'. Python 2 code (print statement).
    """
    lon, lat = map(float,feature['geometry']['coordinates'])
    sname = feature['properties']['enh_snavn']
    forname = feature['properties']['for_snavn']
    ssrid = feature['properties']['enh_ssr_id']
    #queryByName = (XAPI_URL + NAME_Q) % (lon-TOL, lat-TOL, lon+TOL, lat+TOL, quote_plus(sname))
    queryWoName = (XAPI_URL) % (lon-TOL, lat-TOL, lon+TOL, lat+TOL)
    osm = tree.parse(queryWoName)
    names = osm.findall(".//tag[@k='name']")
    status[ssrid] = {}
    status[ssrid]['found'] = False
    bestratio = 0
    for name in names:
        osmname = unicode(name.get('v'))
        osmid = name.getparent().get('id')
        osmlon = name.getparent().get('lon')
        # only works for nodes, or we'll a) have to fetch references
        # b) find a better distance calculation
        osmlat = name.getparent().get('lat')
        if osmlon:
            dx = float(osmlon)-lon
            dy = float(osmlat)-lat
            # GIS people are allowed to simplify like this
            distance = sqrt(dx*dx+dy*dy)
        else:
            distance = float("inf")
        if sname == osmname or forname == osmname:
            if status[ssrid]['found']:
                # multiple matches
                status[ssrid]['nodes'].append({"osmid":osmid, "distance":distance})
            else:
                status[ssrid]['found'] = True
                status[ssrid]['nodes'] = [{"osmid":osmid, "distance":distance}]
                print "IDENTIFIED", osmname
        else:
            # keep the closest fuzzy candidate as a fallback suggestion
            delta = max( ratio(osmname, sname), ratio(osmname, forname) )
            if delta > bestratio:
                status[ssrid]['bestmatch'] = {"osmname":osmname, "osmid":osmid, "levenshtein":delta}
                bestratio = delta
    if not status[ssrid]['found']:
        print "Not found:", sname, "Best match:", str(status[ssrid].get('bestmatch',"None"))
def cliA(line, lenline, adapt, lenadapt, short_mm_len, extend, editDratio, shortReadsLen): reads = line #-----Begin full adaptor----------------------- tmpAdapt = adapt exactPos = line.rfind(tmpAdapt) if exactPos != -1 and exactPos > shortReadsLen: if extend == 'yes': return line[:exactPos] else: if exactPos + lenadapt == lenline: return line[:exactPos] #--------------------------------------------- else: if extend == 'yes': end = lenadapt + shortReadsLen - 1 # start, no less than # shortReadsLen else: end = lenline -1 for j in range(lenline, end, -1): tmpReads = line[j-lenadapt:j] if ratio(tmpReads, tmpAdapt) >= editDratio: # and tmpReads[0] == tmpAdapt[0]: return line[:j-lenadapt] #------------mismatch---------------- #---------------End full adaptor---------------- #--------------Begin partly clipped---------- for i in range(lenadapt-1, short_mm_len, -1): tmpAdapt = adapt[:i] exactPos = line.rfind(tmpAdapt) if exactPos != -1 and exactPos+i == lenline: return line[:exactPos] else: tmpReads = line[lenline-i:lenline] if ratio(tmpReads, tmpAdapt) >= editDratio: #and \ #tmpReads[0] == tmpAdapt[0]: return line[:lenline-i] #------------mismatch---------------- #---------------End full adaptor---------------- #--no process----------------------------------- return reads
def select_query(a, search_title):
    """Filter predicate for clipping entries.

    Keeps highlights that have text, author and location; when
    `search_title` is given, additionally requires the entry's title to
    fuzzily match it (Levenshtein ratio > 0.50).
    """
    if not search_title:
        return a['title'] and a['text'] \
            and a['author'] and a['location'] \
            and a['type'] == "Highlight"
    from Levenshtein import ratio as ratio
    title_matches = ratio(a['title'].encode("utf-8"), search_title.encode("utf-8")) > 0.50
    return title_matches \
        and a['text'] and a['author'] \
        and a['location'] and a['type'] == "Highlight"
def suggest_lang(word):
    """Return cached (suggestion, ratio) pairs for `word`.

    Takes the speller's top 3 single-token suggestions (no spaces or
    hyphens) whose Levenshtein ratio with `word` exceeds 0.9, caching
    the result per language in the module-level `cache`.
    """
    _cache = cache[args.lang]
    if word not in _cache:
        matches = []
        for suggestion in speller.suggest(word)[:3]:
            # only single-token suggestions qualify
            if ' ' in suggestion or '-' in suggestion:
                continue
            score = ratio(word, suggestion)
            if score > 0.9:
                matches.append((suggestion, score))
        _cache[word] = matches
    return _cache[word]
def near_match(self, token_a, token_b):
    """Classify a token pair: 0 when self.match says no match at all,
    otherwise 1 when the token strings are similar (ratio > 0.6),
    else -1."""
    if self.match(token_a, token_b) == 0:
        return 0
    similarity = ratio(token_a.token_string, token_b.token_string)
    return 1 if similarity > 0.6 else -1
def _get_most_suitable_player(self):
    """ Looks in the roster for a player with an almost identical name. If any, it returns it """
    # best (score, roster name) pair by Levenshtein ratio
    score, candidate = max((ratio(roster_name, self.name), roster_name)
                           for roster_name in self.team_info.players_.keys())
    # require a decent overall ratio AND agreement on the first three
    # letters of the first name
    my_first = self.name.split(' ')[0]
    candidate_first = candidate.split(' ')[0]
    if my_first[:3] in candidate_first[:3] and score >= 0.65:
        return candidate
def computeData(self, results):
    """Choses the record/artist couple with the best correspondancy.

    Picks from results['recording-list'] the records whose title is
    similar enough to the local title, then the artist credit closest
    to the local artist, and stores the winning pair in
    self.remoteData.

    Fixes: the inner loop compared the undefined name `artist`
    (NameError at runtime) instead of the credited artist's name, and
    stored the last computed ratio instead of the best one in
    element['artist_r'].
    """
    records = results['recording-list']
    score_list = []
    # Adding every record which title have a high enough similarity ratio.
    for record in records:
        title_r = ratio(record['title'], self.localData.title)
        if title_r >= MinSimilarityRatio:
            score_list.append({'title_r': title_r, 'record': record, 'artist_r': 0, 'artist': ''})
    # Finding the best artist correspondance for each record.
    best_ratio = 0
    best_element = {'title_r': 0, 'record': records[0], 'artist_r': 0, 'artist': self.localData.artist}
    # Doesn't try to compare artists if no one has been given.
    if(self.localData.artist != ""):
        for element in score_list:
            best_local_name = ''
            best_local_ratio = 0
            for artist_credit in element['record']['artist-credit']:
                local_name = artist_credit['artist']['name']
                # FIX: compare the credited artist's name (was the
                # undefined variable `artist`)
                local_ratio = ratio(local_name, self.localData.artist)
                # Locally maximizing the ratio for each artist corresponding to the record.
                if local_ratio > best_local_ratio:
                    best_local_name = local_name
                    best_local_ratio = local_ratio
                # We are trying to maximise the best ratio.
                if local_ratio > best_ratio:
                    best_element = element
                    best_ratio = local_ratio
            element['artist'] = best_local_name
            # FIX: store the record's best artist ratio (was whichever
            # credit happened to come last in the loop)
            element['artist_r'] = best_local_ratio
        if best_ratio > MinSimilarityRatio:
            record = best_element
            self.remoteData.title = record['record']['title']
            self.remoteData.artist = record['artist']
        else:
            self.remoteData.artist = ''
            self.remoteData.title = ''
    else:
        # no local artist: fall back to the first title match
        # NOTE(review): raises IndexError when score_list is empty -- confirm
        self.remoteData.title = score_list[0]['record']['title']
        self.remoteData.artist = score_list[0]['record']['artist-credit'][0]['artist']['name']
def get_similar_by_levens(incorrect):
    """Take all words of similar distance and return five most similar
    as measured by Levenstein distance.

    Fix: the candidate list was sorted by the word itself
    (alphabetically) instead of by similarity, so the five returned
    words were not the most similar; suggestions are now ordered by
    descending ratio.
    """
    vals = {}
    WORDS = get_smlar_len_words(incorrect)
    for correct in WORDS:
        similarity_ratio = ratio(incorrect, correct)
        if similarity_ratio > 0.65:
            vals[correct] = similarity_ratio
    all_suggestions = sorted(vals, key=vals.get, reverse=True)
    return all_suggestions[:5]
def first_sentence_score(dataset: list, refer_dataset):
    """
    no useful

    Scores each essay's first sentence as the similarity-weighted
    average of the scores of 50 randomly sampled reference essays'
    first sentences; stores the score on each sample and returns the
    mean/std over the dataset.
    :param dataset:
    :param refer_dataset:
    :return:
    """
    computed = []
    for sample in dataset:
        references = random.sample(refer_dataset, 50)
        first_sentence = sample['essay_sent'][0]
        weighted = [
            ratio(first_sentence, ref['essay_sent'][0]) * ref['domain1_score']
            for ref in references
        ]
        score = np.average(weighted)
        sample['first_sentence_score'] = score
        computed.append(score)
    return {'first_sentence_score': {'mean': np.mean(computed), 'std': np.std(computed)}}
def lookup(self, s):
    """Resolve `s` against the loaded dictionary self.d.

    Tries, in order: exact lowercase match, stemmed form, lemmatized
    form, and finally the closest existing key by Levenshtein ratio
    (the fuzzy result is memoized back into self.d under `s`).
    Python 2 code (print statements).
    """
    if not self.d:
        self.load()
    try:
        print "Using self"
        return self.d[s.lower()]
    except KeyError:
        try:
            print "Using stemmer: " + self.stemmer.stem(s).lower()
            return self.d[self.stemmer.stem(s).lower()]
        except KeyError:
            try:
                print "Using lemmatizer: " + self.lemmatizer.lemmatize(
                    s).lower()
                return self.d[self.lemmatizer.lemmatize(s).lower()]
            except KeyError:
                # fuzzy fallback: nearest existing key; cache for next time
                (score, match) = max((ratio(s, t), t) for t in self.d)
                self.d[s] = self.d[match]
                print "Using Levenshtein: " + match
                return self.d[match]
def text_similarity_score(dataset: list, refer_dataset):
    """
    no useful

    Scores each essay as the similarity-weighted average of the scores
    of 20 randomly sampled reference essays; stores the score on each
    sample and returns the mean/std over the dataset.
    :param dataset:
    :param refer_dataset:
    :return:
    """
    computed = []
    for sample in dataset:
        references = random.sample(refer_dataset, 20)
        weighted = [
            ratio(sample['essay'], ref['essay']) * ref['domain1_score']
            for ref in references
        ]
        score = np.average(weighted)
        sample['text_similarity_score'] = score
        computed.append(score)
    return {'text_similarity_score': {'mean': np.mean(computed), 'std': np.std(computed)}}
def score_reconciliation(txn, payment):
    """Score how well a bank transaction matches a payment.

    Sums the two best Levenshtein ratios between payee words and the
    bankref, adds the Jaro similarity of payee vs user name, plus 0.4
    for a matching amount and 0.6 for a matching currency.
    """
    words = txn.payee.replace('-', ' ').split(' ')
    distances = sorted(ratio(w, payment.bankref) for w in words)
    # Get the two best matches, for the two parts of the bankref
    bankref_score = sum(distances[-2:])
    name_score = jaro(txn.payee, payment.user.name)
    other_score = 0.0
    if txn.amount == payment.amount:
        other_score += 0.4
    if txn.account.currency == payment.currency:
        other_score += 0.6
    # check posted against expiry?
    app.logger.debug('Scores for txn %s payment %s: %s %s %s',
                     txn.id, payment.id, bankref_score, name_score, other_score)
    return bankref_score + name_score + other_score
def annotate(self, training_set):
    """Append string-similarity feature columns to a Spark DataFrame.

    Adds distance, ratio, jaro, jaro_wrinkler and fuzz_partial_ratio
    columns computed between concept_name_1 and concept_name_2 and
    returns the augmented DataFrame.
    """
    #Levenshtein distance - minimum number of single character edits
    distance_udf = udf(lambda x, y: distance(x, y), IntegerType())
    #Levenshtein ratio - similarity of two strings
    ratio_udf = udf(lambda x, y: ratio(x, y), DoubleType())
    #Jaro - similarity score
    jaro_udf = udf(lambda x, y: jaro(x, y), DoubleType())
    #Jaro-winkler - similarity score, which favors strings that match prefix from the beginning
    jaro_winkler_udf = udf(lambda x, y: jaro_winkler(x, y), DoubleType())
    #fuzz partial ratio - gives a score based on how well parts of a string match another
    fuzz_partial_ratio_udf = udf(
        lambda x, y: fuzz.partial_ratio(x, y) / 100, DoubleType())
    # NOTE(review): column name "jaro_wrinkler" (sic) kept as-is --
    # downstream code may depend on the misspelling
    training_set = training_set.withColumn("distance", distance_udf("concept_name_1", "concept_name_2")) \
        .withColumn("ratio", ratio_udf("concept_name_1", "concept_name_2")) \
        .withColumn("jaro", jaro_udf("concept_name_1", "concept_name_2")) \
        .withColumn("jaro_wrinkler", jaro_winkler_udf("concept_name_1", "concept_name_2")) \
        .withColumn("fuzz_partial_ratio", fuzz_partial_ratio_udf("concept_name_1", "concept_name_2"))
    return training_set
def approximate_answers(q):
    """Find the best-matching FAQ answer for question `q`.

    Scans the qa_datasets rows: a confident hit (ratio >= 0.8) returns
    immediately; otherwise the best match above 0.51 is used, else a
    fallback apology. Returns (answer, score, matched_question).

    Fix: removed the unused `answers` accumulator (the code assigned
    only to `answer`; `answers` was dead).
    """
    max_score = 0
    answer = ""
    prediction = ""
    for idx, row in qa_datasets.iterrows():
        score = ratio(row['Question'], q)
        if score >= 0.8:
            # confident hit: short-circuit
            return row['Answer'], score, row['Question']
        elif score > max_score:
            max_score = score
            answer = row["Answer"]
            prediction = row["Question"]
    if max_score > 0.51:
        return answer, max_score, prediction
    else:
        return "Maap aku gak ngerti kamu ngomong apa... :(", max_score, prediction
def matchQuery(self, globs, text, regexList):
    """Score this item against a query of glob tokens and regexes.

    Every regex must match the item name, or the item is rejected.
    Each glob token found in the name adds a position-weighted bonus
    (earlier = better) plus a small similarity bonus; a 1.1 multiplier
    applies when the first token occurs in the type name. Stores the
    score in self.matchScore and returns score > 0.
    """
    score = 0
    name = self.nameU
    typeName = self.typeNameU
    # every regex must match, or the item is rejected outright
    for regex in regexList:
        if not regex.search(name):
            return False
    for t in globs:
        pos = name.find(t)
        if pos >= 0:
            # earlier occurrences weigh more (linear falloff over 20 chars)
            k = max(1.0 - float(pos) / 20.0, 0.0)
            score += (k * k * 1.0 + 0.1)
            score += (ratio(t, name) * 0.1)
    if globs[0] in typeName:
        # NOTE(review): `findInType` is assigned but never read -- confirm
        # whether it was meant to be returned or stored
        findInType = True
        score *= 1.1
    self.matchScore = score
    return score > 0
def crossref_query_title(title):
    """Contacts Crossref API for DOI of a paper

    The paper is identified by its title. The function retrieves the
    first 5 results, and searches for the one with maximum similarity
    to the original title.

    Returns {"success": bool, "result": {...}} and, on HTTP failure,
    the exception under "exception". Raises nothing itself; HTTPError
    is caught and reported.

    Fixes: the HTTP response is now closed via a context manager, and
    the loop no longer clobbers the `title` parameter.

    Args:
        title: a str with the title of the paper whose DOI we are looking for
    """
    api_url = "https://api.crossref.org/works?"
    params = {"rows": "5", "query.title": title}
    url = api_url + urlencode(params, quote_via=quote_plus)
    request = Request(url)
    request.add_header(
        "User-Agent",
        "doi4bib utility (https://github.com/sharkovsky/doi4bib)")
    try:
        # FIX: close the HTTP response deterministically
        with urlopen(request) as ret:
            content = ret.read()
        data = json.loads(content.decode('utf-8'))
        items = data["message"]["items"]
        most_similar = EMPTY_RESULT
        for item in items:
            # FIX: use a local name instead of overwriting `title`
            item_title = item["title"].pop()
            result = {
                "crossref_title": item_title,
                "similarity": ratio(item_title.lower(), params["query.title"].lower()),
                "doi": item["DOI"]
            }
            if most_similar["similarity"] < result["similarity"]:
                most_similar = result
        return {"success": True, "result": most_similar}
    except HTTPError as httpe:
        return {"success": False, "result": EMPTY_RESULT, "exception": httpe}
def findTorrentPage(phenny, input):
    """Search what.cd for "<artist> - <album>" parsed from the IRC input
    and return (torrent page URL, Levenshtein ratio between the search
    string and the first found release). Python 2 code.
    """
    import urllib
    from BeautifulSoup import BeautifulSoup, SoupStrainer
    from Levenshtein import ratio
    if '-' not in input.group(2):
        raise Exception()
        return  # NOTE(review): unreachable -- dead code after raise
    (artist, album) = input.group(2).split('-')
    # drop the leading command word from the artist part
    functionEnd = artist.find(' ')
    artist = artist[functionEnd + 1:].strip()
    album = album.strip()
    searchString = artist + u' - ' + album
    data = {'artistname': artist, 'groupname': album}
    data = urllib.urlencode(data)
    url = 'https://ssl.what.cd/torrents.php?action=advanced&' + data
    html = getHTML(url)
    tableStrainer = SoupStrainer(id='torrent_table')
    tableResults = BeautifulSoup(html, tableStrainer)
    firstArtist = tableResults.find('a', href=re.compile('artist.php\?id\='))
    if firstArtist is None:
        raise SyntaxError()
        return  # NOTE(review): unreachable -- dead code after raise
    foundArtist = firstArtist.string
    foundAlbum = firstArtist.next.next.next.string
    foundString = foundArtist + ' - ' + foundAlbum
    searchRatio = ratio(searchString.lower(), foundString.lower())
    return ('https://ssl.what.cd/' + tableResults.find(
        'a', href=re.compile('torrents.php\?id\='))['href'], searchRatio)
def score_reconciliation(txn, payment):
    """Score how well a bank transaction matches a payment.

    Sums the two best Levenshtein ratios between payee words and the
    two halves of the bankref, adds the Jaro similarity of payee vs
    user name, plus 0.4 for a matching amount and 0.6 for a matching
    currency.
    """
    words = list(filter(None, re.split('\W+', txn.payee)))
    bankref_parts = [payment.bankref[:4], payment.bankref[4:]]
    all_distances = sorted(ratio(w, p) for w in words for p in bankref_parts)
    # Get the two best matches, for the two parts of the bankref
    # A match gives 1.0, a 2-char substring 0.666, and a 6-char superstring 0.857
    bankref_score = sum(all_distances[-2:])
    name_score = jaro(txn.payee, payment.user.name)
    other_score = 0.0
    if txn.amount == payment.amount:
        other_score += 0.4
    if txn.account.currency == payment.currency:
        other_score += 0.6
    # check posted against expiry?
    app.logger.debug('Scores for txn %s payment %s: %s %s %s',
                     txn.id, payment.id, bankref_score, name_score, other_score)
    return bankref_score + name_score + other_score
def compare(self, statement, other_statement):
    """
    Compare the two input statements.

    :return: The percent of similarity between the text of the statements.
    :rtype: float

    Fix: removed an unused `import sys`.
    """
    # Use python-Levenshtein if available
    from Levenshtein import ratio
    # Return 0 if either statement has a falsy text value
    if not statement.text or not other_statement.text:
        return 0
    # Get the lowercase version of both strings
    statement_text = str(statement.text.lower())
    other_statement_text = str(other_statement.text.lower())
    return ratio(statement_text, other_statement_text)
def get_score(query, text):
    """ Uses Levenshtein's algorithm + some improvements to the score
    :returns: number between 0 and 100
    """
    if not query or not text:
        return 0
    query = query.lower()
    text = text.lower()
    # base score: plain similarity scaled to 0..100
    score = ratio(query, text) * 100
    # increase score if a word from text starts with a query
    for text_part in text.split(' '):
        if text_part.startswith(query):
            score += 30
            break
    # increase score if each separate group in indexes is a beginning of a word in text
    # example for query 'fiwebr' groups 'fi', 'we', and 'br' are matching word beginnings
    # of text 'Firefox Web Browser'
    # increase score for each such group
    increment = 10
    i = 0  # query iterator
    lq = len(query)
    for j, char in enumerate(text):
        # run until query ends and check if a query char. equals to current text char
        if i < lq and query[i] == char:
            # if char from query matches beginning of text or beginning of a word inside the text, increase the score
            if j == 0 or text[j - 1] in ' .(-_+)':
                score += increment
                i += 1
        elif i == lq:
            break
    # clamp: the bonuses above may push past 100
    return min(100, score)
def get_publication(title):
    """Query the Crossref API for the publication best matching `title`.

    Fetches up to 5 bibliographic matches and keeps the one whose title
    has the highest Levenshtein similarity to the query. Returns
    {"success", "result", "crossref"} and, on HTTP failure, the
    exception under "exception".

    Fixes: the HTTP response is now closed via a context manager, and
    the loop no longer clobbers the `title` parameter.
    """
    EMPTY_RESULT = {
        "crossref_title": "",
        "similarity": 0,
        "doi": ""
    }
    api_url = "https://api.crossref.org/works?"
    params = {"rows": "5", "query.bibliographic": title}
    url = api_url + urlencode(params, quote_via=quote_plus)
    request = Request(url)
    request.add_header("User-Agent", "OpenAPC DOI Importer (https://github.com/OpenAPC/openapc-de/blob/master/python/import_dois.py; mailto:[email protected])")
    full_data = None
    try:
        # FIX: close the HTTP response deterministically
        with urlopen(request) as ret:
            content = ret.read()
        data = json.loads(content)
        items = data["message"]["items"]
        most_similar = EMPTY_RESULT
        for item in items:
            if "title" not in item:
                continue
            # FIX: use a local name instead of overwriting `title`
            item_title = item["title"].pop()
            result = {
                "crossref_title": item_title,
                "similarity": ratio(item_title.lower(), params["query.bibliographic"].lower()),
                "doi": item["DOI"]
            }
            if most_similar["similarity"] < result["similarity"]:
                most_similar = result
                full_data = item
        return {"success": True, "result": most_similar, "crossref": full_data}
    except HTTPError as httpe:
        return {"success": False, "result": EMPTY_RESULT, "exception": httpe}
def MatchStops():
    """Fuzzy-join MTA turnstile station names to GTFS stop names.

    For every unique STATION in the turnstile file, finds the stops.txt
    stop_name with the highest Levenshtein ratio; matches below 0.7 are
    printed for manual review, the rest are written to test.csv as
    (STATION, stop_id) pairs.
    """
    dft = pd.read_csv('./Data/turnstile_191026.txt')
    dfs = pd.read_csv('./Data/stops.txt')
    turnstile_stop_names = []
    stop_stop_names = []
    matches = []
    # (lowercase name, original name) for every turnstile station
    for x in sorted(dft['STATION'].unique()):
        stop_name = x
        lower_stop_name = x.lower()
        turnstile_stop_names.append((lower_stop_name, stop_name))
    # (lowercase name, stop_id) for every GTFS stop
    for x in sorted(dfs['stop_name'].unique()):
        stop_id = dfs['stop_id'][dfs['stop_name'] == x]
        stop_stop_names.append((x.lower(), stop_id[stop_id.index[0]]))
    for turn in turnstile_stop_names:
        ratio_list = []
        for stop in stop_stop_names:
            ratio_list.append(
                (stop[0], stop[1], turn[0], turn[1], ratio(turn[0], stop[0])))
        # highest-ratio candidate (last element after ascending sort)
        best_match = sorted(ratio_list, key=lambda x: x[-1])[-1]
        if best_match[-1] < 0.7:
            # weak match: surface for manual review
            print('{}-->{}'.format(turn[1], best_match))
        else:
            matches.append((turn[1], best_match[1]))
    pd.DataFrame(matches, columns=['STATION', 'stop_id']).to_csv('test.csv')
def quick_ratio(self):
    """Return the Levenshtein ratio of the two stored strings, caching it.

    NOTE: a cached value of 0.0 is falsy and is therefore recomputed on
    every call — a harmless cache miss, not a correctness issue.
    """
    if self._ratio:
        return self._ratio
    self._ratio = ratio(self._str1, self._str2)
    return self._ratio
def levDistanceCompare(str1, str2, value):
    """Return True iff the Levenshtein ratio of str1 and str2 exceeds value."""
    return ratio(str1, str2) > value
def populate_pokeset(pokeset, skip_ev_check=False):
    """
    Reads in data for one pokeset and populates it with all additionally
    available data. This includes types of Pokémon or per-move data like PP,
    power or types.

    Arguments:
        pokeset: base data of the set to populate. see the format
            specification for details.
        skip_ev_check: Defaults to False. If True, allows illegal movesets
            (produces a warning instead of an error)
    Throws:
        ValueError: If the data is not fully parsable. the ValueError's
            description contains further details on the error that occured.
    Returns:
        The populated set. The passed data is not modified
    """
    # I am sorry that this function is so big and partly copy-pasted,
    # but it just does a lot of equally boring things like processing
    # special cases. I couldn't come up with a structure that wouldn't
    # just feel forced. It could be better, but it could also be worse,
    # and to be honest it's easy enough to maintain (for me at least).

    # make deepcopy to not modify original data
    pokeset = deepcopy(pokeset)

    # check if there are wrongly capitalized keys
    for key, value in list(pokeset.items()):
        key_lower = key.lower()
        if key_lower != key:
            warn("Key should be all lowercase: %s" % key)
            del pokeset[key]
            pokeset[key_lower] = value

    # check that all obligatory fields are present
    present_fields = set(pokeset.keys())
    missing_fields = _OBLIGATORY_FIELDS - present_fields
    if missing_fields:
        raise ValueError("pokeset is missing obligatory fields: %s"
                         % ", ".join(missing_fields))

    # check if there are unknown fields
    unrecognized_fields = present_fields - (set(_OPTIONAL_FIELDS.keys()) | _OBLIGATORY_FIELDS)
    if unrecognized_fields:
        raise ValueError("pokeset has unrecognized fields: %s"
                         % ", ".join(unrecognized_fields))

    # trim all leading and trailing whitespaces
    # TODO test if the yaml parser already does this
    #for k, v in pokeset.items():
    #    pokeset[k] = v.strip()

    # fill in optional fields (deepcopy so mutable defaults are not shared)
    for key, default in _OPTIONAL_FIELDS.items():
        if key not in pokeset:
            pokeset[key] = deepcopy(default)

    # check validity of names
    if not pokeset["setname"] or not isinstance(pokeset["setname"], str):
        raise ValueError("setname must be a non-empty string")
    custom_displayname = False
    if pokeset["displayname"] is not None:
        custom_displayname = True
        if not pokeset["displayname"] or not isinstance(pokeset["displayname"], str):
            raise ValueError("displayname, if set, must be a non-empty string")

    # check and populate species
    species_raw = pokeset["species"]
    if species_raw is None:
        raise ValueError("Invalid species: %s" % (species_raw,))
    species, perfect_match = _get_by_index_or_name(
        gen4data.POKEDEX, species_raw, "species",
        gen4data.get_pokemon, gen4data.find_pokemon)
    if not perfect_match:
        warn("Didn't recognize species %s, but assumed %s."
             % (species_raw, species["name"]))
    pokeset["species"] = species

    # check tags
    tags = pokeset["tags"]
    if not isinstance(tags, list) or not all(isinstance(tag, str) for tag in tags):
        raise ValueError("tags must be a list of strings")
    pokeset["tags"] = tags

    # replace None-default for ingamename
    if pokeset["ingamename"] is None:
        pokeset["ingamename"] = species["name"].upper()
        if pokeset["shiny"]:
            pokeset["ingamename"] = pokeset["ingamename"][:8] + "-S"

    # check length of ingamename
    if not 1 <= len(pokeset["ingamename"]) <= 10:
        raise ValueError("ingamename must be between 1 and 10 characters long: %s"
                         % pokeset["ingamename"])

    # check happiness
    if not isinstance(pokeset["happiness"], int):
        raise ValueError("happiness must be a number.")

    # check and populate ability. is a list
    ability = []
    ability_raw = pokeset["ability"]
    if not isinstance(ability_raw, list):
        ability_raw = [ability_raw]
    if not ability_raw:
        raise ValueError("Ability cannot be an empty list.")
    for ability_raw_single in ability_raw:
        ability_single, perfect_match = _get_by_index_or_name(
            gen4data.ABILITIES, ability_raw_single, "ability",
            gen4data.get_ability, gen4data.find_ability)
        if not perfect_match:
            warn("Didn't recognize ability %s, but assumed %s."
                 % (ability_raw_single, ability_single["name"]))
        ability.append(ability_single)
    if len(set(a["id"] for a in ability)) < len(ability):
        raise ValueError("All abilities supplied must be unique: %s"
                         % ", ".join(a["name"] for a in ability))
    pokeset["ability"] = ability

    # check and populate item. is a list
    item = []
    item_raw = pokeset["item"]
    if not isinstance(item_raw, list):
        item_raw = [item_raw]
    if not item_raw:
        raise ValueError("Item cannot be an empty list.")
    for item_raw_single in item_raw:
        item_single, perfect_match = _get_by_index_or_name(
            gen4data.ITEMS, item_raw_single, "item",
            gen4data.get_item, gen4data.find_item)
        if not perfect_match:
            warn("Didn't recognize item %s, but assumed %s."
                 % (item_raw_single, item_single["name"]))
        item.append(item_single)
    if len(set(i["id"] for i in item)) < len(item):
        raise ValueError("All items supplied must be unique: %s"
                         % ", ".join(i["name"] for i in item))
    pokeset["item"] = item

    # check and populate ball. is a list
    ball = []
    ball_raw = pokeset["ball"]
    if not isinstance(ball_raw, list):
        ball_raw = [ball_raw]
    if not ball_raw:
        raise ValueError("Ball cannot be an empty list.")
    for ball_raw_single in ball_raw:
        ball_single, perfect_match = _get_by_index_or_name(
            gen4data.ITEMS, ball_raw_single, "ball",
            gen4data.get_ball, gen4data.find_ball)
        if not ball_single["name"].endswith(" Ball"):
            raise ValueError("Invalid ball: %s" % ball_single)
        if not perfect_match:
            warn("Didn't recognize ball %s, but assumed %s."
                 % (ball_raw_single, ball_single["name"]))
        ball.append(ball_single)
    if len(set(b["name"] for b in ball)) < len(ball):
        raise ValueError("All balls supplied must be unique: %s"
                         % ", ".join(b["name"] for b in ball))
    pokeset["ball"] = ball

    # check gender
    gender = pokeset["gender"]
    if not isinstance(gender, list):
        gender = [gender]
    for gender_single in gender:
        if gender_single not in ("m", "f", None):
            raise ValueError("gender can only be 'm', 'f' or not set (null), but not %s"
                             % (gender_single,))
    if len(gender) > 1 and None in gender:
        raise ValueError("non-gender cannot be mixed with m/f")
    if len(set(gender)) < len(gender):
        raise ValueError("All genders supplied must be unique: %s"
                         % ", ".join(gender))
    pokeset["gender"] = gender

    # check level
    level = pokeset["level"]
    if not (isinstance(level, int) and 1 <= level <= 100):
        raise ValueError("level must be a number between 1 and 100")

    # check and populate nature. might be defined as "+atk -def" or similar
    nature_raw = pokeset["nature"]
    if not isinstance(nature_raw, str):
        raise ValueError("Invalid nature: %s" % (nature_raw,))
    stats_regex = "|".join(stats.statnames)
    # BUGFIX: the second group used "(?:\1)" (a backreference), which made
    # valid shorthands like "+atk -def" unmatchable. The negative lookahead
    # "(?!\1)" is what is intended: the decreased stat must differ from the
    # increased one. "(?:{0})" keeps the alternation properly grouped.
    match = re.match(r"^\+({0})\s+-((?!\1)(?:{0}))$".format(stats_regex), nature_raw)
    if match:
        increased = match.group(1)
        decreased = match.group(2)
        matching_nature = [n for n in gen4data.NATURES
                           if n["increased"] == increased
                           and n["decreased"] == decreased]
        if matching_nature:
            nature_raw = matching_nature[0]["name"]
    nature, perfect_match = _get_by_index_or_name(
        gen4data.NATURES, nature_raw, "nature",
        gen4data.get_nature, gen4data.find_nature)
    if not perfect_match:
        warn("Didn't recognize nature %s, but assumed %s."
             % (nature_raw, nature["name"]))
    pokeset["nature"] = nature

    # check IVs. a single number is expanded to all stats
    ivs = pokeset["ivs"]
    if isinstance(ivs, int):
        ivs = {name: ivs for name in stats.statnames}
    if not isinstance(ivs, dict):
        raise ValueError("Invalid IVs: %s" % (ivs,))
    if set(stats.statnames) != set(ivs.keys()):
        raise ValueError("ivs must contain the following keys: %s"
                         % ", ".join(stats.statnames))
    if not all(isinstance(v, int) for v in ivs.values()):
        raise ValueError("Invalid IV value in IVs: %s" % (ivs,))
    if not all(0 <= val <= 31 for val in ivs.values()):
        raise ValueError("All IVs must be between 0 and 31.")
    pokeset["ivs"] = ivs

    # check EVs. a single number is expanded to all stats
    evs = pokeset["evs"]
    if isinstance(evs, int):
        evs = {name: evs for name in stats.statnames}
    if not isinstance(evs, dict):
        raise ValueError("Invalid EVs: %s" % (evs,))
    if set(stats.statnames) != set(evs.keys()):
        raise ValueError("evs must contain the following keys: %s"
                         % ", ".join(stats.statnames))
    if not all(isinstance(v, int) for v in evs.values()):
        raise ValueError("Invalid EV value in EVs: %s" % (evs,))
    if not all(0 <= val for val in evs.values()):
        raise ValueError("All EVs must be >= 0.")
    if not all(val <= 252 for val in evs.values()):
        message = "All EVs must be <= 252."
        if skip_ev_check:
            warn(message)
        else:
            raise ValueError(message)
    ev_sum = sum(val for val in evs.values())
    if ev_sum > 510:
        message = "Sum of EV must not be larger than 510, but is %d" % ev_sum
        if skip_ev_check:
            warn(message)
        else:
            raise ValueError(message)
    for key, value in evs.items():
        if value % 4 != 0:
            warn("EV for %s is %d, which is not a multiple of 4 (wasted points)"
                 % (key, value))
    pokeset["evs"] = evs

    # TODO outsorce singular move procession
    # check and populate moves
    moves = []
    moves_raw = pokeset["moves"]
    if not 1 <= len(moves_raw) <= 4:
        raise ValueError("Pokémon must have between 1 and 4 moves, but has %d"
                         % len(moves_raw))
    for move_raw in moves_raw:
        move = []
        if not isinstance(move_raw, list):
            move_raw = [move_raw]
        for move_raw_single in move_raw:
            pp = None
            pp_ups = 0
            # move might have pp-up and fixed pp information,
            # e.g. "Tackle (+2)", "Tackle (=40)" or "Tackle (+2/=40)"
            pp_info = re.search(r"\(\+\d+\)|\(=\d+\)|\(\+\d+/=\d+\)$", move_raw_single)
            if pp_info:
                move_raw_single = move_raw_single[:pp_info.start() - 1]
                for bit in pp_info.group(0).strip("()").split("/"):
                    if bit.startswith("+"):
                        pp_ups = int(bit[1:])
                    elif bit.startswith("="):
                        pp = int(bit[1:])
            move_single, perfect_match = _get_by_index_or_name(
                gen4data.MOVES, move_raw_single, "move",
                gen4data.get_move, gen4data.find_move)
            if not perfect_match:
                warn("Didn't recognize move %s, but assumed %s."
                     % (move_raw_single, move_single["name"]))
            move_single["pp_ups"] = pp_ups
            # each pp-up grants 20% of the base PP
            pp = pp or move_single["pp"]
            pp = int(pp * (1 + 0.2 * pp_ups))
            move_single["pp"] = pp
            move.append(move_single)
        moves.append(move)
    pokeset["moves"] = moves

    # check rarity
    rarity = pokeset["rarity"]
    if not (isinstance(rarity, (int, float)) and rarity >= 0.0):
        raise ValueError("rarity must be a number greater or equal to 0.0")
    if rarity > 10.0:
        warn("rarity is %d, which is surprisingly high. Note that 1.0 is the default "
             "and high values mean the Pokémon gets chosen more often."
             % rarity)

    # fix default biddable value
    if pokeset["biddable"] is None:
        pokeset["biddable"] = not pokeset["shiny"]
    if not isinstance(pokeset["biddable"], bool):
        raise ValueError("biddable must be a boolean (true or false), not %s"
                         % type(pokeset["biddable"]))
    # fix default hidden value
    if pokeset["hidden"] is None:
        pokeset["hidden"] = pokeset["shiny"]
    if not isinstance(pokeset["hidden"], bool):
        raise ValueError("hidden must be a boolean (true or false), not %s"
                         % type(pokeset["hidden"]))
    if pokeset["biddable"] and pokeset["hidden"]:
        warn("Set is biddable, but also hidden, which doesn't make sense.")
    if pokeset["shiny"] and not pokeset["hidden"]:
        warn("Set is shiny, but not hidden, which means it is not secret "
             "and usable in token matches at any time. Is this intended?")

    # fix displayname
    if pokeset["displayname"] is None:
        pokeset["displayname"] = pokeset["species"]["name"]
        # formnames get handled below

    # check form
    form = pokeset["form"]
    if not isinstance(form, int):
        if not isinstance(form, str):
            raise ValueError("form must be a formnumber or a string, not %s"
                             % type(form))
        formnumber = forms.get_formnumber(species["id"], form)
        if formnumber is None:
            raise ValueError("Unrecognized form %s for species %s"
                             % (form, species["name"]))
        form = formnumber
    pokeset["form"] = form
    formname = forms.get_formname(species["id"], form)
    if formname is None and form != 0:
        raise ValueError("Species %s has no form %s." % (species["name"], form))

    # special case: all forms. fix displayname
    formname = forms.get_formname(species["id"], form)
    if formname and not custom_displayname:
        pokeset["displayname"] += " " + formname

    # special case: Deoxys. Fix basestats (displayname already fixed)
    if species["name"] == "Deoxys":
        deoxys_form = forms.get_formname(species["id"], form)
        species["basestats"] = gen4data.DEOXYS_BASESTATS[deoxys_form]

    # special case: Arceus. Handle as form. Also fix type
    if species["name"] == "Arceus":
        item = pokeset["item"]
        if len(item) > 1:
            raise ValueError("Arceus currently must have a fixed item")
        arceus_type = forms.get_multitype_type(item[0])
        pokeset["species"]["types"] = [arceus_type]
        if not custom_displayname:
            pokeset["displayname"] += " " + arceus_type
        #pokeset["form"] = gen4data.TYPES.index(arceus_type)

    # special case: Wormadam. Fix type
    if species["name"] == "Wormadam":
        wormadam_types = ("Grass", "Ground", "Steel")
        pokeset["species"]["types"] = ["Bug", wormadam_types[form]]

    # add stats
    pokeset["stats"] = {}
    for statname in stats.statnames:
        basestat = species["basestats"][statname]
        ev = evs[statname]
        iv = ivs[statname]
        level = pokeset["level"]
        pokeset["stats"][statname] = stats.calculate_stat(
            basestat, ev, iv, statname, nature, level)

    # special case: Shedinja. Always 1 HP
    if species["name"] == "Shedinja":
        pokeset["stats"]["hp"] = 1

    # add shininess to display name
    if pokeset["shiny"] and not custom_displayname:
        pokeset["displayname"] += " (Shiny)"

    # add autogenerated tags
    if pokeset["biddable"]:
        pokeset["tags"].append("biddable")
    if pokeset["hidden"]:
        pokeset["tags"].append("hidden")
    if pokeset["shiny"]:
        pokeset["tags"].append("shiny")
    pokeset["tags"].append("species+%d" % pokeset["species"]["id"])
    pokeset["tags"].append("species+%s" % normalize_name(pokeset["species"]["name"]))
    for type_ in pokeset["species"]["types"]:
        pokeset["tags"].append("type+%s" % type_.lower())
    pokeset["tags"].append("level+%d" % pokeset["level"])
    pokeset["tags"].append("form+%d" % pokeset["form"])
    for ability_ in pokeset["ability"]:
        if ability_:
            pokeset["tags"].append("ability+%s" % normalize_name(str(ability_["name"])))
    pokeset["tags"].append("setname+%s" % normalize_name(pokeset["setname"]))
    if pokeset["rarity"] > 0:
        pokeset["tags"].append("matchmaker-enabled")
    # ensure no duplicate tags
    pokeset["tags"] = sorted(set(pokeset["tags"]))

    # check combinations and separations
    combinations = pokeset["combinations"]
    if not isinstance(combinations, list) or not all(isinstance(c, list) for c in combinations):
        raise ValueError("combinations must be a list of lists.")
    if not all(isinstance(s, str) or s is None for s in chain(*combinations)):
        raise ValueError("combination items must be strings or null")
    separations = pokeset["separations"]
    if not isinstance(separations, list) or not all(isinstance(s, list) for s in separations):
        raise ValueError("separations must be a list of lists.")
    if not all(isinstance(s, str) or s is None for s in chain(*separations)):
        raise ValueError("separation items must be strings or null")
    movenames = sum([movelist for movelist in pokeset["moves"]], [])
    movenames = list(set(move["name"] for move in movenames))
    all_things = (movenames
                  + [p["name"] for p in pokeset["item"]]
                  + [a["name"] for a in pokeset["ability"]])
    # names that appear more than once cannot be referenced unambiguously
    ambiguities = set(item for item, count in Counter(all_things).items() if count > 1)
    all_things = set(all_things)
    for com in combinations:
        if any(c in ambiguities for c in com):
            raise ValueError("Can't use %s in combinations, as it is ambiguous."
                             % (com,))
        rest = set(com) - all_things
        # fuzzy-resolve unknown references against the set's known names
        for r in list(rest):
            if not r:
                continue
            for thing in all_things - {None}:
                if ratio(thing.lower(), r.lower()) > 0.9:
                    if is_difference_significant(thing, r):
                        warn("Didn't recognize combination %s, but assumed %s."
                             % (r, thing))
                    rest.remove(r)
                    com.remove(r)
                    com.append(thing)
                    break
        if rest:
            raise ValueError("All things referenced in combination must be present in set. Missing: %s"
                             % ", ".join(rest))
    for sep in separations:
        if any(s in ambiguities for s in sep):
            raise ValueError("Can't use %s in separations, as it is ambiguous."
                             % (sep,))
        rest = set(sep) - all_things
        for r in list(rest):
            if not r:
                continue
            for thing in all_things - {None}:
                if ratio(thing.lower(), r.lower()) > 0.9:
                    if is_difference_significant(thing, r):
                        warn("Didn't recognize separation %s, but assumed %s."
                             % (r, thing))
                    rest.remove(r)
                    sep.remove(r)
                    sep.append(thing)
                    break
        if rest:
            raise ValueError("All things referenced in separation must be present in set. Missing: %s"
                             % ", ".join(rest))

    # TODO validate that the combinations and separations even allow for a functioning set to be generated
    return pokeset
def cleanCommodities(self, data, levels):
    """Normalize OCR'd commodity rows in place.

    Column 0 is snapped to the closest entry of self.comm_list by
    Levenshtein distance; if `levels` is truthy, columns 4 and 6 are snapped
    to the closest level name in self.levels[self.lang]. Mutates the cell
    objects (value / confidence / optional_values); returns None.
    (Python 2 code: xrange, unicode, print statements.)
    """
    for i in xrange(len(data)):
        if not data[i][0] is None:
            # find the commodity with the smallest edit distance, collecting
            # every candidate within distance 7 as an alternative suggestion
            mindist = 100
            topcomm = ""
            alternatives = []
            for comm in self.comm_list:
                #print data[i][0].value
                #print unicode(comm)
                dist = distance(unicode(data[i][0].value), unicode(comm))
                if dist < 7:
                    alternatives.append((unicode(comm), dist))
                if dist < mindist:
                    mindist = dist
                    topcomm = comm
                if dist == 0:
                    # exact match: accept immediately with full confidence
                    data[i][0].value = topcomm
                    data[i][0].confidence = 1.0
                    break
            #print unicode(data[i][0].value)
            #print topcomm
            #print alternatives.sort(key=lambda x: x[1])
            # NOTE(review): the sort above is commented out, so alternatives
            # keep comm_list order rather than nearest-first — confirm
            # whether the sort was meant to be live.
            optional_values = [j[0] for j in alternatives]
            # shorter values get a stricter acceptance threshold
            maxdist = 4
            if len(data[i][0].value) < 5:
                maxdist = 3
            if mindist < maxdist:
                data[i][0].value = topcomm
                if mindist < 2:
                    data[i][0].confidence = 1.0
                else:
                    data[i][0].confidence = 0.7
                if mindist != 0:
                    data[i][0].optional_values = [data[i][0].value] + optional_values
            else:
                # no acceptable match: keep the raw value, zero confidence
                data[i][0].confidence = 0.0
                data[i][0].optional_values = [data[i][0].value] + optional_values
        # LOW MED HIGH
        if not data[i][4] is None and levels:
            try:
                topratio = 0.0
                toplev = ""
                for lev in self.levels[self.lang]:
                    if data[i][4].value is None:
                        print "None!"
                    rat = ratio(unicode(data[i][4].value), unicode(lev))
                    if rat > topratio:
                        topratio = rat
                        toplev = lev
                data[i][4].value = toplev
            except:
                # NOTE(review): bare except silently swallows any failure
                # (best-effort normalization) — consider narrowing.
                pass
        if not data[i][6] is None and levels:
            try:
                topratio = 0.0
                toplev = ""
                for lev in self.levels[self.lang]:
                    # NOTE(review): unlike column 4, the value is not wrapped
                    # in unicode() here — possibly why the except below is
                    # needed; confirm and align.
                    rat = ratio(data[i][6].value, unicode(lev))
                    if rat > topratio:
                        topratio = rat
                        toplev = lev
                data[i][6].value = toplev
            except:
                pass
def findBestMatch(needle, haystack):
    """Return the element of haystack most similar to needle.

    Similarity is the Levenshtein ratio; ties keep the earliest element.
    An empty haystack raises ValueError (from max()).
    """
    def similarity(candidate):
        return ratio(needle, candidate)

    return max(haystack, key=similarity)
def similar(string, array):
    """Levenshtein ratio of *string* against the first element of *array*.

    NOTE(review): the `return` inside the loop exits on the very first
    iteration, so only array[0] is ever compared and the remaining elements
    are ignored (returns None for an empty array). This looks like a bug —
    the intent was presumably to aggregate (e.g. max) over all elements;
    confirm against callers before changing.
    """
    for i in array:
        return ratio(string, i)
# Scan the quote list once to find length bounds, used below to cheaply
# filter out comments that cannot plausibly match any quote.
# NOTE(review): `quotes`, `greatest_length`, `least_length` and `reddit` are
# defined earlier in the file (outside this excerpt).
for quote in quotes:
    quote_len = len(quote)
    if quote_len > greatest_length:
        greatest_length = quote_len
    if quote_len < least_length:
        least_length = quote_len

# Stream new comments from several subreddits indefinitely and print those
# whose text is >75% similar (Levenshtein ratio) to a known quote.
for comment in reddit.subreddit(
        'AskReddit+movies+funny+pics').stream.comments():
    text = comment.body.lower()
    len_text = len(text)
    # length pre-filter: skip comments whose length (with slack) falls
    # outside the range of quote lengths
    if len_text + 7 > greatest_length or len_text - 4 < least_length:
        continue
    greatest = 0
    best_quote = ''
    print(text)
    for quote in quotes:
        value = ratio(text, quote)
        if value > .75:
            print(text, quote)
        # disabled best-match tracking, kept as-is (inert string statement)
        """
        if value > greatest:
            greatest = value
            best_quote = quote
        """
    # print(greatest, text, best_quote)
def edit_dist_of(sent0, sent1, item):
    """Levenshtein ratio between sent0[i] and sent1[j], where item == (i, j)."""
    i, j = item
    return ratio(sent0[i], sent1[j])
def similarity_ratios(l):
    """Map each ordered pair (l[i], l[j]) with i < j to its Levenshtein ratio."""
    return {
        (first, second): ratio(first, second)
        for idx, first in enumerate(l)
        for second in l[idx + 1:]
    }
def comparar_cad(cadena1, cadena2):
    """Return True when the two strings are at least 80% similar (Levenshtein ratio)."""
    return ratio(cadena1, cadena2) >= 0.8
def apply_ratio(col1, col2):
    """Levenshtein similarity ratio of two values (thin wrapper, e.g. for DataFrame.apply)."""
    similarity = ratio(col1, col2)
    return similarity
# Load the CSMAR and BHC variable-code tables from CSV.
# NOTE(review): `pd`, `np`, `os` and `base_dir` are defined earlier in the
# file (outside this excerpt).
csmar_codes = pd.read_csv(os.path.join(base_dir, "CSMAR_Codes.csv"))
bhc_codes = pd.read_csv(os.path.join(base_dir, "BHC_Codes.csv"))
# bare expressions — notebook-style cell output; no effect when run as a script
csmar_codes
bhc_codes
csmar_codes['CSMAR Variable Description']
bhc_codes['Variable Description']
from scipy.spatial.distance import cdist
from Levenshtein import ratio
# Pairwise Levenshtein similarity between every BHC description (rows) and
# every CSMAR description (columns); cdist invokes the lambda once per pair.
arr1 = np.array(csmar_codes['CSMAR Variable Description'])
arr2 = np.array(bhc_codes['Variable Description'])
matrix = cdist(arr2.reshape(-1, 1), arr1.reshape(-1, 1), lambda x, y: ratio(x[0], y[0]))
df = pd.DataFrame(data=matrix, index=arr2, columns=arr1)
# transpose so rows are CSMAR descriptions and columns are BHC descriptions
df_sim = df.transpose()
# Print, for each CSMAR description, every BHC description scoring above 0.7.
sim_score = .7
for ind in range(0, len(df_sim.index)):
    if len(df_sim.iloc[ind, :][df_sim.iloc[ind, :] > sim_score]) > 0:
        print("CSMAR index:", df_sim.index[ind])
        print(df_sim.iloc[ind, :][df_sim.iloc[ind, :] > sim_score])
#Get relevant BHC and CSMAR Data.
def ratio(self):
    """Lazily compute and cache the Levenshtein ratio of the two stored strings.

    The method name shadows the module-level `ratio` function it calls; the
    call inside still resolves to the global, not this method.
    NOTE: a cached value of 0.0 is falsy and gets recomputed on each call.
    """
    if self._ratio:
        return self._ratio
    self._ratio = ratio(self._str1, self._str2)
    return self._ratio
def compare(string1, string2):
    """Character-level similarity of two strings via the Levenshtein ratio.

    Returns a float in [0, 1]: 0 means no overlap, 1 means identical.
    """
    return ratio(string1, string2)
def levenshtein(pair_of_sentences):
    """Case-insensitive Levenshtein ratio of the first two items of a pair."""
    score = ratio(pair_of_sentences[0].lower(), pair_of_sentences[1].lower())
    assert 0 <= score <= 1
    return score
def test_levenshtein_ratio():
    """levenshtein._ratio must agree with Levenshtein.ratio on normalized input."""
    expected = ratio(normalize_input(TORONTO.name), KEY)
    assert levenshtein._ratio(TORONTO.name, KEY) == expected