def compute_levensthein_distances_in_ground_truth():
    """Print the matcher ratio for every document pair of the ground truth.

    Each row of ``test-lab.xlsx`` supplies two document names and a label.
    A name is expanded to the last entry of the loaded labels that starts
    with it (the name itself is kept when nothing matches), the two
    corresponding documents are compared with ``StringMatcher`` and one
    comma-separated line ``doc1,doc2,label,ratio`` is printed per row.
    """
    labels, docs = load_data()
    _s, _y_t = load_SME_binary()

    def resolve(prefix):
        # The last label starting with the prefix wins; fall back to the prefix.
        full_name = prefix
        for candidate in labels:
            if candidate.startswith(prefix):
                full_name = candidate
        return full_name

    rows = pd.read_excel(r'test-lab.xlsx').values
    for row in rows:
        first = row[0].strip()
        first_full = resolve(first)
        second = row[1].strip()
        second_full = resolve(second)
        pair_label = row[2]
        matcher = StringMatcher(
            seq1=docs[labels.index(first_full)],
            seq2=docs[labels.index(second_full)],
        )
        print(first, second, pair_label, matcher.ratio(), sep=",")
    return
async def getBannedUser(ctx, user):
    """Return the banned user of ``ctx.guild`` that best matches ``user``.

    The search term is lower-cased and compared, via
    ``StringMatcher.quick_ratio``, against several textual forms of every
    banned account: name, ``name#discriminator``, mention, display name,
    discriminator (with and without ``#``) and id.  For each account the
    first of those forms scoring above 0.5 becomes its score, and the
    account with the highest score is returned.

    Args:
        ctx: Command context; only ``ctx.guild.bans()`` is used.
        user: Anything whose ``str()`` identifies the wanted account.

    Returns:
        The ``user`` attribute of the best matching ban entry.

    Raises:
        ValueError: If no banned account scores above 0.5.
    """
    query = str(user).lower()
    matcher = StringMatcher()

    # Fetch the ban list once and remember every account by id, so the
    # winner can be returned without asking the API a second time.
    banned_by_id = {}
    matches = {}
    for entry in await ctx.guild.bans():
        banned = entry.user
        discriminator = str(banned.discriminator)
        banned_id = str(banned.id)
        banned_by_id[banned_id] = banned

        # Same forms, in the same priority order, as before.
        forms = (
            banned.name,
            banned.name + "#" + discriminator,
            "<@!" + banned_id + ">",
            banned.display_name,
            discriminator,
            "#" + discriminator,
            banned_id,
        )
        for form in forms:
            matcher.set_seqs(query, form.lower())
            score = matcher.quick_ratio()
            if score > .5:
                matches[banned_id] = score
                break

    if not matches:
        # Report the search term; the old code reported a loop variable
        # that had shadowed it.
        raise ValueError(f"NoSuchUser: {query}")

    best_id = max(matches.items(), key=itemgetter(1))[0]
    return banned_by_id[best_id]
def levenshtein(string1, string2):
    """Return the case-insensitive ``StringMatcher`` distance of two strings.

    ``None`` is treated as the empty string for either argument.
    """
    left = "" if string1 is None else string1
    right = "" if string2 is None else string2
    return StringMatcher(seq1=left.lower(), seq2=right.lower()).distance()
def compare(self, first_statement, second_statement):
    """Return the case-insensitive similarity ratio of two strings.

    Both strings are lower-cased, compared with ``SequenceMatcher`` and the
    resulting ratio is rounded to 10 decimal places.
    """
    lowered_pair = (first_statement.lower(), second_statement.lower())
    ratio = SequenceMatcher(None, *lowered_pair).ratio()
    return round(ratio, 10)
def get_match_score(phrase, words, min_distance=2):
    """Score how well the items of ``phrase`` are matched by ``words``.

    Every (phrase item, word) pair whose ``StringMatcher`` distance is at
    most ``min_distance`` contributes ``max(0, len(item) - distance)``
    divided by the combined length of all phrase items.

    Returns:
        The accumulated score (``0`` when nothing matched).
    """
    total_chars = len(''.join(phrase))
    total = 0
    for part in phrase:
        part_len = len(part)
        for candidate in words:
            gap = StringMatcher(seq1=part, seq2=candidate).distance()
            if gap <= min_distance:
                # Accumulate term by term to keep float rounding unchanged.
                total += max(0, part_len - gap) / total_chars
    return total
def match_ngrams(tokens: List, min_similarity=.96):
    """Fuzzy-match the tri-, bi- and unigrams of ``tokens`` against topics.

    N-grams are tried from longest to shortest.  An n-gram is compared only
    with the topics stored in ``TOPIC_STEMS`` under its first four
    characters, and a topic is accepted when the ``StringMatcher`` ratio is
    at least ``min_similarity``.  An accepted topic is replaced by its entry
    in ``CSO['primary_labels']`` when one exists.

    Returns:
        A ``defaultdict`` mapping each topic to a list of
        ``{'matched': <n-gram>, 'similarity': <ratio>}`` dicts.
    """
    found_topics = defaultdict(list)
    consumed = set()
    for size in (3, 2, 1):
        # The index identifies the same starting token for every n-gram size.
        for start, ngram in enumerate(ngrams(tokens, size)):
            if start in consumed:
                # This starting token already produced a match.
                continue
            text = " ".join(ngram)
            try:
                # Only topics sharing the first four characters are candidates.
                candidates = TOPIC_STEMS[text[:4]]
            except KeyError:
                continue
            for candidate in candidates:
                score = StringMatcher(None, candidate, text).ratio()
                if score >= min_similarity:
                    try:
                        # Prefer the primary label when the ontology has one.
                        label = CSO['primary_labels'][candidate]
                    except KeyError:
                        label = candidate
                    found_topics[label].append({
                        'matched': text,
                        'similarity': score,
                    })
                    consumed.add(start)
    return found_topics
def make_comparator():
    """Build a two-string distance function backed by one reused matcher.

    Returns:
        A callable ``compare(str1, str2)`` that loads both strings into a
        single shared ``StringMatcher`` and returns its ``distance()``.
    """
    shared_matcher = StringMatcher()

    def compare(str1, str2):
        # Reuse the one matcher instead of creating a new one per call.
        shared_matcher.set_seqs(str1, str2)
        return shared_matcher.distance()

    return compare
def statistic_similarity(self):
    """Match n-grams of the paper's syntactic chunks against ontology topics.

    For every chunk returned by ``self.paper.get_syntactic_chunks()`` the
    n-grams produced by ``self.get_ngrams`` are compared with the topics
    stored in ``self.cso.topic_stems`` under the n-gram's first four
    characters.  A topic is accepted when the ``StringMatcher`` ratio is at
    least ``self.min_similarity``; an accepted topic is replaced by its entry
    in ``self.cso.primary_labels`` when one exists.  Every accepted n-gram is
    also added to ``self.explanation`` under its topic.

    Returns:
        found_topics (dictionary): maps each found topic to a list of
            ``{'matched': <n-gram>, 'similarity': <ratio>}`` entries.
    """
    found_topics = dict()
    concepts = self.paper.get_syntactic_chunks()
    for concept in concepts:
        # Positions of bigrams/trigrams that already matched, tracked per concept.
        matched_trigrams = set()
        matched_bigrams = set()
        for comprehensive_grams in self.get_ngrams(concept):
            position = comprehensive_grams["position"]
            size = comprehensive_grams["size"]
            grams = comprehensive_grams["ngram"]
            # if we already matched the current token to a topic, don't reprocess it:
            # a unigram is skipped when a bigram matched at this or the previous position ...
            if size <= 1 and (position in matched_bigrams or position - 1 in matched_bigrams):
                continue
            # ... and uni-/bigrams are skipped when a trigram matched at this or the two previous positions
            if size <= 2 and (position in matched_trigrams or position - 1 in matched_trigrams or position - 2 in matched_trigrams):
                continue
            # otherwise unsplit the ngram for matching so ('quick', 'brown') => 'quick brown'
            gram = " ".join(grams)
            try:
                # if there isn't an exact match on the first 4 characters of the ngram and a topic, move on
                # topic_block = [key for key, _ in self.cso.topics.items() if key.startswith(gram[:4])]
                topic_block = self.cso.topic_stems[gram[:4]]
            except KeyError:
                continue
            for topic in topic_block:
                # otherwise look for an inexact match
                match_ratio = StringMatcher(None, topic, gram).ratio()
                if match_ratio >= self.min_similarity:
                    try:
                        # if a 'primary label' exists for the current topic, use it instead of the matched topic
                        topic = self.cso.primary_labels[topic]
                    except KeyError:
                        pass
                    # note the tokens that matched the topic and how closely
                    if topic not in found_topics:
                        found_topics[topic] = list()
                    found_topics[topic].append({
                        'matched': gram,
                        'similarity': match_ratio
                    })
                    # don't reprocess the current token
                    if size == 2:
                        matched_bigrams.add(position)
                    elif size == 3:
                        matched_trigrams.add(position)
                    # explanation bit: remember which n-grams led to each topic
                    if topic not in self.explanation:
                        self.explanation[topic] = set()
                    self.explanation[topic].add(gram)
    return found_topics
def find_matches(self, candidates, verbose=False):
    """Find, for each n-gram of the candidates, its best matching key.

    The n-grams produced by ``self.__get_ngrams__`` for every candidate are
    compared with the entries stored in ``self.key_stems`` under the n-gram's
    first four characters.  An entry is accepted when the ``StringMatcher``
    ratio is at least ``self.min_similarity``; for each n-gram only the
    entry with the highest ratio seen so far is kept.

    Args:
        candidates: Iterable of items passed one by one to
            ``self.__get_ngrams__``.
        verbose: When true, print a start message and every n-gram tried.

    Returns:
        dict mapping each matched n-gram to
        ``{'matched': <entry>, 'similarity': <ratio>}``.
    """
    if verbose:
        print('Searching for matches')
    found_topics = dict()
    for candidate in candidates:
        # Positions of bigrams/trigrams that already matched, tracked per candidate.
        matched_trigrams = set()
        matched_bigrams = set()
        for comprehensive_grams in self.__get_ngrams__(candidate):
            position = comprehensive_grams["position"]
            size = comprehensive_grams["size"]
            grams = comprehensive_grams["ngram"]
            # if we already matched the current token to a topic, don't reprocess it:
            # a unigram is skipped when a bigram matched at this or the previous position ...
            if size <= 1 and (position in matched_bigrams or position - 1 in matched_bigrams):
                continue
            # ... and uni-/bigrams are skipped when a trigram matched at this or the two previous positions
            if size <= 2 and (position in matched_trigrams or position - 1 in matched_trigrams or position - 2 in matched_trigrams):
                continue
            # otherwise unsplit the ngram for matching so ('quick', 'brown') => 'quick brown'
            gram = ' '.join(grams)
            if verbose:
                print(gram)
            try:
                # if there isn't an exact match on the first 4 characters of the ngram and a topic, move on
                #topic_block = [key for key, _ in self.cso.topics.items() if key.startswith(gram[:4])]
                topic_block = self.key_stems[gram[:4]]
            except KeyError:
                continue
            for topic in topic_block:
                # otherwise look for an inexact match
                match_ratio = StringMatcher(None, topic, gram).ratio()
                if match_ratio >= self.min_similarity:
                    #try:
                    #    # if a 'primary label' exists for the current topic, use it instead of the matched topic
                    #    topic = self.cso.primary_labels[topic]
                    #except KeyError:
                    #    pass
                    # note the tokens that matched the topic and how closely;
                    # an existing entry is only replaced by a strictly better match
                    if gram in found_topics:
                        current_similarity = found_topics[gram]['similarity']
                        if current_similarity >= match_ratio:
                            continue
                    found_topics[gram] = {
                        'matched': topic,
                        'similarity': match_ratio
                    }
                    # don't reprocess the current token
                    if size == 2:
                        matched_bigrams.add(position)
                    elif size == 3:
                        matched_trigrams.add(position)
    return found_topics
def char_to_char(source: str, target: str) -> np.ndarray:
    """Build the character mapping matrix from ``source`` to ``target``.

    The matching blocks reported by ``StringMatcher`` for the two strings
    are handed, together with both string lengths, to ``_mat_from_blocks``,
    whose result is returned unchanged.

    Args:
        source (str): string of source chars.
        target (str): string of target chars.

    Returns:
        np.ndarray produced by ``_mat_from_blocks`` for the matching blocks.
    """
    blocks = StringMatcher(seq1=source, seq2=target).get_matching_blocks()
    n_source, n_target = len(source), len(target)
    return _mat_from_blocks(blocks, n_source, n_target)
def levenshtein_distance(statement, other_statement):
    """
    Score how similar the text of two statements is.

    Both texts are lower-cased and compared with a sequence matcher
    (python-Levenshtein when installed, ``difflib`` otherwise).

    :return: The similarity ratio rounded to whole percent, as a fraction
        between 0 and 1; 0 when either text is falsy.
    :rtype: float
    """
    import sys

    # Use python-Levenshtein if available
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    if not (statement.text and other_statement.text):
        return 0

    # Python 2 needs ``unicode`` text, Python 3 uses ``str``.
    to_text = unicode if sys.version_info[0] < 3 else str  # noqa: F821
    left = to_text(statement.text.lower())
    right = to_text(other_statement.text.lower())

    ratio = SequenceMatcher(None, left, right).ratio()
    return int(round(100 * ratio)) / 100.0
def levenshtein_distance(statement, other_statement):
    """
    Score how similar the text of two statements is.

    Both texts are lower-cased and compared with a sequence matcher
    (python-Levenshtein when installed, ``difflib`` otherwise).

    :return: The similarity ratio rounded to two decimals; 0 when either
        text is falsy.
    :rtype: float
    """
    import sys

    # Use python-Levenshtein if available
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    major_version = sys.version_info[0]

    # Nothing to compare when one side has no text.
    if not statement.text:
        return 0
    if not other_statement.text:
        return 0

    lowered = [statement.text.lower(), other_statement.text.lower()]
    if major_version < 3:
        lowered = [unicode(text) for text in lowered]  # noqa: F821
    else:
        lowered = [str(text) for text in lowered]

    matcher = SequenceMatcher(None, lowered[0], lowered[1])
    return round(matcher.ratio(), 2)
def compute_levensthein_distances_in_clusters(output_file="", embedder="doc2vec"):
    """Print the pairwise matcher ratios of the documents in each cluster.

    For every cluster label except ``-1`` one line is printed: the label
    followed by the ``StringMatcher`` ratio of every pair of cluster
    members, all comma-terminated.  A member is mapped to the last loaded
    document label that starts with the member's name.

    Args:
        output_file: Not used by this function.
        embedder: ``"doc2vec"`` or ``"sbert"``; selects where the cluster
            assignment comes from.  Any other value prints an error and
            returns.
    """
    doc_labels, docs = load_data()
    if embedder == "doc2vec":
        l, x, labels = perform_clustering()
    elif embedder == "sbert":
        l, x, labels = sbert_labels()
    else:
        return print("wrong input to eval method")

    for cluster_id in set(l):
        if cluster_id == -1:
            continue
        print(cluster_id, end=",")

        # Members assigned to the current cluster.
        members = [name for name, assigned in labels.items() if assigned == cluster_id]

        for a, b in combinations(members, 2):
            # -1 (i.e. the last document) is used when no label matches.
            idx_a = idx_b = -1
            for position, doc_label in enumerate(doc_labels):
                if doc_label.startswith(a):
                    idx_a = position
                elif doc_label.startswith(b):
                    idx_b = position
            pair_matcher = StringMatcher(seq1=docs[idx_a], seq2=docs[idx_b])
            print(pair_matcher.ratio(), end=",")
        print()
    return
def levenshtein_distance(statement, other_statement):
    """
    Compare two statements based on the Levenshtein distance
    (fuzzy string comparison) of each statement's text.

    Both texts are lower-cased and compared with a sequence matcher
    (python-Levenshtein when installed, ``difflib`` otherwise).

    :param statement: Object with a ``text`` attribute.
    :param other_statement: Object with a ``text`` attribute.
    :return: The similarity ratio rounded to whole percent, as a fraction
        between 0 and 1; 0 when either text is falsy.
    :rtype: float
    """
    import sys

    # Use python-Levenshtein if available
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    PYTHON = sys.version_info[0]

    # Return 0 if either statement has a falsy text value.  The second
    # operand used to re-test ``statement.text``, so a missing text on
    # ``other_statement`` slipped through and crashed on ``.lower()``.
    if not statement.text or not other_statement.text:
        return 0

    # Get the lowercase version of both strings
    if PYTHON < 3:
        statement_text = unicode(statement.text.lower())  # noqa: F821
        other_statement_text = unicode(other_statement.text.lower())  # noqa: F821
    else:
        statement_text = str(statement.text.lower())
        other_statement_text = str(other_statement.text.lower())

    similarity = SequenceMatcher(
        None,
        statement_text,
        other_statement_text
    )

    # Calculate a decimal percent of the similarity
    percent = int(round(100 * similarity.ratio())) / 100.0

    return percent
def compare(self, statement, other_statement):
    """
    Compare the two input statements.

    :return: The similarity between the two sentences, rounded to two
        decimals; 0 when either text is falsy.
    :rtype: float
    """
    # Nothing to compare when one side has no text.
    if not (statement.text and other_statement.text):
        return 0

    lowered_a = str(statement.text.lower())
    lowered_b = str(other_statement.text.lower())

    ratio = SequenceMatcher(None, lowered_a, lowered_b).ratio()
    return round(ratio, 2)
def compare(self, statement, other_statement):
    """
    Score how similar the text of the two input statements is.

    Both texts are lower-cased and compared with a sequence matcher
    (python-Levenshtein when installed, ``difflib`` otherwise).

    :return: The similarity ratio rounded to two decimals; 0 when either
        text is falsy.
    :rtype: float
    """
    import sys

    # Use python-Levenshtein if available
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    # Nothing to compare when one side has no text.
    if not statement.text:
        return 0
    if not other_statement.text:
        return 0

    if sys.version_info[0] >= 3:
        first_text = str(statement.text.lower())
        second_text = str(other_statement.text.lower())
    else:
        first_text = unicode(statement.text.lower())  # NOQA
        second_text = unicode(other_statement.text.lower())  # NOQA

    return round(SequenceMatcher(None, first_text, second_text).ratio(), 2)
def _search(self, artist, title=None):
    """Search songlyrics.com and return the links of plausible matches.

    The artist name is lower-cased and passed through
    ``self._remove_accents`` before the query is built.  A search result is
    kept when its artist is within a ``StringMatcher`` distance of 5 of the
    requested artist and, if ``title`` is given, its title is within a
    distance of 10 of the requested title.

    Args:
        artist: Artist name to search for.
        title: Optional song title; when ``None`` only the artist is
            searched and compared.

    Returns:
        List of ``href`` values of the accepted results (possibly empty).
    """
    artist = self._remove_accents(artist.lower())
    if title is not None:
        search = urllib.parse.quote_plus("%s %s" % (artist, title))
    else:
        search = urllib.parse.quote_plus(artist.encode('utf-8'))
    url = u'http://www.songlyrics.com/index.php?section=search&searchW=%s&submit=Search&searchIn1=artist&searchIn3=song' % search
    user_agent = "(X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"
    req = Request(url, headers={'User-Agent': user_agent})

    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(req) as content:
        page = lxml.html.parse(content)

    resp = []
    for link in page.getroot().cssselect('.serpresult'):
        current_song = link.cssselect('h3>a')
        if not current_song or current_song[0].text is None:
            continue
        artist_links = link.cssselect('.serpdesc-2>p>a')
        if not artist_links or artist_links[0].text is None:
            # Without an artist name the result cannot be compared.
            continue

        song_title = current_song[0].text.replace('Lyrics', '')
        song_artist = artist_links[0].text

        dist_artist = StringMatcher(
            seq1=artist.lower(), seq2=song_artist.lower()).distance()
        if dist_artist >= 5:
            continue

        # Only filter on the title when one was requested.  The previous
        # condition (`title != None or dist_title < 10`) was always true,
        # so the title distance never rejected anything.
        if title is not None:
            dist_title = StringMatcher(
                seq1=title.lower(), seq2=song_title.lower()).distance()
            if dist_title >= 10:
                continue

        resp.append(current_song[0].get('href'))
    return resp
def get_loosly_matching_keyword(self, term):
    """Return the stored key that most loosely matches ``term``.

    The keys of ``self.tokenized_keys_`` sharing the largest number of
    tokens with ``self.tokenize_text(term)`` are shortlisted (all keys when
    no token is shared); among those, the key with the smallest
    ``StringMatcher`` distance to ``term`` wins, the earliest one on ties.

    Returns:
        The selected key, or ``None`` when there is no key to choose from.
    """
    term_tokens = self.tokenize_text(term)

    best_overlap = 0
    shortlist = []
    for key, key_tokens in self.tokenized_keys_.items():
        overlap = 0
        for token in term_tokens:
            if token in key_tokens:
                overlap += 1
        if overlap > best_overlap:
            # A new best overlap starts a fresh shortlist.
            best_overlap = overlap
            shortlist = [key]
        elif overlap == best_overlap:
            shortlist.append(key)

    closest_key = None
    smallest_gap = 9999999
    for key in shortlist:
        gap = StringMatcher(seq1=key, seq2=term).distance()
        if gap < smallest_gap:
            smallest_gap, closest_key = gap, key
    return closest_key
def find_closest_match(types_dict, search_str):
    """Return the key of ``types_dict`` closest to ``search_str`` and its value.

    Closeness is the ``StringMatcher`` distance; only keys strictly closer
    than ``DEFAULT_DISTANCE`` are considered and the earliest key wins ties.
    When no key qualifies the lookup falls back to the key ``''``, which
    raises ``KeyError`` unless ``types_dict`` contains it.

    Returns:
        Tuple ``(closest_key, types_dict[closest_key])``.
    """
    best_key, best_distance = '', DEFAULT_DISTANCE
    for candidate in types_dict:
        distance = StringMatcher(seq1=search_str, seq2=candidate).distance()
        if distance < best_distance:
            best_key, best_distance = candidate, distance
    return best_key, types_dict[best_key]
def get_artists(self, region, country, number_pages=4):
    """Collect artist names mentioned on freemuse.org for one country.

    The first ``number_pages`` listing pages of the region/country are
    downloaded; the PERSON and ORG entities that ``self._nlp`` finds in each
    item's title and teaser are looked up in MusicBrainz.  The first
    MusicBrainz hit is kept when its name is within a ``StringMatcher``
    distance of 5 of the entity and its country equals the requested one.
    Progress and failures are printed.

    Args:
        region: Region path segment of the freemuse URL.
        country: Country name; lower-cased and resolved with ``pycountry``.
        number_pages: Number of listing pages to fetch.

    Returns:
        ``{country_iso: [artist, ...]}`` with the accepted MusicBrainz names.
    """
    country = country.lower()
    country_iso = pycountry.countries.get(name=country).alpha_2
    artists = set()

    for page_number in range(1, number_pages + 1):
        url = "/".join([
            "https://freemuse.org/regions", region, country, "page",
            str(page_number)
        ])
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')

        for item in soup.find_all(class_="item-list"):
            title_box = item.find(class_="post-box-title")
            body_box = item.find(class_="entry")
            title = title_box.find('a').getText()
            article = body_box.find('p').getText()
            doc = self._nlp(f'{title}. {article}')

            # sometimes actually ORG is relevant too
            names = {
                ent.text for ent in doc.ents
                if ent.label_ in ('PERSON', 'ORG')
            }
            for entity in names:
                # cross check person with musicbrainz so we make sure the
                # artist exists in the specified country
                # TODO: not very robust. find some way to double check
                try:
                    query = musicbrainzngs.search_artists(
                        artist=entity, country=country_iso)
                    top_hit = query['artist-list'][0]
                    artist_in_musicbrainz = top_hit['name']
                    dist_artist = StringMatcher(
                        seq1=artist_in_musicbrainz.lower(),
                        seq2=entity.lower()).distance()
                    if dist_artist < 5 and top_hit['country'] == country_iso:
                        artists.add(artist_in_musicbrainz)
                        print(f'Success! Found artist {entity}')
                    else:
                        print(
                            f'Found name {entity} but not in Musicbrainz, closest artists is {artist_in_musicbrainz}'
                        )
                except Exception as e:
                    print(f'Something failed for {entity}', e)

    return ({country_iso: list(artists)})
def get_artists_from_freemuse(region, country):
    """Collect artist names mentioned on a freemuse.org country page.

    The first listing page of the region/country is downloaded; the PERSON
    entities that ``nlp`` finds in each item's title and teaser are looked
    up in MusicBrainz (restricted to the country).  The first MusicBrainz
    hit is kept when its name is within a ``StringMatcher`` distance of 5
    of the entity.  Progress and failures are printed.

    Args:
        region: Region path segment of the freemuse URL.
        country: Country path segment, also resolved with ``pycountry``.

    Returns:
        Set of accepted MusicBrainz artist names.
    """
    artists = set()
    # TODO: parse through all pages not just first one
    url = "https://freemuse.org/regions" + "/" + region + "/" + country
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    for item in soup.find_all(class_="item-list"):
        entry_title = item.find(class_="post-box-title")
        entry = item.find(class_="entry")
        title = entry_title.find('a').getText()
        article = entry.find('p').getText()
        doc = nlp(f'{title}. {article}')

        # keep only persons
        # TODO: sometimes actually ORG is relevant too
        entities = {x.text for x in doc.ents if x.label_ == 'PERSON'}

        for entity in entities:
            # cross check person with musicbrainz so we make sure the artist
            # exists in the specified country
            # TODO: not very robust. find some way to double check
            try:
                artist_in_musicbrainz = musicbrainzngs.search_artists(
                    artist=entity,
                    country=pycountry.countries.get(
                        name=country).alpha_2)['artist-list'][0]['name']
                dist_artist = StringMatcher(
                    seq1=artist_in_musicbrainz.lower(),
                    seq2=entity.lower()).distance()
                # A higher value will allow to match different artists, I
                # think between 5 and 10 should be a good range.
                if dist_artist < 5:
                    artists.add(artist_in_musicbrainz)
                    print(f'Success! Found artist {entity}')
                else:
                    print(
                        f'Found name {entity} but not in Musicbrainz, closest artists is {artist_in_musicbrainz} with distance {dist_artist}'
                    )
            except Exception as exc:
                # Best effort: one failed lookup must not abort the scrape.
                # Unlike the former bare ``except:``, this no longer swallows
                # KeyboardInterrupt/SystemExit, and the cause is reported.
                print(f'Something failed for {entity}', exc)
    return (artists)
def get_requestable_from_sentence(sentence: str, requestable_dict):
    """Find the requestables mentioned in ``sentence``.

    Keys of ``requestable_dict`` that occur verbatim as substrings of the
    raw sentence are returned first.  Only when there is none, every word of
    the punctuation-free, lower-cased sentence is matched to its closest key
    (via ``find_closest_match``) and accepted when the ``StringMatcher``
    distance is at most 2.

    Returns:
        List of ``(key, requestable)`` tuples (possibly empty).
    """
    stripped = re.sub(r'[^\w\s]', '', sentence)
    tokens = re.split(r'\s+', stripped.lower())

    exact_hits = [
        (word, requestable)
        for word, requestable in requestable_dict.items()
        if word in sentence
    ]
    if exact_hits:
        return exact_hits

    fuzzy_hits = []
    for token in tokens:
        closest_match, closest_requestable = find_closest_match(
            requestable_dict, token)
        gap = StringMatcher(seq1=token, seq2=closest_match).distance()
        if gap <= 2:
            fuzzy_hits.append((closest_match, closest_requestable))
    return fuzzy_hits
def char_to_char(self, source: str, target: str) -> Matrix:
    """Map the characters of ``source`` onto those of ``target``.

    The matching blocks that ``StringMatcher`` reports for the two strings
    are passed, with both string lengths, to ``self._mat_from_blocks``.
    """
    # Character-level matching blocks between the two strings.
    blocks = StringMatcher(seq1=source, seq2=target).get_matching_blocks()
    n_source, n_target = len(source), len(target)
    return self._mat_from_blocks(blocks, n_source, n_target)
def levenshtein_ratio(s1, s2):
    """Return the ``StringMatcher`` ratio and distance of two strings.

    Returns:
        Tuple ``(truncate(ratio, 2), distance)``.
    """
    matcher = StringMatcher(None, s1, s2)
    similarity = truncate(matcher.ratio(), 2)
    edits = matcher.distance()
    return similarity, edits
def is_typo(word, word_from_dict):
    """Tell whether ``word`` looks like a typo of ``word_from_dict``.

    A ``StringMatcher`` distance of 1 always counts; a distance of 2 counts
    only when ``fl(word, word_from_dict)`` is truthy.
    """
    gap = StringMatcher(seq1=word, seq2=word_from_dict).distance()
    if gap == 1:
        return True
    # For a distance of 2 the decision is delegated to ``fl``.
    return gap == 2 and fl(word, word_from_dict)
def getMatch(self, seq1, seq2):
    """Record how closely ``seq2`` matches ``seq1``.

    Appends ``(seq2, score)`` to ``self.scores``, where ``score`` is the
    ``StringMatcher.quick_ratio`` of the pair as a percentage rounded to
    two decimals.
    """
    matcher = StringMatcher(seq1=seq2, seq2=seq1)
    percent = round(matcher.quick_ratio() * 100, 2)
    self.scores.append((seq2, percent))
async def getClosestUser(ctx, user, return_member=False):
    """Return ``user`` unchanged.

    This used to fuzzy-match ``user`` against the guild's member list.
    Thanks to discord's new intent system that lookup is broken for bots
    that are not verified, so the function migrated back to ``discord.User``
    and now simply hands back what it was given.  The old matching code
    followed an unconditional ``return`` and could never run, so it has
    been removed.

    Args:
        ctx: Unused; kept so existing callers keep working.
        user: The value to return.
        return_member: Unused; kept so existing callers keep working.

    Returns:
        ``user``, exactly as passed in.
    """
    return user
def getMatchwithThreshold(self, seq1, seq2, score_cutoff=0):
    """Record how closely ``seq2`` matches ``seq1`` if the score is high enough.

    The score is the ``StringMatcher.quick_ratio`` of the pair as a
    percentage rounded to two decimals; ``(seq2, score)`` is appended to
    ``self.scores`` only when the score is at least ``score_cutoff``.
    """
    matcher = StringMatcher(seq1=seq2, seq2=seq1)
    percent = round(matcher.quick_ratio() * 100, 2)
    keep = percent >= score_cutoff
    if keep:
        self.scores.append((seq2, percent))
def compare(self, statement, other_statement):
    """
    Compare the two input statements.

    A character-level similarity (python-Levenshtein when installed,
    ``difflib`` otherwise), rounded to whole percent, is combined with
    ``model.wmdistance`` computed on the cleaned, stop-word-free tokens of
    both texts (presumably gensim's Word Mover's Distance - confirm against
    the module-level ``model``):

    * distance is ``infinity``: the plain similarity is returned;
    * similarity > distance: the similarity is boosted (by 0.08 when the
      two are within 0.25 of each other, by 1.0 otherwise);
    * otherwise, similarity > 0.4: boosted by 0.06 when the distance
      exceeds the similarity by less than 0.15, penalised by 0.04 otherwise;
    * otherwise the plain similarity is returned.

    Every boost/penalty additionally includes ``0.15 * abs(1 - distance)``.
    The module-level ``counter`` is incremented on every comparison.

    :return: The adjusted similarity score; 0 when either statement is falsy.
    :rtype: float
    """
    import sys
    from chatterbot import utils

    global counter
    # global model

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')

    # Use python-Levenshtein if available
    try:
        from Levenshtein.StringMatcher import StringMatcher as SequenceMatcher
    except ImportError:
        from difflib import SequenceMatcher

    PYTHON = sys.version_info[0]

    # Return 0 if either statement is falsy
    if not statement or not other_statement:
        return 0

    # Get the lowercase version of both strings
    # NOTE(review): the Python 2 branch lower-cases the statements
    # themselves while the Python 3 branch uses their ``text`` - confirm
    # which one is intended before touching this.
    if PYTHON < 3:
        statement_text = unicode(statement.lower())  # noqa: F821
        other_statement_text = unicode(other_statement.lower())  # noqa: F821
    else:
        statement_text = str(statement.text.lower())
        other_statement_text = str(other_statement.text.lower())

    similarity = SequenceMatcher(
        None,
        statement_text,
        other_statement_text
    )
    counter += 1

    # Calculate a decimal percent of the similarity
    percent = int(round(100 * similarity.ratio())) / 100.0

    tokens1 = clean_sent(statement_text).lower().split()
    tokens2 = clean_sent(other_statement_text).lower().split()

    # Remove all stop words from the list of word tokens
    s1 = utils.remove_stopwords(tokens1, language='english')
    s2 = utils.remove_stopwords(tokens2, language='english')

    # Computed once; the result used to be calculated a second time into
    # an unused variable.
    distance = model.wmdistance(s1, s2)

    if distance == infinity:
        return percent

    # Term shared by every adjusted score below.
    adjustment = 0.15 * abs(1 - distance)

    if percent > distance:
        if percent - distance < 0.25:
            # decent match
            return percent + 0.08 + adjustment
        # close match
        return percent + 1.0 + adjustment

    if percent > 0.4:
        if distance - percent < 0.15:
            return percent + 0.06 + adjustment
        return (percent - 0.04) - adjustment

    # Low similarity and a larger distance: previously the function fell
    # off the end and returned None, which breaks numeric comparisons in
    # callers.  Return the unadjusted similarity instead.
    return percent
def compare(self, statement, other_statement):
    """
    Score how similar the text of the two input statements is.

    :return: The ``SequenceMatcher`` ratio of the lower-cased texts rounded
        to two decimals; 0 when either text is falsy.
    :rtype: float
    """
    # Nothing to compare when one side has no text.
    if not (statement.text and other_statement.text):
        return 0

    # Python 2 needs ``unicode`` text, Python 3 uses ``str``.
    to_text = unicode if sys.version_info[0] < 3 else str  # NOQA
    first_text = to_text(statement.text.lower())
    second_text = to_text(other_statement.text.lower())

    matcher = SequenceMatcher(None, first_text, second_text)
    return round(matcher.ratio(), 2)
def compare(self, statement, other_statement):
    """
    Score how similar the two input strings are.

    :return: The ``SequenceMatcher`` ratio of the lower-cased strings
        rounded to four decimals; 0 when either string is falsy.
    :rtype: float
    """
    # Nothing to compare when one side is empty.
    if not (statement and other_statement):
        return 0

    lowered_a = str(statement.lower())
    lowered_b = str(other_statement.lower())

    ratio = SequenceMatcher(None, lowered_a, lowered_b).ratio()
    return round(ratio, 4)
def compare(self, statement, other_statement):
    """
    Score how similar the text of the two input statements is.

    :return: The ``SequenceMatcher`` ratio of the lower-cased texts rounded
        to two decimals; 0 when either text is falsy.
    :rtype: float
    """
    # Nothing to compare when one side has no text.
    if not statement.text:
        return 0
    if not other_statement.text:
        return 0

    lowered = [statement.text.lower(), other_statement.text.lower()]
    if sys.version_info[0] < 3:
        lowered = [unicode(text) for text in lowered]  # NOQA
    else:
        lowered = [str(text) for text in lowered]

    matcher = SequenceMatcher(None, lowered[0], lowered[1])
    return round(matcher.ratio(), 2)