def compare_cpe_packages_fuzzy(CVE_SEARCH_JSON_OUTPUT_PATH, deps_path):
    """Fuzzy-match CPE vendors/products against dependency packages/apps.

    Logs (at INFO) every pairing whose fuzzywuzzy token_sort_ratio exceeds
    the module-level THRESHOLD.

    Args:
        CVE_SEARCH_JSON_OUTPUT_PATH: path passed to obtain_vendor_product_dict.
        deps_path: path passed to obtain_package_app_dict.
    """
    vendorproductdict = obtain_vendor_product_dict(CVE_SEARCH_JSON_OUTPUT_PATH)
    packageappdict = obtain_package_app_dict(deps_path)
    for vendor, products in vendorproductdict.items():
        for package, apps in packageappdict.items():
            # Fixed: vendor/package depends on neither product nor app, so
            # compare it once per (vendor, package) instead of once per
            # product x app pair as before (same matches, no log spam).
            logger.debug('comparing vendor %s with package %s', vendor, package)
            ratio = fuzz.token_sort_ratio(package, vendor)
            if ratio > THRESHOLD:
                logger.info('package %s and vendor %s has ratio %s', package, vendor, ratio)
            # vendor/app does not depend on product: compare once per app.
            for app in apps:
                logger.debug('comparing vendor %s with app %s', vendor, app)
                ratio = fuzz.token_sort_ratio(app, vendor)
                if ratio > THRESHOLD:
                    logger.info('app %s and vendor %s in has ratio %s', app, vendor, ratio)
            for product in products:
                # product/package does not depend on app: compare once per product.
                logger.debug('comparing product %s with package %s', product, package)
                ratio = fuzz.token_sort_ratio(package, product)
                if ratio > THRESHOLD:
                    logger.info('package %s and product %s has ratio %s', package, product, ratio)
                for app in apps:
                    logger.debug('comparing product %s with app %s', product, app)
                    ratio = fuzz.token_sort_ratio(app, product)
                    if ratio > THRESHOLD:
                        logger.info('app %s and product %s has ratio %s', app, product, ratio)
def getId(self, title):
    # Look up a movie ID by title via the remote search API (Python 2 code:
    # note the print statements). Exact-title matches (token_sort_ratio == 100)
    # are collected in `found`; near matches (> 85 sort ratio and > 90 partial
    # ratio) in `alt`. A single exact hit is returned directly; multiple hits
    # or alternates fall back to interactive selection via self.movieSelect.
    apiArgs = {'api_key' : self.api_key, 'query' : title}
    query = API_URL + self.api_search + "?" + urlencode(apiArgs)
    apiRequest = Request(query, headers=HEADERS)
    result = urlopen(apiRequest).read()
    data = json.loads(result)
    movieId = None
    found = {}   # exact matches: id -> {title, date}
    alt = {}     # близкие... near matches: id -> {title, date}
    for i in data['results']:
        if i is None:
            continue
        # self.title / self.date are the API-specific result field names
        # (configured elsewhere on the instance) — TODO confirm.
        if fuzz.token_sort_ratio(title, i[self.title]) == 100:
            movieId = str(i['id'])
            found[movieId] = {'title' : i[self.title], 'date' : i[self.date]}
        elif fuzz.token_sort_ratio(title, i[self.title]) > 85 and fuzz.partial_ratio(title, i[self.title]) > 90:
            altId = str(i['id'])
            alt[altId] = {'title' : i[self.title], 'date' : i[self.date]}
    if len(found) == 1:
        return movieId
    elif len(found) > 1:
        # More than one exact match: ask the user to pick.
        print "DUPLICATES FOUND, ENTER THE ID OR -1 TO SKIP"
        movieId = self.movieSelect(found)
    elif len(alt) > 0:
        print "ALTERNATES FOUND, ENTER THE ID OR -1 TO SKIP"
        movieId = self.movieSelect(alt)
    # May still be None if nothing matched or the user skipped.
    return movieId
def get_CUL_score(record_elems, resp_elems):
    """Return the best token_sort_ratio between record and response elements.

    Each argument may be a single string or an iterable of strings; every
    record/response pair is scored and the maximum is returned as a string
    (the historical return type), or None when either input is None or no
    pair could be scored.

    Bug fixed: scores were stringified *before* max(), so the comparison was
    lexicographic ("99" > "100"). Scores are now compared numerically and
    only the final result is stringified.
    """
    if record_elems is None or resp_elems is None:
        return None
    # Normalize both sides to lists of strings.
    record_list = [record_elems] if isinstance(record_elems, str) else list(record_elems)
    resp_list = [resp_elems] if isinstance(resp_elems, str) else list(resp_elems)
    scores = [
        fuzz.token_sort_ratio(rec, resp)
        for rec in record_list
        for resp in resp_list
    ]
    if not scores:
        return None
    return str(max(scores))
def compare_strings(string_one, string_two):
    """Return the highest of three fuzzywuzzy similarity scores (0-100).

    Compares the two strings with ratio, token_sort_ratio and
    token_set_ratio and returns the maximum. Previously each score was
    computed twice (once in the comparison, once in the assignment);
    each is now computed once.
    """
    return max(
        fuzz.ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
def compare_to_queue(queue, head, ratio, arguments):
    """Compare current title to all in queue.

    Returns the token_sort_ratio of the first queued item that exceeds
    `ratio`, or 0 if none does. Optionally logs the match when
    arguments.verbose is set.

    Previously the same token_sort_ratio was recomputed up to three times
    per matching item; it is now computed once per item.
    """
    for item in queue:
        score = fuzz.token_sort_ratio(item, head.title)
        if score > ratio:
            if arguments.verbose:
                print_time_message(arguments, "### Old title: " + item)
                print_time_message(arguments, "### New: " + head.feed_title + ": " + head.title)
                print_time_message(arguments, "### Ratio:" + str(score))
            return score
    return 0
def compare_two_texts(self, string_a, string_b, normalize_value=True):
    """
    Compare two string and return the value of Token Sort Ratio algorithm
    the value is normalized between 0 and 1 values.
    """
    # Both inputs must share a text type (both unicode or both str) —
    # anything else is rejected (this is Python 2 code: `unicode` exists).
    both_unicode = isinstance(string_a, unicode) and isinstance(string_b, unicode)
    both_str = isinstance(string_a, str) and isinstance(string_b, str)
    if not (both_unicode or both_str):
        raise TypeError
    score = fuzz.token_sort_ratio(string_a, string_b)
    if normalize_value:
        return self.__normalized_value(score)
    return score
def process_group(
        data, group, toc, toc_table, page_num, section, sectionid, html):
    """Retrieves a group from the full data, and creates toc stuff

    Args:
        data (List): Full set of data containing all hosts
        group (String): String representing group to process
        toc (String): HTML for Table of Contents
        toc_table (String): HTML for Table in ToC
        page_num (int): Page number we're on in the report
        section (String): Display name of the group
        sectionid (String): Unique ID for ToC navigation
        html (String): HTML for current page of report

    Returns:
        List: Elements for category sorted and grouped
        String: HTML representing ToC
        String: HTML representing ToC Table
        String: HTML representing current report page
    """
    # NOTE: Python 2 code — `lambda (k): ...` tuple-parameter syntax below.
    group_data = sorted([x for x in data if x.category == group],
                        key=lambda (k): k.page_title)
    grouped_elements = []
    if len(group_data) == 0:
        # Nothing in this category: return inputs untouched.
        return grouped_elements, toc, toc_table, html
    if page_num == 0:
        toc += ("<li><a href=\"report.html#{0}\">{1} (Page 1)</a></li>").format(
            sectionid, section)
    else:
        toc += ("<li><a href=\"report_page{0}.html#{1}\">{2} (Page {0})</a></li>").format(
            str(page_num+1), sectionid, section)
    html += "<h2 id=\"{0}\">{1}</h2>".format(sectionid, section)
    # 'Unknown' titles are appended at the end rather than fuzzy-clustered.
    unknowns = [x for x in group_data if x.page_title == 'Unknown']
    group_data = [x for x in group_data if x.page_title != 'Unknown']
    # Greedy clustering: take the first element, pull in everything whose
    # title is >= 70 similar, and repeat on the remainder.
    while len(group_data) > 0:
        test_element = group_data.pop(0)
        temp = [x for x in group_data if fuzz.token_sort_ratio(
            test_element.page_title, x.page_title) >= 70]
        temp.append(test_element)
        temp = sorted(temp, key=lambda (k): k.page_title)
        grouped_elements.extend(temp)
        group_data = [x for x in group_data if fuzz.token_sort_ratio(
            test_element.page_title, x.page_title) < 70]
    grouped_elements.extend(unknowns)
    toc_table += ("<tr><td>{0}</td><td>{1}</td>").format(section,
                                                         str(len(grouped_elements)))
    return grouped_elements, toc, toc_table, html
def response_correct(response, answer):
    """Return True when the response is a close-enough fuzzy match (>70)
    to the answer, after word filtering, bracket stripping, and
    whitespace-insensitive comparison."""
    resp = filter_words(response)
    ans = filter_words(answer)
    ans_no_brackets = strip_brackets(ans)
    compact_resp = resp.replace(" ", "")
    compact_ans = ans.replace(" ", "")
    compact_no_brackets = ans_no_brackets.replace(" ", "")
    candidates = (
        fuzz.token_sort_ratio(resp, ans),
        fuzz.token_sort_ratio(resp, ans_no_brackets),
        fuzz.ratio(compact_resp, compact_ans),
        fuzz.ratio(compact_resp, compact_no_brackets),
    )
    return max(candidates) > 70
def search_OCLC(self):
    # Query the WorldCat OpenSearch API for self.query_type (here used as
    # the title, per the note below) and, when both title and author score
    # above self.th, record the OCLC id and work id in self.scores.
    # since this API does not need a query_type, this variable was used to pass the title to the object
    # dictionary for storing scores for this API
    self.scores = {}
    # import your OCLC developer key
    wskey = keys['OCLC-wskey'][0]
    # The API call url
    OCLC = "http://www.worldcat.org/webservices/catalog/search/worldcat/opensearch?q=" + self.query_type + "&wskey=%s" %(wskey)
    try:
        OCLC_result = requests.get(OCLC).text
        # having issues reading the response object. Work around: Write to a
        # file and then read -- the file will be deleted at the end of the process
        with open("temp-file.xml", "w") as file:
            file.write(OCLC_result)
            file.close()
        # NOTE(review): `file` shadows the builtin and is rebound to the
        # parsed tree below.
        file = ETree.parse("temp-file.xml")
        root = file.getroot()
        # iterate over the root element and get "title", "author" (name), and "id" (worldcat ID) for each entry
        for i in root.iter('{http://www.w3.org/2005/Atom}entry'):
            author = i.find('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name').text
            title = i.find('{http://www.w3.org/2005/Atom}title').text
            id = i.find('{http://www.w3.org/2005/Atom}id').text
            # if title was a match (>%95) and author was a match also, then start the process for getting the work_id
            scoreTitle = fuzz.token_sort_ratio(title, self.query_type)
            if scoreTitle > self.th:
                scoreOCLC = fuzz.token_sort_ratio(author, self.name)
                if scoreOCLC > self.th:
                    # use this score as the average score
                    score = (scoreTitle + scoreOCLC)/2
                    work_id = ''
                    # get the worldcat ID
                    wid = id.replace('http://worldcat.org/oclc/', '')
                    # store the worldcat ID in the dict -- this ID is not used for enrichment at this point
                    self.scores['OCLC'] = {}
                    self.scores['OCLC']['oclcid'] = {}
                    self.scores['OCLC']['oclcid'][wid] = [self.query_type, scoreTitle]
                    # create the query url and send it to worldcat to get back the JSON-LD
                    workid = 'http://experiment.worldcat.org/oclc/' + wid + '.jsonld'
                    # decode the JSON
                    OCLC_res = requests.get(workid).json()
                    # iterate over the JSON graph and find work_id
                    # NOTE(review): `i` here shadows the outer entry loop var.
                    for i in OCLC_res['@graph']:
                        if 'exampleOfWork' in i.keys():
                            work_id = i['exampleOfWork']
                    if work_id != '':
                        self.scores['OCLC']['work_id'] = {}
                        self.scores['OCLC']['work_id'][work_id] = [self.query_type, score]
    except:
        # NOTE(review): bare except hides all failures (network, parse, key
        # errors); PrintException presumably logs them — confirm.
        PrintException(self.log_file, self.name)
    # Returns the dict only when something was found; otherwise returns None.
    if len(self.scores) > 0:
        return (self.scores)
def fw_token_sort_ratio(question1, question2):
    """Build a (n, 1) feature array of token_sort_ratio scores (scaled to
    0..1) for each aligned question pair."""
    scores = [
        [fuzz.token_sort_ratio(str(left), str(right)) / 100]
        for left, right in zip(question1, question2)
    ]
    print("Created fuzz token_sort_ratio feature")
    return np.array(scores)
def fuzzyMatcher(self):
    """ Fuzzy matching logic, returns two files with results """
    # Cross-compare every selection row against every comparison row,
    # scoring with token_sort_ratio. Scores >= self.ratio go to
    # match_exact; scores between self.min_ratio and self.ratio go to
    # match_fuzzy. Results are written out via self.csv_writer().
    selectSize = len(self.selection)
    for s in self.selection:
        #incCounter()
        sRow, sList, sCode = s
        for c in self.comparison:
            #incCounter()
            cRow, cList, cCode = c
            scoreValue = fuzz.token_sort_ratio(sList, cList)
            dataSet = [sRow, sList, sCode, cRow, cList, cCode, scoreValue]
            if scoreValue >= self.ratio:
                #print('Hit: Select row %s on Compare row %s with score of %s' %(sRow, cRow, scoreValue))
                self.match_exact.append(dataSet)
            if scoreValue < self.ratio and scoreValue > self.min_ratio:
                #print('Fuzzy: Select row %s on Compare row %s with score of %s' %(sRow, cRow, scoreValue))
                self.match_fuzzy.append(dataSet)
        """ Don't use this unless you want a result set equal to selection * comparison!!! """
        #if scoreValue < self.min_ratio:
            #print('No Match: Select row %s on Compare row %s with score of %s' %(sRow, cRow, scoreValue))
            # self.match_none.append(dataSet)
        # Progress indicator — NOTE(review): assumes sRow is the 1-based
        # numeric row index; under Python 2 integer division this truncates.
        status = round( ((sRow / selectSize) * 100), 0)
        print('Row %s of %s - Percentage complete - %s' %(sRow, selectSize, status) + '%')
    self.csv_writer()
    return self.match_exact, self.match_fuzzy ##, self.match_none
def sentenceCorrector(sentence):
    '''Fucntion to correct the english text using fuzzy logic
       Return Value = String (Corrected sentence)

       Splits the input on commas, keeps tokens that pass wordCheck(), and
       replaces failing tokens with the dictionary suggestion that has the
       highest token_sort_ratio (first suggestion wins ties, matching the
       previous strict-greater-than scan).
    '''
    sentence = "".join(sentence)
    words = sentence.split(",")
    for i in range(len(words)):
        # Cleaned up the `if check == True: pass / else:` anti-pattern.
        if wordCheck(words[i]):
            continue
        suggestions = myDict.suggest(words[i])
        # NOTE: despite the old variable name "tokenSetRatioArray", this has
        # always scored with token_sort_ratio.
        ratios = [fuzz.token_sort_ratio(words[i], s) for s in suggestions]
        if ratios:
            # index() returns the first occurrence of the maximum, matching
            # the original first-max selection.
            words[i] = suggestions[ratios.index(max(ratios))]
    return " ".join(words)
def fuzzer(localstring, dbpstring):
    """Score two strings with token_sort_ratio after dropping hyphens and
    English stopwords. Note the comparison is done on the str() repr of the
    two filtered word lists, preserving the original behaviour."""
    # NOTE(review): replace(',.', '') only removes the literal two-char
    # sequence ",." — presumably commas/periods were intended; confirm.
    local_words = localstring.replace('-', '').replace(',.', '').split()
    local_filtered = [w for w in local_words if not w in stopwords.words('english')]
    dbp_words = dbpstring.replace('-', '').split()
    dbp_filtered = [w for w in dbp_words if not w in stopwords.words('english')]
    return fuzz.token_sort_ratio(str(local_filtered), str(dbp_filtered))
def getRatio(var1, var2, alg):
    """Binary similarity verdict: 100 when both fuzz.ratio and
    fuzz.token_set_ratio clear their thresholds, else 0.

    `alg` is kept as a dummy parameter for interface compatibility.

    Cleanup: the unused r2test/r3test thresholds and the unused
    partial_ratio/token_sort_ratio computations were removed — only r1
    (ratio) and r4 (token_set_ratio) ever influenced the result.
    """
    r1test = 40
    r4test = 90  # 85 is probably too low --- too many FP
    # it seems that the quality of results can be improved if two (or)
    # -- more results are correlated: [1] can be lowered as long as [4] remains high
    r1 = fuzz.ratio(var1, var2)
    r4 = fuzz.token_set_ratio(var1, var2)
    if r1 >= r1test and r4 >= r4test:
        #reportRatio(var1, var2)
        return 100
    return 0
def fuzz_comparisons(x):
    """Return the four fuzzywuzzy scores for the string pair *x* as a
    pandas Series (keys preserved in the original insertion order)."""
    scores = {
        'fuzz_partial_ratio': fuzz.partial_ratio(*x),
        'fuzz_ratio': fuzz.ratio(*x),
        'fuzz_token_sort_ratio': fuzz.token_sort_ratio(*x),
        'fuzz_token_set_ratio': fuzz.token_set_ratio(*x),
    }
    return pd.Series(scores)
def compare_to(self, name):
    """
    Compares the name object given to itself.
    :param name: The name object to be compared.
    :return: The ratio between them from 0-100.
    """
    other_full = name.get_full_name(0)
    own_full = self.get_full_name(0)
    return fuzz.token_sort_ratio(other_full, own_full)
def fuzzy_token_sort_ratio_check(full_name_check_value, name_one, name_two):
    """
    Runs a fuzzy token sort ratio check if the record hasn't passed either
    a full name or name with initial check; otherwise scores 0.
    """
    # Guard clause: a non-zero check value means an earlier check already
    # passed, so no fuzzy comparison is needed.
    if full_name_check_value != 0:
        return 0
    return fuzz.token_sort_ratio(name_one, name_two)
def check_translation(self, translation):
    """
    Takes a `translation` and returns `True` if it's correct
    (token_sort_ratio >= 90 against any stored translation).
    """
    for candidate in self.translations:
        if fuzz.token_sort_ratio(translation, candidate.translation) >= 90:
            return True
    return False
def GetMatchingFunds(in_fund, df):
    # Find candidate fund names in df['company_name'] for the user's query.
    # Strategy: exact substring containment first, then fuzzy matching with
    # progressively looser thresholds. Returns (is2fuzzy, funds) where
    # is2fuzzy == 1 signals the loosest (least trustworthy) match tier was used.
    is2fuzzy = 0
    funds = []
    # Single- or many-word query: plain case-insensitive containment.
    if len(in_fund.split()) != 2:
        funds = df[df['company_name'].apply(
            lambda x: (in_fund.lower() in x.lower())
        )]['company_name'].tolist()
    # Exactly two words: both words must appear (note in_fund is rebound to
    # the split list here).
    if funds == [] and len(in_fund.split()) == 2:
        in_fund = in_fund.split()
        funds = df[df['company_name'].apply(
            lambda x: (in_fund[0].lower() in x.lower() and in_fund[1].lower() in x.lower())
        )]['company_name'].tolist()
    # A small containment result is good enough — return it as non-fuzzy.
    if funds and len(funds) < 6:
        return is2fuzzy, funds
    from fuzzywuzzy import fuzz
    from fuzzywuzzy import process
    # First fuzzy tier: moderately strict thresholds.
    # NOTE(review): if the two-word branch ran, in_fund is a list here and
    # fuzzywuzzy will coerce it — confirm intended.
    funds = df[df['company_name'].apply(
        lambda x: (
            fuzz.token_sort_ratio(x, in_fund) > 50
            and fuzz.token_set_ratio(x, in_fund) > 65
        )
    )]['company_name'].tolist()
    # Nothing (or too much): fall back to the loosest tier and flag it.
    if not funds or len(funds) >= 10:
        is2fuzzy = 1
        funds = df[df['company_name'].apply(
            lambda x: (
                fuzz.token_sort_ratio(x, in_fund) > 40
                and fuzz.token_set_ratio(x, in_fund) > 49
            )
        )]['company_name'].tolist()
    return is2fuzzy, funds
def compare_hunks(left, right):
    # This case happens for example, if both hunks remove empty newlines
    # This check is _required_ as fuzzywuzzy currently contains a bug that
    # does misevaluations in case of equivalence. See
    # https://github.com/seatgeek/fuzzywuzzy/issues/196
    return 100 if left == right else fuzz.token_sort_ratio(left, right)
def getShowTime(showName):
    """Print schedule/channel details for every known show whose name
    perfectly matches `showName` under any of three fuzzy metrics.

    Bug fixed: the comparisons previously used an undefined/global name
    `what` instead of the `showName` parameter, so the argument was ignored.
    """
    for item in showTimeDictRev.keys():
        if (fuzz.partial_ratio(showName, item) == 100
                or fuzz.token_sort_ratio(showName, item) == 100
                or fuzz.token_set_ratio(showName, item) == 100):
            print (showTimeDictRev[item])
            print (channelsDictSky[item])
            print (channelsDictSkyRev[(channelsDictSky[item])])
            print (allChannelsRev[channelsDictSky[item]])
def is_same_entity(bentley_term, lc_term, type_):
    """Decide whether a Bentley term and an LC term name the same entity.

    Uses a type-specific similarity strategy:
      - geogname: token_sort_ratio > 95
      - corpname: abbreviation-normalized ratio/WRatio >= 90
      - persname: token_sort_ratio with a bias from matching life dates, >= 95

    Bug fixed: previously an unrecognized `type_` fell off the end and
    returned None implicitly; it now returns False explicitly (both are
    falsy, so callers are unaffected).
    """
    if "geogname" in type_:
        similarity = fuzz.token_sort_ratio(bentley_term, lc_term)
        return similarity > 95
    elif "corpname" in type_:
        # Normalize common abbreviations so they don't dominate the score.
        bentley_term = bentley_term.replace("U.S.", "United States")
        lc_term = lc_term.replace("U.S.", "United States")
        bentley_term = bentley_term.replace("N.Y.", "New York")
        lc_term = lc_term.replace("N.Y.", "New York")
        # Remaining interior periods suggest further abbreviations: use the
        # stricter plain ratio; otherwise the more lenient WRatio.
        if "." in bentley_term.strip("."):
            similarity = fuzz.ratio(bentley_term, lc_term)
        else:
            similarity = fuzz.WRatio(bentley_term, lc_term)
        # print("{0}: {1} <--> {2}".format(similarity, original_term, returned_term))
        return similarity >= 90
    elif "persname" in type_:
        # Life dates ("1832-1901" / "1832-") adjust the score: conflicting
        # birth years are heavily penalized, agreeing dates rewarded.
        bias = 0
        date_regex = r"(\d{4})\-((?:\d{4})?)"
        bentley_dates = re.findall(date_regex, bentley_term)
        lc_dates = re.findall(date_regex, lc_term)
        if len(bentley_dates) > 0 and len(lc_dates) > 0:
            birthdate_bentley, deathdate_bentley = bentley_dates[0]
            birthdate_lc, deathdate_lc = lc_dates[0]
            if birthdate_bentley != birthdate_lc:
                bias -= 100
            if birthdate_bentley == birthdate_lc and deathdate_bentley == deathdate_lc:
                bias += 100
            if birthdate_bentley == birthdate_lc and deathdate_lc and not deathdate_bentley:
                # LC knows a death date the Bentley record lacks: ignore it
                # for scoring but give partial credit.
                lc_term = lc_term.replace(deathdate_lc, "")
                bias += 25
        similarity = fuzz.token_sort_ratio(bentley_term, lc_term) + bias
        # print("{0}: {1} <--> {2}".format(similarity, bentley_term, lc_term))
        return similarity >= 95
    # Unrecognized type: explicitly not a match.
    return False
def preevaluate_filenames(thresholds, right_files, left_file):
    # We won't enter preevaluate_filenames, if tf >= 1.0
    # Keep only right-hand files whose name similarity (scaled to 0..1)
    # reaches the filename threshold.
    candidates = [
        right_file
        for right_file in right_files
        if fuzz.token_sort_ratio(left_file, right_file) / 100 >= thresholds.filename
    ]
    return left_file, candidates
def similarity(s, t, method):
    """Score two strings with the fuzzywuzzy scorer named by `method`
    ("partial", "token_sort", "token_set"); anything else uses plain ratio."""
    if method == "partial":
        return fuzz.partial_ratio(s, t)
    if method == "token_sort":
        return fuzz.token_sort_ratio(s, t)
    if method == "token_set":
        return fuzz.token_set_ratio(s, t)
    return fuzz.ratio(s, t)
def fuzzy_search(song):
    """Return the permalink URL of the track whose title best matches
    `song` (empty string when the search yields no tracks)."""
    tracks = client.get('tracks', q=song)
    best_url = ''
    best_score = 0
    for track in tracks:
        score = fuzz.token_sort_ratio(track.title, song)
        if score > best_score:
            best_score = score
            best_url = track.permalink_url
    return best_url
def process_cv(extracted_resumes, key_multipler, job_description):
    # Score each resume against the job description. Python 2 code
    # (dict.has_key, basestring). Each key in key_multipler is a section
    # weight; title and skill sections get dedicated handling, everything
    # else goes through the generic list/string comparison. Returns
    # [{'Name', 'Score'}] sorted by descending score.
    result_list = []
    for resume in extracted_resumes:
        key_checked = []
        title_count, skill_count, generic_count = 0,0,0
        for multipler in key_multipler.keys():
            # Matching job title
            if resume.has_key(header_title) and multipler == header_title:
                # matching first level title
                if fuzz.partial_ratio(job_description[header_title], resume[header_title]) > 80:
                    title_count += key_multipler[multipler]
                # recurse in experience
                if resume.has_key(header_experience):
                    for experience in resume[header_experience]:
                        if experience.has_key(header_title):
                            if fuzz.partial_ratio(job_description[header_title], experience[header_title]) > 80:
                                title_count += key_multipler[multipler]
            # Matching skills
            elif resume.has_key(header_skill) and multipler == header_skill:
                skill_count += recurse_obj(resume[multipler], job_description[multipler], multipler) * key_multipler[multipler]
            # Generic sections: compare lists of strings recursively, plain
            # strings with token_sort_ratio (full credit > 90, half > 60).
            elif resume.has_key(multipler) and multipler not in key_checked:
                key_checked.append(multipler)
                if isinstance(resume[multipler], list) and isinstance(resume[multipler][0], basestring):
                    generic_count += recurse_obj(resume[multipler], job_description[multipler], multipler) * key_multipler[multipler]
                elif isinstance(resume[multipler], basestring):
                    if fuzz.token_sort_ratio(resume[multipler], job_description[multipler]) > 90:
                        generic_count += key_multipler[multipler]
                    elif fuzz.token_sort_ratio(resume[multipler], job_description[multipler]) > 60:
                        generic_count += key_multipler[multipler] * 0.5
        score = title_count+skill_count+generic_count
        result_list.append({'Name': resume['Name'], 'Score': round(score, 2)})
    # Sort by score
    result_list = (sorted(result_list, key=lambda t: t.get('Score', 0), reverse=True))
    return result_list
def UpdateCursor3(infile,test):
    # Update the spatial join shapefile for the different match cases.
    # For rows matched on both sides (fields 8 and 9 != -1) compute area and
    # length ratios, and — only for the DissolveUnion layer — a battery of
    # string-similarity metrics between the two name fields (15 and 16):
    # hand-rolled Levenshtein, difflib ratio, and four fuzzywuzzy scores.
    with arcpy.da.UpdateCursor(infile,test) as cursor: #Update cursor.
        for row in cursor:
            #row[2]=row[0].area
            #row[3]=row[0].length
            if row[8]!=-1:
                if row[9] !=-1:
                    row[10]=1
                    denominatorArea=(row[4]+row[5])/2
                    #print denominatorArea
                    row[6]=row[2]/((row[4]+row[5])/float(2))
                    row[7]=row[2]/((row[5]+row[4])/float(2))
                    # NOTE(review): rows 13 and 14 (and 6/7) compute the same
                    # value — presumably one of each pair was meant to use a
                    # different denominator; confirm.
                    row[13]=row[3]/((row[11]+row[12])/float(2))
                    row[14]=row[3]/((row[11]+row[12])/float(2))
                    #print("{0}, {1}, {2},{3}".format(row[6], row[7], row[13],row[14]))
                    if infile==DissolveUnion:
                        # NOTE(review): this "swap" is a no-op — it was
                        # presumably meant to be row[15],row[16]=row[16],row[15]
                        # so the shorter string comes first; confirm.
                        if len(row[15]) > len(row[16]):
                            row[15],row[16]=row[15],row[16]
                        # Classic dynamic-programming Levenshtein distance
                        # between the two name strings.
                        distances = range(len(row[15]) + 1)
                        for index2,char2 in enumerate(row[16]):
                            newDistances= [index2+1]
                            for index1,char1 in enumerate(row[15]):
                                if char1 == char2:
                                    newDistances.append(distances[index1])
                                else:
                                    newDistances.append(1 + min((distances[index1],
                                                                 distances[index1+1],
                                                                 newDistances[-1])))
                            distances = newDistances
                        #print distances
                        Ldistance=distances[-1]
                        s= SequenceMatcher(None,row[15],row[16])
                        stringRatio=s.ratio()
                        #print stringRatio
                        row[17]=Ldistance
                        row[18]=stringRatio
                        # fuzzywuzzy scores, normalized from 0-100 to 0-1.
                        StringRat=fuzz.ratio(row[15], row[16])
                        row[19]=StringRat/float(100)
                        stringPartialRatio=fuzz.partial_ratio(row[15], row[16])
                        row[20]=stringPartialRatio/float(100)
                        StringTokenSort=fuzz.token_sort_ratio(row[15], row[16])
                        row[21]=StringTokenSort/float(100)
                        stringTokenSet=fuzz.token_set_ratio(row[15], row[16])
                        row[22]=stringTokenSet/float(100)
                        Average=(StringRat+stringPartialRatio+StringTokenSort+stringTokenSet)/4
                        row[23]=Average/float(100)
                        # NOTE(review): weights sum to 1.0 but the result is
                        # divided by 4 again — confirm intended scale.
                        WeightedAverage=((.10*StringRat)+(.30*stringPartialRatio)+(.30*StringTokenSort)+(.30*stringTokenSet))/4
                        row[24]= WeightedAverage/float(100)
                        #print Average,WeightedAverage
                        #print ("{0},{1},{2}".format(row[15],row[16],row[18]))
            cursor.updateRow(row)
def test_service_metadata(self):
    # Smoke-test the Refine reconciliation metadata endpoint: the JSONP
    # response must be token-identical (token_sort_ratio == 100, i.e. same
    # tokens in any order) to the expected service descriptor.
    self.maxDiff = None
    response = self.client.get('/api/1.0/refine/reconcile', {'callback': 'jsonp123'})
    self.assertEqual(200, response.status_code)
    self.assertEqual(100, fuzz.token_sort_ratio(
        'jsonp123({"name": "Influence Explorer Reconciliation3", "identifierSpace": "http://staging.influenceexplorer.com/ns/entities", "schemaspace": "http://staging.influenceexplorer.com/ns/entity.object.id", "view": { "url": "http://staging.influenceexplorer.com/entity/{{id}}" }, "preview": { "url": "http://staging.influenceexplorer.com/entity/{{id}}", "width": 430, "height": 300 }, "defaultTypes": []})',
        response.content
    )
    )
def match(song, gdic):
    # Match a local audio file against a library dict list (gdic) by
    # comparing tags. Returns the single matching dict, False when no
    # unambiguous match is found, the string "False" for unsupported
    # formats (wma/ogg), and "delete" for unreadable files.
    ftype = song[song.rfind('.'):].lower()
    try:
        if ftype == ".mp3":
            smp = MP3(song)
        elif ftype == ".wma":
            print("wma")
            return "False"
        elif ftype == ".flac":
            smp = FLAC(song)
        elif ftype == ".ogg":
            print("ogg")
            return "False"
        elif ftype in (".mp4", ".m4a"):
            smp = MP4(song)
        else:
            return False
    except IOError:
        return "delete"
    # Pull title/artist/album out of the format-specific tag layout.
    if ftype == ".flac":
        name = smp['title'][0]
        artist = smp['artist'][0]
        album = smp['album'][0]
    elif ftype == ".m4a":
        name = smp['\xa9nam'][0]
        artist = smp['\xa9ART'][0]
        album = smp['\xa9alb'][0]
    else:
        try:
            # pprint() output starts with the frame id + '=', hence [5:].
            name = smp["TIT2"].pprint()[5:].replace('[','(').replace(']',')')
            artist = smp["TPE1"].pprint()[5:].replace("Feat", "Featuring").replace("Andre 3000", "OutKast").replace("Big Boi", "OutKast")
            album = smp["TALB"].pprint()[5:]
        except KeyError:
            return False
    # Narrow candidates step by step: title, then artist, then album —
    # returning as soon as exactly one candidate remains.
    pmatch = [i for i in gdic if fuzz.token_set_ratio(name, i['title']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    pmatch = [i for i in pmatch if fuzz.token_set_ratio(artist, i['artist']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    pmatch = [i for i in pmatch if fuzz.token_set_ratio(album, i['album']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    #pmatch = [i for i in pmatch if ((('(' not in name) and ('(' not in i['title'])) or ((('(' in name) and ('(' in i['title'])) and (name[name.rindex("(") + 1:name.rindex(")")].lower() == i['title'][i['title'].rindex("(") + 1:i['title'].rindex(")")].lower())))]
    # NOTE(review): this restarts from the full gdic with token_sort_ratio,
    # discarding the artist/album narrowing above — presumably a deliberate
    # fallback, but confirm.
    pmatch = [i for i in gdic if fuzz.token_sort_ratio(name, i['title']) > 90]
    if len(pmatch) == 1:
        return pmatch[0]
    #print ([(i['title'], i['artist'], i['album'], i['durationMillis']) for i in pmatch])
    # Final tie-break: track duration within one second.
    pmatch = [i for i in pmatch if abs(smp.info.length * 1000 - int(i['durationMillis'].encode('utf-8'))) < 1000]
    if len(pmatch) == 1:
        return pmatch[0]
    else:
        #print(name, artist, album, smp.info.length * 1000)
        return False
def fuzz_method(linkedin_words, twitter_words):
    """Accumulate a weighted similarity counter across every pairing of a
    Twitter word with a LinkedIn word.

    Scores in (70, 80] add 215, (60, 70] add 125, (50, 60] add 80 and
    (40, 50] add 30 — the bands are disjoint, exactly as before.

    Fix: fuzz.token_sort_ratio was recomputed up to nine times per pair;
    it is now computed once.
    """
    counter = 0
    temp_arr = []
    for t in twitter_words:
        for l in linkedin_words:
            score = fuzz.token_sort_ratio(t, l)
            temp_arr.append(score)
            if 70 < score <= 80:
                counter += 215
            elif 60 < score <= 70:
                counter += 125
            elif 50 < score <= 60:
                counter += 80
            elif 40 < score <= 50:
                counter += 30
    return counter
def token_sort_ratio(plans: list, post, threshold=50):
    """
    Match plans based on hardcoded plan topics, using fuzzywuzzy's
    token_sort_ratio for fuzzy matching. Returns the best-scoring plan,
    with "match" populated only when its confidence beats the threshold.
    """
    best_score = 0
    best_plan = None
    for candidate in plans:
        score = fuzz.token_sort_ratio(
            post.text.lower(), candidate["topic"].lower()
        )
        if score > best_score:
            best_score = score
            best_plan = candidate
    return {
        "match": best_plan["id"] if best_score > threshold else None,
        "confidence": best_score,
        "plan": best_plan,
    }
async def answer(message: Message):
    # Quiz answer handler: strips the command prefix from the message,
    # compares it against the stored correct answer, awards a point on a
    # fuzzy match (>= 80 on either ratio), and notifies the user. The
    # incoming command message is always deleted from the chat.
    user_answer = message.text[1:]  # drop the leading command character
    answer = await db.check_answer()
    user_id = message.from_user.id
    chat_id = message.chat.id
    text1 = "Вы уже отвечали правильно."
    text2 = f" Правильный ответ: {answer}.\nВаш баланс не изменился."
    text3 = "Неверно"
    # Whether this user already answered correctly — TODO confirm
    # check_points(False) semantics against the db module.
    is_answered = await db.check_points(False)
    await bot.delete_message(chat_id, message.message_id)
    # Prefer the @username; fall back to the display name.
    username = message.from_user.username
    full_name = message.from_user.full_name
    name = username
    if username is None:
        name = full_name
    if is_answered:
        # Already answered: remind privately, no balance change.
        await bot.send_message(user_id, text1 + text2)
    elif (fuzz.ratio(user_answer.lower(), answer) >= 80
            or fuzz.token_sort_ratio(user_answer.lower(), answer) >= 80):
        await db.add_points(1)
        await bot.send_message(chat_id, f"@{name}, вы ответили верно")
    else:
        # Wrong answer: tell the user privately.
        await bot.send_message(user_id, text3)
def extract_xslx_data(active_cells):
    """Walk worksheet cells, copying revenue/check numbers into the global
    `apteki` dict (matched by column-less cell coordinate) and collecting
    date-looking strings into the global `the_date` list.

    Bug fixed: the date-shape test read `if cell.value[2] and
    cell.value[5] == '.'` — operator precedence made it only truth-test
    the third character. It now checks both positions against '.'.
    """
    for row in active_cells:
        for cell in row:
            #### Searching for Revenue(int or float) and Checks number(int or float)
            if isinstance(cell.value, int) or isinstance(cell.value, float):
                for apteka in apteki.keys():
                    # Compare coordinates with the column letter stripped ([1:]).
                    if 'revenue_coordinate' in apteki[apteka]:
                        if cell.coordinate[1:] == apteki[apteka][
                                'revenue_coordinate'][1:]:
                            apteki[apteka]['revenue'] = cell.value
                    if 'checks_coordinate' in apteki[apteka]:
                        if cell.coordinate[1:] == apteki[apteka][
                                'checks_coordinate'][1:]:
                            apteki[apteka]['number_of_checks'] = cell.value
            ##### Searching for date str
            if isinstance(cell.value, str):
                if fuzz.token_sort_ratio('00.00.2000', cell.value) >= 45:
                    try:
                        # dd.mm.yyyy shape: dots at positions 2 and 5.
                        if cell.value[2] == '.' and cell.value[5] == '.':
                            the_date.append(cell.value)
                    except IndexError:
                        # String too short to be a date — skip it.
                        pass
def get_playlist(album):
    # Search YouTube for playlists matching "<artist> <album>", link album
    # tracks to playlist videos by track number, and fall back to a
    # per-song keyword search for any track still missing a video.
    album_string = album.artist + " " + album.name
    playlist_search_request = requests.get(
        "https://www.googleapis.com/youtube/v3/search?part=snippet&type=playlist&q="
        + album_string + "&key=" + youtube_key)
    playlist_search_dict = json.loads(playlist_search_request.text)
    playlists_list = []
    # Keep only playlists whose title fuzzily matches the album string.
    for item in playlist_search_dict['items']:
        id = item['id']['playlistId']
        title = item['snippet']['title']
        if fuzz.token_sort_ratio(album_string, title) > 80:
            playlists_list.append([id, title])
    skip_playlist = False
    try:
        # Use the first (best) accepted playlist; IndexError means none matched.
        playlist_vids_list = list_playlist_vids(playlists_list[0][0])
    except IndexError:
        skip_playlist = True
    album_tracklist = Song.objects.filter(album=album)
    if not skip_playlist:
        # Pair playlist videos with songs by track number; vid[0] is
        # presumably the position in the playlist — TODO confirm.
        for vid in playlist_vids_list:
            vid_num = vid[0]
            for song in album_tracklist:
                track_num = song.track_num
                if vid_num == track_num:
                    if compare_song_vid(song, vid):
                        song.save()
    # Any song still without a link gets an individual keyword search.
    still_no_vid = album_tracklist.filter(youtube_link=0)
    for song in still_no_vid:
        song.youtube_link = keyword_search(song)
        song.save()
    return playlists_list
def wightedAverage(anime1, anime2):
    """Weighted average similarity between two anime records.

    Names are weighted 5 (or scored a flat 100 when one name contains the
    other), genre 5, media 1, episodes 1, rating 3, views 3; the sum is
    divided by the total weight (18).

    Bug fixed: the episodes line used `==` (a discarded comparison) instead
    of `+=`, so episode similarity never contributed to the score.
    """
    totWeight = (5 + 5 + 1 + 1 + 3 + 3)
    acmSim = 0
    if (re.search(anime1.name, anime2.name, re.IGNORECASE)
            or re.search(anime2.name, anime1.name, re.IGNORECASE)):
        # One title contains the other: treat the name component as a
        # flat 100 instead of a weighted fuzzy score.
        acmSim = 100
    else:
        acmSim += fuzz.token_sort_ratio(anime1.name, anime2.name) * 5
    acmSim += fuzz.token_sort_ratio(anime1.genre, anime2.genre) * 5
    acmSim += fuzz.token_sort_ratio(anime1.media, anime2.media) * 1
    acmSim += fuzz.token_sort_ratio(anime1.episodes, anime2.episodes) * 1
    acmSim += fuzz.token_sort_ratio(anime1.rating, anime2.rating) * 3
    acmSim += fuzz.token_sort_ratio(anime1.views, anime2.views) * 3
    return (acmSim / totWeight * 1.0)
def search(query, type):
    """
    Do a fuzzy match on Journals in MongoDB journals collection, returning
    results in Refine reconciliation API format. The type parameter
    determines if the match is on main_title or abbreviation
    For now, only exact matches are automatically matched, but this can
    be adjusted.
    """
    query = query.lower()
    results = []
    for item in journal_data:
        id_journal = item.get("id_journal", "no_id")
        titleOrAbbrev = str(item.get(type, "nothing_found"))
        #Construct a score using FuzzyWuzzy's token set ratio.
        #https://github.com/seatgeek/fuzzywuzzy
        results.append({
            "id": id_journal,
            "name": titleOrAbbrev,
            "score": fuzz.token_sort_ratio(query, titleOrAbbrev),
            # Only an exact (case-insensitive) hit is auto-matched.
            "match": titleOrAbbrev.lower() == query,
            "type": [
                {
                    "id": "http://purl.org/ontology/bibo/Periodical",
                    "name": "bibo:Periodical",
                }
            ]
        })
    # Best ten, highest score first (stable sort preserves input order on ties).
    results.sort(key=itemgetter('score'), reverse=True)
    return results[:10]
def run(query):
    """Generator: for each symbology document matching `query`, look for a
    trading-halt record with the same company name and yield an update
    action annotating the symbology doc with halt match metadata.

    Bug fixed: the gating condition had a stray closing parenthesis
    (`... and x >= 90):`) — a syntax error; the line is now well-formed.
    """
    for a in scan(client, index=config['symbology']['index'], query=query):
        res = client.search(index=config['suspension']['index'], body={
            "_source": ["company", "date", "link"],
            "query": {
                "match": {
                    "company": a['_source']['name']
                }
            }
        })
        if res['hits']['total'] > 0:
            mtc = res['hits']['hits'][0]['_source']
            sym_name = a['_source']['name'].lower()
            halt_name = mtc['company'].lower()
            x = fuzz.token_sort_ratio(sym_name, halt_name)
            y = fuzz.ratio(sym_name, halt_name)
            halts = {"match_attempted": True}
            # Accept only confident matches: ES score >= 1 and a strong
            # token-sort similarity.
            if res['hits']['hits'][0]['_score'] >= 1 and x >= 90:
                halts.update(mtc)
                halts.update({
                    "fuzz_ratio": y,
                    "fuzz_token_sort_ratio": x,
                    "match_score": a['_score']
                })
            yield {
                "_id": a['_id'],
                "_type": config['symbology']['_type'],
                "_index": config['symbology']['index'],
                "_op_type": "update",
                "doc": {
                    "__meta__": {
                        "halts": halts
                    }
                }
            }
def extract_features(dfx):
    # Build NLP similarity features for a question-pair DataFrame in place:
    # preprocessing, token-overlap statistics, and fuzzywuzzy scores.
    # Returns the same DataFrame with the new columns added.
    # preprocessing each question
    dfx["question1"] = dfx["question1"].fillna("").apply(preprocess)
    dfx["question2"] = dfx["question2"].fillna("").apply(preprocess)
    print("token features...")
    # Merging Features with dataset
    token_features = dfx.apply(lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    # Unpack the per-row tuple from get_token_features into columns:
    # common-word/stopword/token counts (min/max), first/last word
    # equality, length difference and mean length.
    dfx["cwc_min"] = list(map(lambda x: x[0], token_features))
    dfx["cwc_max"] = list(map(lambda x: x[1], token_features))
    dfx["csc_min"] = list(map(lambda x: x[2], token_features))
    dfx["csc_max"] = list(map(lambda x: x[3], token_features))
    dfx["ctc_min"] = list(map(lambda x: x[4], token_features))
    dfx["ctc_max"] = list(map(lambda x: x[5], token_features))
    dfx["last_word_eq"] = list(map(lambda x: x[6], token_features))
    dfx["first_word_eq"] = list(map(lambda x: x[7], token_features))
    dfx["abs_len_diff"] = list(map(lambda x: x[8], token_features))
    dfx["mean_len"] = list(map(lambda x: x[9], token_features))
    #Computing Fuzzy Features and Merging with Dataset
    # do read this blog: http://chairnerd.seatgeek.com/fuzzywuzzy-fuzzy-string-matching-in-python/
    # https://stackoverflow.com/questions/31806695/when-to-use-which-fuzz-function-to-compare-2-strings
    # https://github.com/seatgeek/fuzzywuzzy
    print("fuzzy features..")
    dfx["token_set_ratio"] = dfx.apply(lambda x: fuzz.token_set_ratio(x["question1"], x["question2"]), axis=1)
    # The token sort approach involves tokenizing the string in question, sorting the tokens alphabetically, and
    # then joining them back into a string We then compare the transformed strings with a simple ratio().
    dfx["token_sort_ratio"] = dfx.apply(lambda x: fuzz.token_sort_ratio(x["question1"], x["question2"]), axis=1)
    dfx["fuzz_ratio"] = dfx.apply(lambda x: fuzz.QRatio(x["question1"], x["question2"]), axis=1)
    dfx["fuzz_partial_ratio"] = dfx.apply(lambda x: fuzz.partial_ratio(x["question1"], x["question2"]), axis=1)
    dfx["longest_substr_ratio"] = dfx.apply(lambda x: get_longest_substr_ratio(x["question1"], x["question2"]), axis=1)
    return dfx
def _calculate_ratios(
    self,
    message: Message,
    member: Member,
    guild: Guild,
) -> None:
    """
    Calculates a messages relation to other messages

    Compares `message` against the member's message history with
    token_sort_ratio; anything at or above the configured
    message_duplicate_accuracy marks both messages as duplicates and
    bumps the internal duplicate counter. Raises DuplicateObject when the
    exact same message object is encountered in the history.
    """
    for message_obj in member.messages:
        # This calculates the relation to each other
        if message == message_obj:
            raise DuplicateObject
        elif (self.options(guild).per_channel_spam
                and message.channel_id != message_obj.channel_id):
            # This user's spam should only be counted per channel
            # and these messages are in different channel
            continue
        elif (fuzz.token_sort_ratio(message.content, message_obj.content)
                >= self.options(guild).message_duplicate_accuracy):
            """
            The handler works off an internal message duplicate counter
            so just increment that and then let our logic process it later
            """
            self._increment_duplicate_count(member, guild, channel_id=message.channel_id)
            message.is_duplicate = True
            message_obj.is_duplicate = True
            # Enough duplicates already counted: no need to scan further.
            if (self._get_duplicate_count(
                    member, channel_id=message.channel_id, guild=guild)
                    >= self.options(guild).message_duplicate_count):
                break
def calculate_distance(str1, str2):
    """Return a combined similarity score in [0, 1] for two strings.

    Averages (a) the mean of four fuzzywuzzy 0-100 ratios (normalized by
    dividing by 400) with (b) the mean of difflib's SequenceMatcher ratio
    and jellyfish's Jaro similarity.

    NOTE(review): ``str.decode`` below implies Python 2 byte strings --
    confirm the interpreter before porting.
    """
    # Replace non-ASCII runs, then decode: jellyfish wants unicode input.
    uni1 = re.sub(r'[^\x00-\x7F]+', ' ', str1).decode("utf-8", "ignore")
    uni2 = re.sub(r'[^\x00-\x7F]+', ' ', str2).decode("utf-8", "ignore")
    sm_score = SequenceMatcher(None, str1.lower(), str2.lower()).ratio()
    jar_score = jellyfish.jaro_distance(uni1, uni2)
    # FIX: the original also computed levenshtein_distance and
    # damerau_levenshtein_distance here but never used them in the
    # returned mean -- the dead computations are removed.
    fr_score = fuzz.ratio(str1, str2)
    fpr_score = fuzz.partial_ratio(str1, str2)
    ftsortr_score = fuzz.token_sort_ratio(str1, str2)
    ftsetr_score = fuzz.token_set_ratio(str1, str2)
    # Four 0-100 ratios -> normalize their sum into [0, 1].
    fmean_score = (fr_score + fpr_score + ftsortr_score + ftsetr_score) / 400.00
    jar_sm_score = (sm_score + jar_score) / 2
    mean = (fmean_score + jar_sm_score) / 2
    return mean
def get_similarity(self, sentence):
    """
    Takes in a sentence and returns how similar it is to another sentence

    :param sentence: element of self.sentences
    :type sentence: str
    :return: score of sentence
    :rtype: float
    """
    position = self.sentences.index(sentence)
    remaining = self.sentences[position + 1:]
    if not remaining:
        # Last sentence: nothing after it to compare against.
        return 0
    # Highest similarity against any later sentence, negated and scaled.
    best = max(fuzz.token_sort_ratio(other, sentence) for other in remaining)
    adjusted_score = -best / 100
    if adjusted_score < -0.85:
        # heavily decrement highly similar sentences
        return 2 * adjusted_score
    if adjusted_score < -0.6:
        # decrement the score of somewhat similar sentences
        return adjusted_score
    return 0  # ignore low levels of similarity
def search_api_LCS(self):
    """Query the LoC "suggest" API for ``self.name`` and score candidates.

    Each suggested name is compared with ``fuzz.token_sort_ratio``; every
    candidate above the ``self.th`` cut-off is recorded under
    ``self.scores['LCS']['lcid'][uri] = [name, score]``.

    :return: ``self.scores`` when at least one candidate qualified,
        otherwise ``None`` (implicit).
    """
    # dictionary for storing scores for this API
    self.scores = {}
    # The API call url
    suggest = ("http://id.loc.gov" + self.query_type + '/suggest/?q='
               + urllib.parse.quote(self.name.encode('utf8')))
    try:
        # decode the JSON; the payload is positional: [query, names, ..., uris]
        suggest_result = requests.get(suggest).json()
        # iterate over all results
        for n in range(len(suggest_result[1])):
            # get the "name"
            candidateS = suggest_result[1][n]
            # get the URI (LC ID)
            uriS = suggest_result[3][n].replace(
                'http://id.loc.gov/authorities/names/', '')
            self.scoreSU = fuzz.token_sort_ratio(candidateS, self.name)
            # if the similarity score is greater than the cut-off, record it
            if self.scoreSU > self.th:
                # BUG FIX: the original re-created scores['LCS'] (and its
                # 'lcid' dict) on every qualifying candidate, so only the
                # LAST match survived; setdefault accumulates all of them.
                lcids = self.scores.setdefault('LCS', {}).setdefault("lcid", {})
                lcids[uriS] = [candidateS, self.scoreSU]
    except:  # noqa: E722 -- broad handling kept deliberately; errors are logged
        PrintException(self.log_file, self.name)
    if len(self.scores) > 0:
        return self.scores
def dsmatch(name, dataset, fn):
    """
    Fuzzy search best matching object for string name in dataset.

    Args:
        name (str): String to look for
        dataset (list): List of objects to search for
        fn (function): Function to obtain a string from a element of the dataset

    Returns:
        First element with the maximun fuzzy ratio.
    """
    max_ratio = 0
    matching = None
    # PERF: normalize(name) is loop-invariant -- compute it once instead of
    # once per dataset element (assumes normalize is pure, as its use implies).
    norm_name = normalize(name)
    for e in dataset:
        if fuzz and name:
            ratio = fuzz.token_sort_ratio(norm_name, normalize(fn(e)))
            if ratio > max_ratio:
                max_ratio = ratio
                matching = e
        elif norm_name == normalize(fn(e)):
            # Exact-match fallback when fuzz is unavailable or name is empty.
            matching = e
            break
    return matching
def recommend(project_id: str, model_id: str, text: str) -> List[str]:
    """
    Recommend which intents have similar templates to [text]

    :param project_id: project to read templates from
    :param model_id: model to read templates from
    :param text: texto to look similarities
    :return: list with intents, ordered by mean similarity ratio (best first)
    """
    intents = defaultdict(lambda: defaultdict(int))
    templates = read_templates(project_id, model_id)
    for example in templates:
        intent_name = read_name_by_id(project_id, example["intent_id"]["$oid"])
        ratio = fuzz.token_sort_ratio(text, example["name"])
        if ratio > 70:  # similarity threshold for counting a template
            intents[intent_name]["times"] += 1
            intents[intent_name]["ratio"] += ratio
    # PERF/clarity FIX: the original recomputed mean_ratio on every loop
    # iteration; the final value only depends on the totals, so compute it
    # once per intent after the tally. Every entry has times >= 1 here.
    for stats in intents.values():
        stats["mean_ratio"] = stats["ratio"] / stats["times"]
    return sorted(intents,
                  key=lambda name: intents[name]["mean_ratio"],
                  reverse=True)
def _fuzzy_match(self, t_index, max_t):
    # generic fuzzy matching
    """Score ``self._content`` against each thot headline in the timeline.

    :param t_index: candidate strings handed to ``process.extract``
    :param max_t: how many top matches to keep under the ``top<max_t>`` key
    :return: dict mapping each headline to its four fuzz scores, plus a
        ``top<max_t>`` entry with the extract results; also cached on
        ``self._match_stats`` and ``self._pot_acts``.
    """
    m_stats = {}  # matching statistics
    content = self._content
    # Entries already captured in self._pot_acts are skipped below.
    # NOTE(review): assumes timeline keys are comparable with the first
    # element of each _pot_acts entry -- confirm against process.extract.
    found = [] if not self._pot_acts else [th[0] for th in self._pot_acts]
    for tn in self._mind.get_timeline()[::-1]:  # newest first
        if tn in found:
            continue
        thot = self._mind.get_thots([tn])[tn]
        if not thot._body:
            continue  # skip thots with no body text
        thot = thot._head
        m_stats[thot] = {}
        # Record all four fuzzywuzzy similarity flavours for this headline.
        m_stats[thot]['ratio'] = fuzz.ratio(content, thot)
        m_stats[thot]['partial_ratio'] = fuzz.partial_ratio(content, thot)
        m_stats[thot]['token_sort_ratio'] = fuzz.token_sort_ratio(
            content, thot)
        m_stats[thot]['token_set_ratio'] = fuzz.token_set_ratio(
            content, thot)
    # Best max_t candidates from t_index, stored alongside the raw stats.
    m_stats['top%s' % max_t] = process.extract(content, t_index, limit=max_t)
    self._match_stats = m_stats
    self._pot_acts = m_stats['top%s' % max_t]
    return m_stats
def test_thing_wikidata_query_strict_False():
    """Thing - wikidata - strict_mode=True, check_type=False: should pass.

    NOTE(review): the function NAME says ``strict_False`` but the call below
    passes ``strict_mode=True`` -- confirm which was intended.
    """
    thing = Thing(label=u"혁kστ혁ηjh혁kي혁ةsjdジアh", query_language=Lang.DEFAULT)
    thing.add_query_endpoint(Endpoint.wikidata)
    thing.query(strict_mode=True, check_type=False)
    assert thing.endpoints == set([Endpoint.wikidata])
    assert thing.has_label == u'혁kστ혁ηjh혁kي혁ةsjdジアh'
    # Lang.DEFAULT is expected to resolve to English.
    assert thing.query_language == Lang.English
    expected_query = u'''
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT DISTINCT ?Thing ?pred ?obj WHERE {
 SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
 ?Thing ?pred ?obj .
 { ?Thing rdfs:label "혁kστ혁ηjh혁kي혁ةsjdジアh"@en } UNION { ?Thing wdt:P1813 "혁kστ혁ηjh혁kي혁ةsjdジアh"@en } .
} LIMIT 1500'''
    # Fuzzy comparison tolerates whitespace/ordering drift in the builder.
    ratio = fuzz.token_sort_ratio(
        thing.query_builder.queries[Endpoint.wikidata], expected_query)
    assert ratio > 90
def fuzzy_search(query, products_list):
    """Return up to 10 products fuzzily matching *query*, best-scored first.

    The last whitespace token of the query must match a product's
    ``name + ' ' + brand`` string exactly (token_set_ratio == 100); the
    remaining prefix tokens must also fully match, and rank the results.
    """
    # split the query into a list of tokens
    all_tokens = query.split(' ')
    main_token = all_tokens[-1]
    # remove the main_token from the list of tokens
    all_tokens.pop()
    # BUG FIX: fuzzywuzzy scorers operate on strings; the original passed
    # the raw token LIST, which the scorer cannot process. Join it.
    prefix_tokens = ' '.join(all_tokens)

    main_token_matches = []
    for product in products_list:
        product_string = product.name + ' ' + product.brand
        if fuzz.token_set_ratio(main_token, product_string) == 100:
            main_token_matches.append(product)

    matching_products = []
    for product in main_token_matches:
        product_string = product.name + ' ' + product.brand
        match = fuzz.token_set_ratio(prefix_tokens, product_string)
        score = fuzz.token_sort_ratio(prefix_tokens, product_string)
        if match == 100:
            product.set_score(score)
            matching_products.append(product)

    # Best scores first; cap the result at the 10 strongest matches.
    matching_products.sort(key=lambda p: p.score, reverse=True)
    return matching_products[:10]
def get_file_url(self, file_string, line):
    """Resolve *file_string* to a GitHub blob URL in this repo.

    Candidate paths from ``self.current_paths`` are pre-filtered with
    token_sort_ratio >= 75, then the single best is picked with
    process.extractOne; returns None when nothing matches.
    """
    target = file_string
    user_match = self.file_pattern.search(file_string)
    if user_match:
        target = user_match.group(1)

    candidates = []
    for path in self.current_paths:
        trimmed = path
        path_match = self.file_pattern.search(path)
        if path_match:
            trimmed = path_match.group(1)
        # Keep any repo path whose sorted-token similarity clears the bar.
        if fuzz.token_sort_ratio(target, trimmed) >= 75:
            candidates.append(path)

    best = process.extractOne(file_string, candidates,
                              scorer=fuzz.token_set_ratio)
    if not best:
        return None
    return "https://github.com/{own}/{repo}/blob/master/{p}{l}".format(
        own=self.owner,
        repo=self.repo,
        p=best[0],
        l=line if line else '')
def anotherSearch(field, data, results_number):
    """Rank entries of *data* by weighted fuzzy similarity to *field*.

    :param field: query string
    :param data: mapping of key -> candidate string
    :param results_number: maximum number of results to return
    :return: list of (score, key) tuples, best first
    """
    from fuzzywuzzy import fuzz

    aux = {}
    # Relative weights for the three fuzzywuzzy measures.
    mR = 4  # simple ratio
    mP = 2  # partial ratio
    mS = 4  # token sort ratio
    total_weight = mR + mP + mS
    for k in data:
        value_ratio = fuzz.ratio(field, data[k])
        value_part = fuzz.partial_ratio(field, data[k])
        value_sort = fuzz.token_sort_ratio(field, data[k])
        aux[k] = int((value_part * mP + value_ratio * mR + value_sort * mS)
                     / total_weight)
    # PY3 FIX: dict.iteritems() and tuple-unpacking lambdas are
    # Python-2-only syntax; .items() with an indexing key works on both.
    sorted_results = sorted(aux.items(), key=lambda kv: kv[1], reverse=True)
    # Return at most results_number entries, flipped to (score, key).
    return [(score, key) for key, score in sorted_results[:results_number]]
def _is_rdns_match(rdns_list_a: List[x509.RelativeDistinguishedName],
                   rdns_list_b: List[x509.RelativeDistinguishedName]) \
        -> Tuple[bool, List[Tuple[x509.NameAttribute, x509.NameAttribute]]]:
    """
    Performs fuzzy search to check if two RDNS records are the same.
    Only checks if the record type exists in both

    :param rdns_list_a: first list of relative distinguished names
    :param rdns_list_b: second list of relative distinguished names
    :return: (match_flag, diff_list) -- match_flag is False when any shared
        OID disagrees; diff_list holds the disagreeing (attr_a, attr_b) pairs
    """
    retval = True  # default to assuming same
    diff_list = list()
    # All-pairs comparison: every attribute of A against every attribute of B.
    for rdns_a in rdns_list_a:
        for name_attr_a in rdns_a:
            for rdns_b in rdns_list_b:
                for name_attr_b in rdns_b:
                    assert isinstance(name_attr_a, x509.NameAttribute)
                    assert isinstance(name_attr_b, x509.NameAttribute)
                    # if OID matches, compare their values
                    if name_attr_a.oid == name_attr_b.oid and name_attr_a.value != name_attr_b.value:
                        # does fuzzy search to check if there is < 80% match b/w values
                        if fuzz.token_sort_ratio(name_attr_a.value, name_attr_b.value) < 80 and not \
                                _special_case_nameattr_equivalence(name_attr_a, name_attr_b):
                            retval = False
                            diff_list.append((name_attr_a, name_attr_b))
    return retval, diff_list
def fastExact_function(search_subject):
    """Look up *search_subject* in the OCLC FAST suggest API.

    Records the first authority heading that matches exactly (or with
    token_sort_ratio >= 95) into the module-level ``result_dict`` under
    the keys ``auth_name`` and ``fast_id``. Silently ignores non-JSON
    responses.
    """
    fast_url = api_base_url + '?&query=' + search_subject
    fast_url += '&queryIndex=suggestall&queryReturn=suggestall,idroot,auth,tag,raw&suggest=autoSubject&rows=5&wt=json'
    try:
        data = requests.get(fast_url).json()
        response = data.get('response')
        if response and response.get('numFound') > 0:
            for info in response.get('docs', []):
                auth_name = info.get('auth')
                fast_id = info.get('idroot')
                ratio = fuzz.token_sort_ratio(auth_name, search_subject)
                # BUG FIX: the original tested `ratio == 95`, which accepted
                # ONLY a score of exactly 95 and skipped 96-99 near-matches;
                # the threshold is now >= 95.
                if auth_name == search_subject or ratio >= 95:
                    result_dict['auth_name'] = auth_name
                    result_dict['fast_id'] = fast_id
                    break
    except ValueError:
        # Response body was not valid JSON; leave result_dict untouched.
        pass
def fungsi():
    """Interactive menu loop: compare two strings with a chosen fuzz scorer."""
    # Menu choice -> fuzzywuzzy scorer.
    scorers = {
        "1": fuzz.ratio,
        "2": fuzz.partial_ratio,
        "3": fuzz.token_sort_ratio,
        "4": fuzz.token_set_ratio,
    }
    while True:
        menu()
        pil = input("Pilhan anda : ")
        if pil in scorers:
            kata1 = input("Masukan kata 1 : ")
            kata2 = input("Masukan kata 2 : ")
            print("Nilai : ", scorers[pil](kata1, kata2))
        elif pil == "5":
            exit()
        else:
            print("Pilihan anda salah!!")
def print_sim_compare(t1, t2, stopwords):
    """
    Debug/testing function. Prints out similarity scores.

    :param t1: str, text to compare
    :param t2: str, text to compare against
    :param stopwords: stopword collection passed to tokenize_and_stem
    """
    t1 = get_nouns(tokenize_and_stem(t1.strip(), stopwords))
    t2 = get_nouns(tokenize_and_stem(t2.strip(), stopwords))
    # PERF: join the noun tokens once -- the original rebuilt both strings
    # for every one of the four fuzz calls below.
    s1 = " ".join(t1)
    s2 = " ".join(t2)
    print('Jaccard:', jaccard_similarity(t1, t2))
    print('Ratio:', fuzz.ratio(s1, s2) / 100)
    print('Partial Ratio:', fuzz.partial_ratio(s1, s2) / 100)
    print('Token Set Ratio:', fuzz.token_set_ratio(s1, s2) / 100)
    print('Token Sort Ratio:', fuzz.token_sort_ratio(s1, s2) / 100)
    # Calculate similarity score
    print("noun tokens", t1, t2)
    score = calc_similarity_score(t1, t2)
    print("Score would be:", score)
def matching(value1, value2, factor=None):
    """Fuzzy-compare two strings with four fuzzywuzzy measures.

    :param value1: first string
    :param value2: second string
    :param factor: optional threshold; when given, return True if ANY
        measure is >= factor, otherwise return the mean of the measures
        over the count of non-zero ones (0 when all are zero).
    :rtype : object
    """
    # Lowercase once and reuse -- the original lowercased both strings
    # ten times.
    a = value1.lower()
    b = value2.lower()
    fuzzy = [
        fuzz.ratio(a, b),
        fuzz.partial_ratio(a, b),
        fuzz.token_set_ratio(a, b),
        fuzz.token_sort_ratio(a, b),
    ]

    log.debug("=" * 50)
    log.debug('Fuzzy Compare: {} - {}'.format(a, b))
    log.debug("-" * 50)
    log.debug('{}: Simple Ratio'.format(fuzzy[0]))
    log.debug('{}: Partial Ratio'.format(fuzzy[1]))
    log.debug('{}: Token Set Ratio'.format(fuzzy[2]))
    log.debug('{}: Token Sort Ratio'.format(fuzzy[3]))

    if factor:
        # BUG FIX: the debug message computed `fr > factor` while the
        # function returned `fr >= factor`, so the log could contradict
        # the result at exact-threshold scores. Both now use >=.
        hit = any(fr >= factor for fr in fuzzy)
        log.debug('Return with Factor - {}: {}'.format(factor, hit))
        return hit

    # Mean over the number of non-zero scores (zeros add nothing to the sum).
    nonzero = [fr for fr in fuzzy if fr > 0]
    score = sum(fuzzy) / len(nonzero) if nonzero else 0
    log.debug('Return without Factor - Score: {}'.format(score))
    return score
def update(self, events) -> None:
    """Per-frame door update: draw, toggle the question prompt on clicks,
    and check typed answers against ``self.answer`` with a fuzzy match.

    :param events: pygame event list for this frame
    """
    screen.blit(self.image, self.rect)
    if not self.done:
        # Clicking the door opens the question prompt.
        if self.rect.collidepoint(pygame.mouse.get_pos()) and \
                pygame.mouse.get_pressed()[0] and not self.asking:
            self.asking = True
            door_open.play()
        else:
            # Clicking anywhere off the door while asking closes the prompt.
            if self.asking and not self.rect.collidepoint(
                    pygame.mouse.get_pos()) and \
                    pygame.mouse.get_pressed()[0]:
                self.asking = False
                door_close.play()
        if self.asking:
            # Render the text-input box centred above the door.
            self.input.update(events)
            inp = self.input.get_surface()
            inp_rect = inp.get_rect()
            inp_rect.centerx = WIDTH // 2
            inp_rect.y = self.rect.y - 40
            screen.blit(inp, inp_rect)
            # Render the question centred near the top of the screen.
            q = question_font.render(self.question, True, BLACK)
            screen.blit(q, (WIDTH // 2 - q.get_width() // 2, 50))
            for event in events:
                if event.type == KEYDOWN:
                    if event.key == K_RETURN:
                        # Fuzzy match tolerates typos: > 85 counts as correct.
                        if fuzz.token_sort_ratio(self.answer,
                                                 self.input.get_text()
                                                 ) > 85:
                            self.asking = False
                            self.image = self.door_done
                            self.done = True
                            door_close.play()
                            player.pass_door(self.part_name, self.question)
                        else:
                            wrong_sound.play()
def extract_features(df):
    """Append token-based, fuzzy and substring features to *df*; returns it."""
    df["question1"] = df["question1"].fillna("").apply(preprocess)
    df["question2"] = df["question2"].fillna("").apply(preprocess)

    print("token features...")
    token_features = df.apply(
        lambda x: get_token_features(x["question1"], x["question2"]), axis=1)
    # get_token_features yields a fixed-order 10-tuple; fan it out by index.
    for idx, name in enumerate(
            ("cwc_min", "cwc_max", "csc_min", "csc_max", "ctc_min", "ctc_max",
             "last_word_eq", "first_word_eq", "abs_len_diff", "mean_len")):
        df[name] = [t[idx] for t in token_features]

    print("fuzzy features..")

    def _row_score(fn):
        # Apply a two-string scorer row-wise over the question pair.
        return df.apply(lambda x: fn(x["question1"], x["question2"]), axis=1)

    df["token_set_ratio"] = _row_score(fuzz.token_set_ratio)
    df["token_sort_ratio"] = _row_score(fuzz.token_sort_ratio)
    df["fuzz_ratio"] = _row_score(fuzz.QRatio)
    df["fuzz_partial_ratio"] = _row_score(fuzz.partial_ratio)
    df["longest_substr_ratio"] = _row_score(get_longest_substr_ratio)
    return df
def build_handcraft_text_feats(tmp_df):
    """Build length, overlap, fuzzywuzzy and textdistance features from the
    ``title_1``/``title_2`` columns.

    Returns a copy of the frame with the feature columns added and the two
    title columns dropped; the input frame is left untouched.
    """
    df = tmp_df.copy()

    def _pair(fn):
        # Apply a two-string metric row-wise over the stringified title pair.
        return df.apply(
            lambda r: fn(str(r['title_1']), str(r['title_2'])), axis=1)

    # Raw and relative length features.
    df['len_title_1'] = df.title_1.apply(lambda x: len(str(x)))
    df['len_title_2'] = df.title_2.apply(lambda x: len(str(x)))
    df['diff_len'] = df.len_title_1 - df.len_title_2
    df['abs_diff_len'] = abs(df.len_title_1 - df.len_title_2)
    # Count of DISTINCT non-space characters per title.
    df['len_char_title_1'] = df.title_1.apply(
        lambda x: len(''.join(set(str(x).replace(' ', '')))))
    df['len_char_title_2'] = df.title_2.apply(
        lambda x: len(''.join(set(str(x).replace(' ', '')))))
    df['len_word_title_1'] = df.title_1.apply(lambda x: len(str(x).split()))
    df['len_word_title_2'] = df.title_2.apply(lambda x: len(str(x).split()))
    # Case-insensitive shared-word count.
    df['common_words'] = df.apply(
        lambda x: len(set(str(x['title_1']).lower().split()).intersection(
            set(str(x['title_2']).lower().split()))),
        axis=1)

    # fuzzywuzzy similarity flavours.
    df['fuzz_qratio'] = _pair(fuzz.QRatio)
    df['fuzz_WRatio'] = _pair(fuzz.WRatio)
    df['fuzz_partial_ratio'] = _pair(fuzz.partial_ratio)
    df['fuzz_partial_token_set_ratio'] = _pair(fuzz.partial_token_set_ratio)
    df['fuzz_partial_token_sort_ratio'] = _pair(fuzz.partial_token_sort_ratio)
    df['fuzz_token_set_ratio'] = _pair(fuzz.token_set_ratio)
    df['fuzz_token_sort_ratio'] = _pair(fuzz.token_sort_ratio)

    # textdistance normalized similarities.
    df['txt_hamming'] = _pair(textdistance.hamming.normalized_similarity)
    df['txt_damerau_levenshtein'] = _pair(
        textdistance.damerau_levenshtein.normalized_similarity)
    df['txt_jaro_winkler'] = _pair(
        textdistance.jaro_winkler.normalized_similarity)
    df['txt_overlap'] = _pair(textdistance.overlap.normalized_similarity)
    df['txt_mra'] = _pair(textdistance.mra.normalized_similarity)

    df.drop(columns=['title_1', 'title_2'], inplace=True)
    return df
def calculateMetric(self, strA, strB, metric):
    """Compute the named string-similarity metric on the two strings.

    :param metric: one of the fuzzywuzzy / Levenshtein metric names, or
        'longestnumericseq'
    :return: the metric value, or None when *metric* is unrecognised
    """
    # Name -> scoring callable; all take (strA, strB).
    dispatch = {
        'ratio': fuzz.ratio,
        'partial_ratio': fuzz.partial_ratio,
        'token_sort_ratio': fuzz.token_sort_ratio,
        'token_set_ratio': fuzz.token_set_ratio,
        'distance': Levenshtein.distance,
        'l_ratio': Levenshtein.ratio,
        'jaro': Levenshtein.jaro,
        'jaro_winkler': Levenshtein.jaro_winkler,
        'setratio': Levenshtein.setratio,
        'seqratio': Levenshtein.seqratio,
        'longestnumericseq': longestNumericSubstringMetric,
    }
    scorer = dispatch.get(metric)
    return scorer(strA, strB) if scorer is not None else None
def get_largest_match(title, skills, ngram):
    """For each skill, find how well it matches any n-gram of *title*.

    :param title: string sliced into n-grams via get_ngrams
    :param skills: iterable of candidate skill strings
    :param ngram: n-gram size passed to get_ngrams
    :return: list of (skill, best_ratio) tuples sorted best-first
    """
    dic = {}
    # PERF FIX: the n-gram decomposition depends only on the title, so
    # compute it once -- the original rebuilt it for every skill.
    n_gram_list = get_ngrams(title, ngram)
    for sk in skills:
        match_degree = 0
        # BUG FIX: reset the matched skill per iteration; the original let
        # it carry over, so a skill with no matching n-gram overwrote the
        # PREVIOUS skill's score in `dic` with 0.
        match_skill = ''
        for gram in n_gram_list:
            # Similarity between this skill and the current title n-gram.
            similarity_degree = fuzz.token_sort_ratio(sk, gram)
            if similarity_degree > match_degree:
                match_degree = similarity_degree
                match_skill = sk
        dic[match_skill] = match_degree
    return sorted(dic.items(), key=lambda x: x[1], reverse=True)
def search_api_LC(self):
    """Query the LoC "didyoumean" API for ``self.name`` and score each term.

    Every suggested term scoring above ``self.th`` by token_sort_ratio is
    recorded under ``self.scores['LC']['lcid'][uri] = [term, score]``.

    :return: ``self.scores`` when at least one candidate qualified,
        otherwise ``None`` (implicit).
    """
    # dictionary for storing scores for this API
    self.scores = {}
    # The API call url
    dym = ("http://id.loc.gov" + self.query_type + "/didyoumean/?label="
           + urllib.parse.quote(self.name.encode('utf8')))
    try:
        dym_result = requests.get(dym)
        # get the results in form of a XML tree
        dym_results = ETree.fromstring(dym_result.content)
        for result in dym_results.iter(
                '{http://id.loc.gov/ns/id_service#}term'):
            # get the "name"
            candidateD = result.text
            # get the URI (LC ID)
            uriD = result.get('uri')
            scoreD = fuzz.token_sort_ratio(candidateD, self.name)
            # if the similarity score is greater than the cut-off, record it
            if scoreD > self.th:
                # BUG FIX: the original re-created scores['LC'] (and its
                # 'lcid' dict) on every qualifying candidate, keeping only
                # the LAST match; setdefault accumulates all of them.
                lcids = self.scores.setdefault('LC', {}).setdefault('lcid', {})
                lcids[uriD] = [candidateD, scoreD]
    except:  # noqa: E722 -- broad handling kept deliberately; errors are logged
        PrintException(self.log_file, self.name)
    if len(self.scores) > 0:
        return self.scores