def get_byline_image(self, force_new=False):
    """Find and attach a byline photo for this contributor.

    Fuzzy-matches this person's name (both "First Last" and "Last First"
    order) against the jpg filenames in settings.BYLINE_PHOTO_DIR. On a
    match above 90, saves the file into a new ProfileImage, stores it on
    self.byline_photo and returns it; otherwise returns None implicitly.

    force_new -- when True, search again even if a photo is already set.
    """
    slugify = Slugify()
    # Reuse the cached photo unless explicitly asked to refresh.
    if not force_new and self.byline_photo:
        return self.byline_photo
    imagefiles = glob.glob(os.path.join(settings.BYLINE_PHOTO_DIR, "*.jpg"))
    name = self.name
    # "First Last" -> "Last First" so files named either way can match.
    name_last_first = re.sub(r"^(.*) (\S+)$", r"\2 \1", name)
    name_slug_title = slugify(name) + ".jpg"
    name_slug = name_slug_title.lower()
    name_slug_reverse = slugify(name_last_first).lower() + ".jpg"
    # Threshold 90: anything at or below is treated as no match.
    bestratio = 90
    bestmatch = None
    for path in imagefiles:
        filename = os.path.split(path)[1].lower()
        ratio = max(fuzz.ratio(filename, name_slug), fuzz.ratio(filename, name_slug_reverse))
        if ratio > bestratio:
            bestmatch = path
            bestratio = ratio
            if ratio == 100:
                # Exact filename match; no point scanning further.
                break
    if bestmatch:
        # NOTE(review): msg reports `ratio` (last loop value), not
        # `bestratio`; they differ if a later file scored lower — confirm
        # this is only used for debug logging.
        msg = "found match: name:{}, img:{}, ratio:{} ".format(name_slug, bestmatch, ratio)
        logger.debug(msg)
        with open(bestmatch, "rb") as source:
            content = File(source)
            img = ProfileImage()
            # Persist the matched file under the slugified title.
            img.source_file.save(name_slug_title, content)
        self.byline_photo = img
        self.save()
        return img
def findCoord(fileName, sttlReg, fWriter):
    """
    Finds the coordinates, last region and the province belonging to
    for the sttls from cornu file and write them all together in a csv file.

    fileName -- path to the cornu JSON file (utf-8).
    sttlReg  -- "name-<field1>-<field2>" string; the part before the first
                '-' is the settlement name to look up.
    fWriter  -- csv.writer-like object rows are appended to.
    """
    sttlName = sttlReg.split('-')[0]
    with open(fileName, "r", encoding="utf8") as jsonFile:
        allData = json.load(jsonFile)
        for d in allData["data"]:
            fName = d["arTitle"]
            # arTitleOther holds comma-separated alternative titles.
            sName = d["arTitleOther"].split(",")
            # Fuzzy-compare the normalized Arabic settlement name with the
            # primary title (>= 90 counts as a match).
            if sttlReg and fuzz.ratio(normalizeArabic(sttlName), normalizeArabic(fName)) >= 90:
                fWriter.writerow([sttlName, fName, "/".join(sName), d["lat"], d["lon"], d["region"], sttlReg.split('-')[1], sttlReg.split('-')[2], d["eiSearch"], d["translitTitle"], fuzz.ratio(normalizeArabic(sttlName), normalizeArabic(fName))])
            else:
                # Fall back to the alternative titles, stopping at the
                # first one that matches.
                for n in sName:
                    n = n.strip()
                    if sttlReg and fuzz.ratio(normalizeArabic(sttlName), normalizeArabic(n)) >= 90:
                        fWriter.writerow([sttlName, fName, n, d["lat"], d["lon"], d["region"], sttlReg.split('-')[1], sttlReg.split('-')[2], d["eiSearch"], d["translitTitle"], fuzz.ratio(normalizeArabic(sttlName), normalizeArabic(n))])
                        break
def get_byline_image(self, force_new=False):
    """Find and attach a byline photo for this contributor.

    Fuzzy-matches the person's name (both word orders) against jpg
    filenames in BYLINE_PHOTO_FOLDER; on a match above 90, gets or
    creates an ImageFile for the path, autocrops it, stores it on
    self.byline_photo and returns it. Returns None implicitly when no
    file matches.

    force_new -- when True, search again even if a photo is already set.
    """
    slugify = Slugify(to_lower=True)
    if not force_new and self.byline_photo:
        return self.byline_photo
    imagefiles = glob.glob(BYLINE_PHOTO_FOLDER + '/*.jpg')
    name = self.name.lower()
    # "first last" -> "last first" so files named either way can match.
    name_last_first = re.sub(r'^(.*) (\S+)$', r'\2 \1', name)
    name_slug = slugify(name) + '.jpg'
    name_slug_reverse = slugify(name_last_first) + '.jpg'
    # Scores at or below 90 are treated as no match.
    bestratio = 90
    bestmatch = None
    for path in imagefiles:
        filename = os.path.split(path)[1].lower()
        ratio = max(
            fuzz.ratio(filename, name_slug),
            fuzz.ratio(filename, name_slug_reverse)
        )
        if ratio > bestratio:
            bestmatch = path
            bestratio = ratio
            if ratio == 100:
                # Exact match; stop scanning.
                break
    if bestmatch:
        msg = 'found match: name:{}, img:{}, ratio:{} '.format(
            name_slug, bestmatch, ratio)
        logger.debug(msg)
        img, _ = ImageFile.objects.get_or_create(source_file=bestmatch)
        img.autocrop()
        self.byline_photo = img
        self.save()
        return img
def has_similar(self, url): ''' ''' if url == "": return (False,[], []) if self.has_node( url): return (True,[], self.search_url_index(url)) else: max_rate = 0 url_split_list, host_po = up.url_split( url) max_list =[] max_url = "" rate = 0 for my_url in self.treeContent: if fuzz.ratio( os.path.splitext( my_url)[1], os.path.splitext( url)[1])<SIMILAR_THRESHOLD: # if not has the same expend name continue if (fuzz.ratio(url, my_url)/100.0)< self.SIMILAR_THRESHOLD: continue my_url_list, my_host_po = up.url_split(my_url) rate, dismatch_list = up.url_list_compare(url_split_list, my_url_list) rate = max( fuzz.ratio(url, my_url)/100.0, rate) if max_rate < rate: max_rate = rate max_list = dismatch_list max_url = my_url if max_rate > self.SIMILAR_THRESHOLD and max_url != "": print "SIMILAR URL(Tree.has_similar):\n",max_url,"\n",url return (True, dismatch_list, self.search_url_index(max_url)) return (False, max_list, [])
def parseTweet(tweet):
    """Tag a tweet with location / transport / service keywords.

    Fuzzy-matches each word of the tweet text against LOCATIONS,
    TRANSPORT_MAP and SERVICE_MAP. The first match mutates the incoming
    tweet; every later match clones it (cloneTweet) so one tweet can
    yield several tagged copies. Returns the list of tagged tweets.

    FIX: the "nagar" suffix path concatenated `prevWord + word` even for
    the first word, when prevWord is still None, raising TypeError; the
    combined check now only runs once a previous word exists.
    """
    tweet['created_at'] = tweet['created_at'].split('+')[0]
    tweet['timestampint'] = int(tweet['timestamp_ms'])
    isFirst = True
    words = map(lambda x: x.lower(), tweet["text"].split(' '))
    newTweets = []
    theTweet = tweet
    prevWord = None
    for location in LOCATIONS:
        for word in words:
            score = fuzz.ratio(word, location)
            if score > scoreMap[location]:
                if not isFirst:
                    tweet = cloneTweet(theTweet)
                tweet["location"] = location
                isFirst = False
                newTweets.append(tweet)
                break
            # Handle split names like "xyz nagar" by re-joining the
            # previous word with the suffix before matching.
            score = fuzz.ratio(word, 'nagar')
            if score > 90 and prevWord is not None:
                score = fuzz.ratio(prevWord + word, location)
                if score > scoreMap[location]:
                    if not isFirst:
                        tweet = cloneTweet(theTweet)
                    tweet["location"] = location
                    isFirst = False
                    newTweets.append(tweet)
                    break
            prevWord = word
    for key, values in TRANSPORT_MAP.iteritems():
        done = False
        for value in values:
            if done:
                break
            for word in words:
                score = fuzz.ratio(word, value)
                if score > scoreMap[value]:
                    if not isFirst:
                        tweet = cloneTweet(theTweet)
                    tweet["transport"] = key
                    isFirst = False
                    newTweets.append(tweet)
                    done = True
                    break
    for key, values in SERVICE_MAP.iteritems():
        done = False
        for value in values:
            if done:
                break
            for word in words:
                score = fuzz.ratio(word, value)
                if score > scoreMap[value]:
                    if not isFirst:
                        tweet = cloneTweet(theTweet)
                    tweet["service"] = key
                    isFirst = False
                    newTweets.append(tweet)
                    done = True
                    break
    return newTweets
def _employees(self, company_name="", keyword=""): ''' Linkedin Scrape ''' # TODO - add linkedin directory search ''' Linkedin Scrape''' args = '-inurl:"/dir/" -inurl:"/find/" -inurl:"/updates"' args = args+' -inurl:"job" -inurl:"jobs2" -inurl:"company"' qry = '"at {0}" {1} {2} site:linkedin.com' qry = qry.format(company_name, args, keyword) results = Google().search(qry, 10) results = results.dropna() results = Google()._google_df_to_linkedin_df(results) _name = '(?i){0}'.format(company_name) if " " in company_name: results['company_score'] = [fuzz.partial_ratio(_name, company) for company in results.company] else: results['company_score'] = [fuzz.ratio(_name, company) for company in results.company] if keyword != "": results['score'] = [fuzz.ratio(keyword, title) for title in results.title] results = results[results.score > 75] results = results[results.company_score > 64] results = results.drop_duplicates() data = {'data': results.to_dict('r'), 'company_name':company_name} CompanyExtraInfoCrawl()._persist(data, "employees", "") job = rq.get_current_job() print job.meta.keys() if "queue_name" in job.meta.keys(): if RQueue()._has_completed(job.meta["queue_name"]): q.enqueue(Jigsaw()._upload_csv, job.meta["company_name"]) return results
def Handle(self, query):
    """Fuzzy-search registered voters by first and last name.

    Filters to voters sharing the query's initials, keeps those whose
    last name scores above RatioThreshold, then narrows by first name,
    and returns the top MaxResults sorted by average ratio.

    FIX: the ratio attribute names were swapped — `first_name_ratio` was
    being assigned the LAST-name comparison and vice versa. The names now
    match what is compared. ratioAvg() presumably averages both ratios,
    so result ordering is unchanged — confirm against the Voter model.
    """
    voters = Voter.objects.exclude(last_name=None).exclude(first_name=None)
    ## Validate and initialize response results
    validationMessages = self.Validate(query)
    firstNameMatches = []
    lastNameMatches = []
    if validationMessages == None:
        ## Pull voters with the same initials
        voters = self.FilterByInitial(query, voters)
        ## Run search algorithm on first and last names
        for voter in voters:
            voter.last_name_ratio = fuzz.ratio(query.last_name, voter.last_name)
            if voter.last_name_ratio > SearchVotersQueryHandler.RatioThreshold:
                lastNameMatches.append(voter)
        for voter in lastNameMatches:
            voter.first_name_ratio = fuzz.ratio(query.first_name, voter.first_name)
            if voter.first_name_ratio > SearchVotersQueryHandler.RatioThreshold:
                firstNameMatches.append(voter)
        # Best combined matches first.
        firstNameMatches.sort(key=lambda x: x.ratioAvg(), reverse=True)
    return {
        'voters': firstNameMatches[:SearchVotersQueryHandler.MaxResults],
        'searchTerm': query,
        'validation': validationMessages
    }
def fuzzy_match(college_niche_row, admithub_name, highest_match):
    """Compare a Niche college name (row[1]) against an AdmitHub name.

    Both names are lowercased and hyphens are replaced with spaces before
    fuzzy comparison. Returns (highest_match, higher) where highest_match
    is the running best score and higher is True iff this row beat it.

    FIX: the identical fuzz.ratio call was evaluated twice (once in the
    comparison, once in the assignment); it is now computed once.
    """
    higher = False
    college_niche_name = college_niche_row[1]
    ratio = fuzz.ratio(re.sub(r"-", " ", college_niche_name.lower()),
                       re.sub(r"-", " ", admithub_name.lower()))
    if ratio > highest_match:
        highest_match = ratio
        higher = True
    return highest_match, higher
def _get_tvmaze_data(self, show, title):
    """Resolve a show and episode against the TVMaze API.

    Searches TVMaze for `show` (caching the id on self.tvmaze_show_id),
    downloads the episode list (cached on self.tvmaze_episode_data), then
    fuzzy-matches `title` against episode names and returns the
    best-matching episode dict.

    Raises ValueError when no show could be matched.
    """
    show_r = requests.get('http://api.tvmaze.com/search/shows', params={'q': show})
    show_search_data = json.loads(show_r.text)
    best_show_ratio = 0
    best_show_name = None
    if not self.tvmaze_show_id:
        for i in show_search_data:
            show_ratio = fuzz.ratio(show, i['show']['name'])
            if show_ratio > best_show_ratio:
                self.tvmaze_show_id = i['show']['id']
                best_show_ratio = show_ratio
                best_show_name = i['show']['name']
                # NOTE(review): this break fires on the FIRST result whose
                # ratio beats 0, so the loop never actually searches for
                # the best match — it trusts TVMaze's first result.
                # Confirm this early-exit is intended.
                break
        if self.tvmaze_show_id is None:
            raise ValueError('Could not match TV show data')
        print('I believe the show name is {}'.format(best_show_name))
    if not self.tvmaze_episode_data:
        episode_r = requests.get('http://api.tvmaze.com/shows/{}/episodes?specials=1'.format(self.tvmaze_show_id))
        self.tvmaze_episode_data = json.loads(episode_r.text)
    # Pick the episode whose name is closest to the given title.
    best_ratio = 0
    best_obj = None
    for i in self.tvmaze_episode_data:
        episode_ratio = fuzz.ratio(title, i['name'])
        if episode_ratio > best_ratio:
            best_ratio = episode_ratio
            best_obj = i
    print('I believe the episode name is {} (s{} e{})'.format(best_obj['name'], best_obj['season'], best_obj['number']))
    return best_obj
def fuzz_roadnames(roadn):
    """Print the road names in `roadn` that best match the target name.

    Scores every candidate against the module-level globals `testname`
    (lowercase target) and `testnameNoSP` (target with spaces removed) —
    assumed to be set by the caller before this runs; TODO confirm —
    then prints all candidates tied at each maximum score.
    """
    #create empty lists for testing
    fuzztest = []
    fuzztestNoSP = []
    #loop through road names and append the fuzzy result to the fuzztest and NoSpace lists
    for cn in roadn:
        #testname lowercase
        fuzztest.append(fuzz.ratio(cn,testname))
        #testname without spaces
        fuzztestNoSP.append(fuzz.ratio(cn,testnameNoSP))
    # force ratio list results into a numpy array
    # find max ratio and indicies where max ratio exists
    fuzztest = np.array(fuzztest)
    maxrat = np.max(fuzztest)
    indies = np.where(fuzztest == maxrat)
    fuzztestNoSP = np.array(fuzztestNoSP)
    maxratNoSP = np.max(fuzztestNoSP)
    indiesNoSP = np.where(fuzztestNoSP == maxratNoSP)
    print '\n\n\n\n\nREGULAR TEST'
    for i in indies[0]:
        print roadn[i]
    print '\n\n\n\n\nNO SPACE TEST'
    for i in indiesNoSP[0]:
        print roadn[i]
def get_twitter_facebook_google_id(url_instance, url_link):
    """Pull Twitter / Facebook / Google-Play identifiers out of raw page text.

    Scans the stringified `url_instance` for quoted http/ftp links, sorts
    them into the three platforms, strips each down to the identifier part
    of its path, and (for Twitter and Facebook) keeps only identifiers that
    fuzzy-match `url_link` above 75. Returns three lists:
    (twitter_ids, facebook_ids, google_ids).
    """
    twitter_base = 'twitter.com/'
    facebook_base = 'facebook.com/'
    google_base = 'play.google.com/store/apps/details?'
    request = url_instance
    links = re.findall('"((http|ftp)s?://.*?)"', str(request))
    twitter_IDs = []
    facebook_IDs = []
    google_IDs = []
    if len(links) == 0:
        return twitter_IDs, facebook_IDs, google_IDs
    # findall returns (url, scheme-suffix) tuples; bucket the full URL.
    for match in links:
        href = match[0]
        if twitter_base in href:
            twitter_IDs.append(href)
        elif facebook_base in href:
            facebook_IDs.append(href)
        elif google_base in href:
            google_IDs.append(href)
    # Keep only the path after the domain, then filter by similarity.
    twitter_tails = [href.split('.com/', 1)[1] for href in twitter_IDs]
    twitter_return = [ident for ident in twitter_tails
                      if fuzz.ratio(ident, url_link) > 75]
    facebook_tails = [href.split('.com/', 1)[1] for href in facebook_IDs]
    facebook_return = [ident for ident in facebook_tails
                       if fuzz.ratio(ident, url_link) > 75]
    # Google Play links are kept unfiltered, reduced to their query string.
    google_return = [href.split('.com/store/apps/details?', 1)[1]
                     for href in google_IDs]
    return twitter_return, facebook_return, google_return
def get(self, count):
    """Tornado handler: search Echo Nest for a song and return JSON.

    Queries the Echo Nest song/search API with the cleaned title+artist,
    re-scores each result with fuzzywuzzy against the requested title and
    artist, and writes up to `count` results scoring >= 90, best first.
    Any failure is swallowed and an empty list is returned.
    """
    self.set_header("Content-Type", "application/json")
    title, artist = self.get_argument("title"), self.get_argument("artist")
    title, artist = clean(title), clean(artist)
    url = "http://developer.echonest.com/api/v4/song/search?api_key=%s&format=json&results=100&artist=%s&combined=%s"
    origin = self.request.remote_ip
    # Forward the caller's IP so rate limiting applies to them, not us.
    reqHeader = {'X-Forwarded-For': origin}
    echoTracks = []
    try:
        # echoKeys is a key-rotation iterator (py2 .next()).
        reqUrl = url % (echoKeys.next(), quote_plus(artist), quote_plus(title + " " + artist))
        getReq = tornado.httpclient.HTTPRequest(reqUrl, headers=reqHeader)
        res = yield tornado.gen.Task(async_client.fetch, getReq)
        response = json.loads(res.body)['response']
        if len(response['songs']) > 0:
            results = response['songs']
            for r in results:
                # Combined title+artist similarity, rescaled to 0-100.
                score = fuzz.ratio(title, clean(r['title'])) + fuzz.ratio(artist, clean(r['artist_name']))
                r['score'] = (score/200.0)*100
            sorted_results = sorted(results, key=lambda r: r['score'])[::-1]
            echoTracks = [d for d in sorted_results if d['score'] >= 90][0:int(count)]
    except Exception as e:
        # Best-effort: any API/parse error yields an empty result list.
        pass
    finally:
        self.write(json.dumps(echoTracks))
        self.finish()
def fuzzName(src_table, dst_table, catid):
    """Match game names between two price tables and persist the pairs.

    Creates a timestamped result table, pairs exact name matches first,
    then fuzzy-matches the leftovers (ratio >= 80) — names containing
    digits must also agree on their last character — and finally adds
    price_difference and gain_percentage columns.

    Relies on module-level `my_cursor`, `db`, `fillResultTable`,
    `hasNumbers`.
    """
    table_name = "result_"+catid+"_"+time.strftime("%Y_%m_%d_%H_%M_%S")
    table_query = "CREATE TABLE "+table_name+" (src_game_name mediumtext, dst_game_name mediumtext, src_buying_price float, dst_selling_price float, has_number int, match_ratio int)"
    my_cursor.execute(table_query)
    src_query = 'select game_name, selling_price, buying_price from '+src_table
    my_cursor.execute(src_query)
    src_rows = my_cursor.fetchall()
    dst_query = 'select game_name, selling_price, buying_price from '+dst_table
    my_cursor.execute(dst_query)
    dst_rows = my_cursor.fetchall()
    src_names = [src_row[0].strip() for src_row in src_rows]
    dst_names = [dst_row[0].strip() for dst_row in dst_rows]
    # Exact matches first; they are recorded with ratio 100 and removed
    # from both pools.
    match = [src_name for src_name in src_names for dst_name in dst_names if src_name == dst_name]
    for m in match:
        fillResultTable(table_name, src_table, dst_table, m, m, '0', '100')
        if m in dst_names:
            dst_names.remove(m)
        if m in src_names:
            src_names.remove(m)
    # NOTE(review): the loops below remove entries from src_names/dst_names
    # while iterating over them, so some candidates are silently skipped —
    # confirm whether that is acceptable or iterate over copies instead.
    for src_name in src_names:
        if hasNumbers(src_name) == False:
            # Digit-free names: plain fuzzy match at >= 80.
            for dst_name in dst_names:
                if hasNumbers(dst_name) == False:
                    ratio = fuzz.ratio(src_name,dst_name)
                    if ratio >= 80:
                        print src_name+"--"+dst_name
                        fillResultTable(table_name, src_table, dst_table, src_name, dst_name, '0', ratio)
                        if dst_name in dst_names:
                            dst_names.remove(dst_name)
                        if src_name in src_names:
                            src_names.remove(src_name)
                        break
        else:
            # Names with digits (e.g. sequels): also require the trailing
            # character (the number) to match exactly.
            for dst_name in dst_names:
                ratio = fuzz.ratio(src_name,dst_name)
                if hasNumbers(src_name[-1]) and hasNumbers(dst_name[-1]):
                    if ratio >= 80 and src_name[-1] == dst_name[-1]:
                        print src_name+"--"+dst_name
                        fillResultTable(table_name, src_table, dst_table, src_name, dst_name, '1', ratio)
                        if dst_name in dst_names:
                            dst_names.remove(dst_name)
                        if src_name in src_names:
                            src_names.remove(src_name)
                        break
    db.commit()
    # Derived columns: absolute and percentage price gap.
    query = 'alter table '+table_name+' add column price_difference float'
    my_cursor.execute(query)
    query = 'update '+table_name+' set `price_difference` = (`src_buying_price`-`dst_selling_price`)'
    my_cursor.execute(query)
    query = 'alter table '+table_name+' add column gain_percentage int'
    my_cursor.execute(query)
    query = 'update '+table_name+' set `gain_percentage` = (`price_difference`/`dst_selling_price`)*100'
    my_cursor.execute(query)
    db.commit()
def best_ch_fuzz(title):
    """Return the key of module-level `ch_titles` most similar to `title`.

    Returns u'' when `ch_titles` is empty.

    FIX: fuzz.ratio was computed twice per key (comparison + assignment);
    it is now computed once.
    """
    highest_ratio = 0
    best_match = u''
    for key in ch_titles.keys():
        ratio = fuzz.ratio(title, key)
        if ratio > highest_ratio:
            best_match = key
            highest_ratio = ratio
    return best_match
def _is_fuzzy_match(s1, s2, threshold=90):
    """Return True when `s1` fuzzily matches `s2`.

    When `s2` is a dict, `s1` is matched against its values via
    process.extractOne with `threshold` as score cutoff; otherwise a
    plain fuzz.ratio must strictly exceed `threshold`.

    FIX: the ratio was computed once for a (now removed) debug print and
    then recomputed in the return; it is computed a single time.
    """
    if isinstance(s2, dict):
        best_match = process.extractOne(s1, s2.values(), score_cutoff=threshold)
        return True if best_match else False
    ratio = fuzz.ratio(s1, s2)
    return ratio > threshold
def highest_fuzz(input_list, input_item):
    """Return the element of `input_list` most similar to `input_item`.

    Returns u'' when the list is empty or every score is 0.

    FIX: fuzz.ratio was computed twice per element (comparison +
    assignment); it is now computed once.
    """
    highest_ratio = 0
    best_match = u''
    for item in input_list:
        ratio = fuzz.ratio(input_item, item)
        if ratio > highest_ratio:
            best_match = item
            highest_ratio = ratio
    return best_match
def get_sefaria_english_parsha(parsha_name):
    """Return the name in module-level `eng_parshiot` closest to `parsha_name`.

    Returns the integer 0 (the original sentinel) when nothing scores
    above 0 or the list is empty.

    FIX: fuzz.ratio was computed twice per candidate (comparison +
    assignment); it is now computed once.
    """
    highest_ratio = 0
    return_title = 0
    for sefaria_parsha_name in eng_parshiot:
        ratio = fuzz.ratio(parsha_name, sefaria_parsha_name)
        if ratio > highest_ratio:
            return_title = sefaria_parsha_name
            highest_ratio = ratio
    return return_title
def show_folders(folders, mdfind_results3, searchword, skiplist):
    """
    Print a colorized, de-duplicated folder listing for a search result.

    @type folders: list
    @type mdfind_results3: list
    @type searchword: str
    @type skiplist: list
    @return: None
    """
    folders2 = []
    # Only print when there are folders and the file hit list is small.
    if len(folders) > 0 and len(mdfind_results3) < 50:
        print()
        print("\033[91m[" + searchword + "] Folders:\033[0m")
        last = None
        nextcnt = 0
        for i in folders:
            nextcnt += 1
            skip = False
            skipi = i.lower()
            nexti = None
            try:
                # Peek at the following entry (None at the end).
                nexti = folders[nextcnt]
            except IndexError:
                pass
            # Drop entries containing any skiplist substring.
            for item in skiplist:
                item = item.lower()
                if item in skipi:
                    skip = True
            # Drop near-duplicates of the previous printed entry.
            # NOTE(review): `last` is never reassigned in this loop, so
            # this only ever compares against its initial None — confirm
            # whether it was meant to track the previous entry.
            if last and fuzz.ratio(i, last) > 85:
                skip = True
            # Short lists are always shown in full.
            if len(folders) < 10:
                skip = False
            if not skip:
                newi = ""
                if nexti:
                    if fuzz.ratio(i, nexti) > 90:
                        # Near-duplicate of the next entry: dim the dirname,
                        # highlight only the basename.
                        newi = "\033[90m" + str(os.path.dirname(i)) + "\033[34m/" + str(os.path.basename(i)) + "\033[0m"
                if newi == "":
                    newi = "\033[34m" + str(i) + "\033[0m"
                folders2.append(newi)
        # Deepest / longest paths first.
        folders2.sort(key=lambda x: (x.count("/"), len(x), x))
        folders2.reverse()
        for i in folders2:
            print(i)
def analysis():
    """Classify job titles into four seniority levels three ways and print them.

    For every job in the module-level `jobs` collection, computes:
      - design_probs: level from per-word probabilities stored in Mongo,
      - design_match: level from fuzzy title match against keyword lists,
      - design_exp:   level from min/max experience via prob_workex,
    then prints the three level indices alongside the raw title.
    """
    # Keyword lists for the four seniority buckets.
    MID_MANAGER_LEVEL = ['senior', 'sr']
    MANAGER_LEVEL = ['manager', 'lead', 'head', 'leader', 'gerente','specialist']
    DIRECTOR_LEVEL = ['director', 'partner', 'general', 'managing', 'gm', 'dgm', 'agm']
    BOARD_LEVEL = ['president','md','vice','vp', 'avp', 'entrepreneur', 'owner', 'proprietor', 'chairman', 'founder', 'board', 'chief', 'ceo', 'cto', 'cfo', 'coo', 'cro', 'cmo', 'cso', 'cio']
    mongo_server = MongoClient(['dev-mongo2.grownout.com:27017','dev-mongo1.grownout.com:27017'],replicaset='amoeba-mongo')
    designation_db = mongo_server['designation_database']
    designation_prob_collection = designation_db.prob_collection
    # word -> per-level probability document.
    designation_probs={}
    for designation in designation_prob_collection.find():
        designation_probs[designation['name']] = designation
    print designation_probs
    for k in jobs.find():
        # Normalize the title to bare alphanumeric words.
        designation=k['title'].replace("-"," ")
        designation=re.sub('[^0-9a-zA-Z]+', ' ', designation)
        designation_words=designation.split(" ")
        prob_designation=[0.0]*4
        prob_experience=[0.0]*4
        # Sum each word's per-level probability from the Mongo table.
        for word in designation_words:
            if word.lower() in designation_probs:
                prob_designation[0]+=float(designation_probs[word.lower()]['level_1'])
                prob_designation[1]+=float(designation_probs[word.lower()]['level_2'])
                prob_designation[2]+=float(designation_probs[word.lower()]['level_3'])
                prob_designation[3]+=float(designation_probs[word.lower()]['level_4'])
        design_probs = prob_designation.index(max(prob_designation))
        # Fuzzy-match the whole title against each bucket's keywords.
        match_designation=[0.0]*4
        for j in MID_MANAGER_LEVEL:
            match_designation[0]=max(fuzz.ratio(designation,j),match_designation[0])
        for j in MANAGER_LEVEL:
            match_designation[1]=max(fuzz.ratio(designation,j),match_designation[1])
        for j in DIRECTOR_LEVEL:
            match_designation[2]=max(fuzz.ratio(designation,j),match_designation[2])
        for j in BOARD_LEVEL:
            match_designation[3]=max(fuzz.ratio(designation,j),match_designation[3])
        # Blend min/max experience probabilities (min weighted 0.8).
        min_exp_probs=prob_workex(float(k['experience']['min']))
        max_exp_probs=prob_workex(float(k['experience']['max']))
        for i in range(4):
            prob_experience[i]+=(((min_exp_probs[i])*0.8)+((max_exp_probs[i])))/1.8
        design_exp=prob_experience.index(max(prob_experience))
        design_match=match_designation.index(max(match_designation))
        print design_match,":",design_probs,":",design_exp," ",k['title']
def compare_strings(string_one, string_two):
    """Return the best of three fuzzywuzzy similarity scores (0-100).

    Scores `string_one` against `string_two` with ratio, token_sort_ratio
    and token_set_ratio and returns the maximum.

    FIX: each scorer was previously invoked twice (once to compare, once
    to assign); each is now computed exactly once via max().
    """
    return max(
        fuzz.ratio(string_one, string_two),
        fuzz.token_sort_ratio(string_one, string_two),
        fuzz.token_set_ratio(string_one, string_two),
    )
def OnChanMsg(self, nick, channel, message):
    """ZNC hook: track !keep/!approve requests in #xshellz and flag abuse.

    Buffers !keep/!approve commands per nick; when the xinfo bot confirms
    one, records the requester's nick/host under the username and reports
    suspicious overlaps (shared nicks, shared or similar hosts, similar
    usernames) via svreport.

    FIXES applied:
      - `fuxx.ratio` -> `fuzz.ratio` (NameError on the host-fuzzy branch);
      - final `znc.CONTIUE` -> `znc.CONTINUE` (AttributeError on return);
      - `got` is initialized before the loop (previously could be read
        unbound when the first candidate had no matches);
      - `self.nd['user']` (string literal) -> `self.nd[user]`, matching
        the nick append two lines earlier.
    """
    if not channel.GetName() == "#xshellz":
        return znc.CONTINUE
    msg = str(message)
    msg = msg.split(" ")
    cmd = msg[0].lower()
    nickn = nick.GetNick()
    try:
        username = msg[1].lower()
    except IndexError:
        # Command without a username argument — ignore.
        return znc.CONTINUE
    if cmd == '!keep' or cmd == '!approve':
        # Remember who asked, keyed by their nick, until xinfo confirms.
        self.buffer[nickn] = dict(ni=nick, user=username)
    else:
        if not nickn == 'xinfo':
            return znc.CONTINUE
        # NOTE(review): msg is a LIST here; appr/kp.search presumably
        # accept that (or expect the raw string) — confirm.
        res = self.appr.search(msg)
        if not res:
            res = self.kp.search(msg)
        if not res:
            return znc.CONTINUE
        nck = res.group(1)
        user = res.group(2)
        nd = self.buffer.get(nck, None)
        if nd:
            self.buffer.pop(nck, None)
            if user == nd['user']:
                nu = nd['ni']
                self.nd[user]['nick'].append(nck)
                host = nu.GetHost()
                if host == 'shell.xshellz.com':
                    host = None
                # FIX: was self.nd['user'] — a literal key.
                self.nd[user]['hosts'].append(host)
                got = False  # FIX: previously could be unbound at `if got:`
                for z, x in self.nd.items():
                    if z == user:
                        continue
                    if nck in x['nick']:
                        self.svreport("{0} is in the nick list for {1} ({2}) but user requested is {3} !att-nick-match".format(nck, z, " ,".join(x['nick']), user))
                        got = True
                    else:
                        for y in x['nick']:
                            if fuzz.ratio(y, host) >= 50:
                                self.svreport("{0} is a fuzzy match against {1}'s nick {2}, but the user is {3} !att-nick-fuzzy-match".format(nck, z, y, user))
                    if host in x['hosts']:
                        got = True
                        self.svreport("{0}'s host ({4}) is in the host list for {1} ({2}) but user requested is {3} !att-host-match".format(nck, z, " ,".join(x['hosts']), user, host))
                    else:
                        for y in x['hosts']:
                            # FIX: was fuxx.ratio (undefined name).
                            if fuzz.ratio(y, host) >= 95:
                                self.svreport("{0} is a fuzzy match against {1}'s host {2}, but the user is {3} !att-host-fuzzy-match".format(host, z, y, user))
                    if fuzz.ratio(z, user) >= 50:
                        self.svreport("{0} is a fuzzy match against {1} !att-user-fuzzy-match".format(z, user))
                    if got:
                        break
                self.write()
    # FIX: was znc.CONTIUE.
    return znc.CONTINUE
def print_list(self, match=0):
    """Write the names of non-KeychainItem entries in self._items to stdout.

    match -- 0 (default) prints everything; otherwise a string: keys are
             printed when they fuzzy-match above 50, or above 25 while
             also matching the substring regex. NOTE(review): the default
             0 is an int but is concatenated into a regex when truthy —
             callers are assumed to pass a str; confirm.
    """
    for key, value in self._items.iteritems():
        if not type(value).__name__ == "KeychainItem":
            if not match:
                self.stdout.write("%s\n" % key)
            else:
                regex = re.compile(".*"+match+".*")
                if (fuzz.ratio(match, key) > 50) or (fuzz.ratio(match, key) > 25 and regex.match(key)):
                    self.stdout.write("%s\n" % key)
    return
def validate(document, parsed_query):
    """Check a document against every required match field of a query.

    For each feature in parsed_query["required_match_field"], first looks
    for the raw value(s) in the document's raw content or extracted text;
    failing that, runs the feature extractor over the document (raw then
    extracted) and accepts fuzzy matches >= 80 (phone numbers must match
    exactly after stripping non-digits). Returns False at the first
    feature that cannot be validated, True otherwise.

    FIX: the state-name check read `state_abbr_dic[location_field].lower`
    without calling it, so a bound method object was tested with `in`
    against a string (TypeError whenever that branch was reached); the
    call parentheses are restored.
    """
    # Need to write
    raw_content = extraction.get_raw_content(document)
    extract_text = extraction.get_text(document)
    matchword = parsed_query["required_match_field"]
    # Validation fields (lowercased once for substring checks).
    lower_raw_content = raw_content.lower()
    lower_extract_text = extract_text.lower()
    for feature in matchword:
        isValid = False
        # Check first if the feature value is present verbatim in the text.
        if type(matchword[feature]) is list:
            for value in matchword[feature]:
                # Either in the raw_content or extract_text counts as valid.
                if value.lower() in lower_raw_content or value.lower() in lower_extract_text:
                    isValid = True
                    break
        else:
            if feature == "location":
                # Special case: location has two fields (city and state).
                location_fields = [field.strip() for field in matchword[feature].split(",")]
                for location_field in location_fields:
                    if location_field in state_abbr_dic:
                        # State abbreviations are matched case-sensitively,
                        # bounded by non-letters.
                        state_pattern = r"(?:[^A-Za-z])("+location_field+")(?:[^A-Za-z])"
                        if re.search(state_pattern,raw_content) or re.search(state_pattern,extract_text) or state_abbr_dic[location_field].lower() in lower_extract_text or state_abbr_dic[location_field].lower() in lower_raw_content:
                            isValid = True
                    else:
                        if location_field.lower() in lower_raw_content or location_field.lower() in lower_extract_text:
                            isValid = True
            else:
                if matchword[feature].lower() in lower_raw_content or matchword[feature].lower() in lower_extract_text:
                    isValid = True
        if isValid:
            continue
        # If the document does not contain the raw string, run the
        # extractors and compare their results instead.
        results = extraction.functionDic[feature](document,True)
        for result in results:
            if feature == "phone":
                # Phone numbers must match exactly (digits only).
                if result == re.sub("\D","",matchword[feature]):
                    isValid = True
            else:
                # Other features tolerate minor differences.
                if fuzz.ratio(str(result),matchword[feature])>=80:
                    isValid = True
                    break
        if extract_text:
            results = extraction.functionDic[feature](document,False)
            for result in results:
                if feature == "phone":
                    if result == re.sub("\D","",matchword[feature]):
                        isValid = True
                else:
                    if fuzz.ratio(str(result),matchword[feature])>=80:
                        isValid = True
                        break
        if isValid:
            continue
        else:
            return False
    return True
def get_article_groups(article_list, fuzz_len=3000, min_ratio=70):
    """
    Groups duplicate articles in a list of articles.

    arguments:
        article_list: list of articles; each article's text content is
            expected at index 3 (article[3]).
        fuzz_len (int, default=3000): number of characters at the
            beginning/end of an article to compare against using fuzz.
        min_ratio (int, default=70): minimum fuzz score (as percentage)
            above which two articles are considered duplicates.

    returns:
        list of groups, where each group is a list of articles that were
        judged duplicates of each other. A single-article input returns
        [article_list] unchanged.
    """
    if len(article_list) == 1:
        return [article_list]
    else:
        # rep_articles holds one representative per group; rep_ids[j] is
        # the group index assigned to article_list[j].
        rep_articles = []
        rep_ids = []
        rep_num = -1
        fuzzratio = 0
        for article in article_list:
            a_content = article[3]
            a_len = len(a_content)
            matched = False
            for i in range(len(rep_articles)):
                rep_content = rep_articles[i][3]
                rep_len = len(rep_content)
                minlen = min(fuzz_len, a_len, rep_len)
                fuzzratio = fuzz.ratio(rep_content[:minlen], a_content[:minlen])
                # we compare the beginnings and ends of both articles using fuzz.
                # it's possible that some of these checks are mostly useless; this is slow.
                # but for deduplication I've chosen to be careful.
                if fuzzratio < min_ratio:
                    fuzzratio = fuzz.ratio(rep_content[-minlen:], a_content[-minlen:])
                if fuzzratio < min_ratio:
                    fuzzratio = fuzz.ratio(rep_content[-minlen:], a_content[:minlen])
                if fuzzratio < min_ratio:
                    fuzzratio = fuzz.ratio(a_content[-minlen:], rep_content[:minlen])
                if fuzzratio >= min_ratio:
                    rep_ids.append(i)
                    matched = True
                    break
            if not matched:
                # No existing group matched: start a new one with this
                # article as its representative.
                rep_num += 1
                rep_ids.append(rep_num)
                rep_articles.append(article)
        groups = []
        # assign duplicate article groups to each article
        for i in range(len(rep_articles)):
            groups.append([article_list[j] for j in range(len(article_list)) if rep_ids[j] == i])
        return groups
def fuzz_roadnames_num_test(roadn): fuzztestNUM = [] for cn in roadn: fuzztestNUM.append(fuzz.ratio(re.findall("\d+",cn),testnameNUM)) fuzztestNUM.append(fuzz.ratio(cn,testnameNUM)) fuzztestNUM = np.array(fuzztestNUM) maxratNUM = np.max(fuzztestNUM) indiesNUM = np.where(fuzztestNUM == maxratNUM) print '\n\n\n\n\nNUM TEST' for i in indiesNUM[0]: print roadn[i]
def xlparse(filepath, classname):
    """Parse a quarterly-income workbook and build a quarterlyincome object.

    Reads sheet 1 (consolidated statement of income), walks columns A
    (labels) and B (values) in lockstep, fuzzy-matches the labels against
    known "net revenue" / "net income" spellings, and returns
    quarterlyincome(netrevenue, netincome).

    NOTE(review): if neither label matches above 70, netrevenue/netincome
    are never assigned and the constructor call raises NameError — confirm
    whether a default is needed.
    """
    wb = openpyxl.load_workbook(filepath)
    sheetnames = wb.get_sheet_names()  # getting list of all sheetnames
    # section for dealing with sheet 1 which is consolidated statement of income
    sheet = wb.get_sheet_by_name(sheetnames[1])
    cola = []  # create new list to store all values in column A
    colb = []
    for cellobj in sheet.columns[0]:
        # append column-A values so we can extract info out of them
        cola.append(cellobj.value)
    for cellobj in sheet.columns[1]:
        # column B holds the numeric values for the column-A labels
        colb.append(cellobj.value)
    # extracting information on millions vs thousands from cell A1, e.g.
    # 'Consolidated Condensed Statements of Income - USD ($) shares in Millions, $ in Millions'
    sharedeno = re.findall('shares in ([a-zA-Z]+)',cola[0])
    dollardeno = re.findall('\$ in ([a-zA-Z]+)',cola[0])
    # note the use of '\' to match a literal $ rather than end-of-line
    # Income-statement identities, for reference:
    #   net revenue - cost of sales = gross margin
    #   gross margin - operating expenses = operating income
    #   income before taxes - provision for taxes = net income
    #   net income / (basic|diluted) shares = earnings per share
    # Keep expanding these label lists as more company variants appear.
    netrevenuelist = ['Net sales','Netsales','netsales','Net revenue','netrevenue']
    netincomelist = ['Net income','netincome']
    logger.debug('%s,%s',cola,colb)
    # Loop through labels and values in lockstep (zip may not be the most
    # time-efficient approach; an index loop is an alternative).
    for name,value in zip(cola,colb):
        for entry in netrevenuelist:
            if fuzz.ratio(entry,name) > 70:  # TODO Add a check for blank cells
                logger.info('%s,%s',name,fuzz.ratio(entry,name))
                netrevenue = value
                break
        for entry in netincomelist:
            if fuzz.ratio(entry,name) > 70:  # TODO Add a check for blank cells
                logger.info('%s,%s',name,fuzz.ratio(entry,name))
                netincome = value
                break
    classname = quarterlyincome(netrevenue,netincome)  # creating a new class with income values
    return classname
def parse_track(spotify, line):
    """Resolve a free-text line like "Song by Artist" to a Spotify track.

    Normalizes common separators out of the line, searches Spotify
    (excluding karaoke versions), builds candidate strings in both
    "artist name" and "name artist" order, picks the closest candidate
    with process.extractOne, and returns the track dict only when the
    original line still fuzzy-matches it above 50; otherwise None.
    """
    search_text = line
    # Collapse a single " by " / " - " / "-" separator into a space.
    if search_text.count(" by ") == 1:
        search_text = search_text.replace(" by ", " ")
    if search_text.count(" - ") == 1:
        search_text = search_text.replace(" - ", " ")
    if search_text.count("-") == 1:
        search_text = search_text.replace("-", " ")
    if search_text.strip() == "":
        return None
    log(" Searching for " + search_text + " AND NOT Karaoke..", 3)
    try:
        spotify_login()
        results = spotify.search(search_text + " AND NOT Karaoke", limit=50, type='track')
    except Exception as err:
        log("Error searching for track", 1)
        log(str(err), 1)
        return None
    log(" Searching for track finished", 3)
    items = results['tracks']['items']
    choices = []
    track_hash = {}
    if len(items) > 0:
        # Index every track under both "artist title" and "title artist"
        # so word order in the input line doesn't matter.
        for t in items:
            log(" Appending choice: " + t['artists'][0]['name'] + " " + t['name'], 3)
            choices.append(t['artists'][0]['name'] + " " + t['name'])
            track_hash[t['artists'][0]['name'] + " " + t['name']] = t
            log(" Appending choice: " + t['name'] + " " + t['artists'][0]['name'], 3)
            choices.append(t['name'] + " " + t['artists'][0]['name'])
            track_hash[t['name'] + " " + t['artists'][0]['name']] = t
        best_track = process.extractOne(search_text, choices)
        best_t = track_hash[best_track[0]]
        log(" Closest match: " + best_track[0] + " (" + str(best_track[1]) + ")" + " for comment [" + search_text + "]", 3)
        # Final sanity check against the ORIGINAL line, not search_text.
        if fuzz.ratio(line, best_track[0]) > 50:
            log(" Returning track " + best_t['name'] + " for comment [" + line + "]", 2)
            return best_t
        else:
            log(" Fuzz ratio discarding '" + best_track[0] + "' with score: " + str(fuzz.ratio(line, best_track[0])), 3)
    return None
def compare_two_texts(self, string_a, string_b, normalize_value=True):
    """
    Compare two strings with the Simple Ratio algorithm.

    Both arguments must be of the same string type (both unicode or both
    str), otherwise TypeError is raised. When normalize_value is True the
    0-100 score is normalized into the 0-1 range.
    """
    both_unicode = isinstance(string_a, unicode) and isinstance(string_b, unicode)
    both_str = isinstance(string_a, str) and isinstance(string_b, str)
    if not (both_unicode or both_str):
        raise TypeError
    score = fuzz.ratio(string_a, string_b)
    if normalize_value:
        return self.__normalized_value(score)
    return score
def get_spotify_id(t):
    """Look up a (artist, track) pair on the Spotify search API.

    t -- sequence where t[0] is the artist and t[1] is the track title
         (assumed lowercase, since API names are lowercased before
         comparison — TODO confirm callers pass lowercase).

    Returns the Spotify track id of the first result whose artist and
    track names both fuzzy-match above/at 60; raises ValueError otherwise.

    FIX: the artist and track were interpolated into the query string
    unescaped, so names containing spaces, '&' or '#' produced an invalid
    or truncated request; both values are now percent-encoded.
    Also removed the unused PopId namedtuple and popid_list locals.
    """
    from urllib.parse import quote_plus
    artist = t[0]
    track = t[1]
    url = "https://api.spotify.com/v1/search?q=%s+artist:%s&type=track&market=US&" % (quote_plus(track), quote_plus(artist))
    r = urllib.request.urlopen(url)
    data = json.load(r)
    for i in range(0, len(data["tracks"]["items"])):
        if fuzz.ratio(artist, data["tracks"]["items"][i]["artists"][0]["name"].lower()) > 60:
            if fuzz.ratio(track, data["tracks"]["items"][i]["name"].lower()) >= 60:
                s_id = data["tracks"]["items"][i]["id"]
                return s_id
    raise ValueError('track not found')
def response_correct(response, answer):
    """Return True when *response* fuzzily matches *answer* (best score > 70).

    Scores the filtered response against the filtered answer and its
    bracket-stripped form, both token-sorted and with all whitespace removed.
    """
    resp = filter_words(response)
    ans = filter_words(answer)
    ans_bare = strip_brackets(ans)

    def squash(text):
        # Whitespace-free variant for the plain-ratio comparisons.
        return text.replace(" ", "")

    scores = (
        fuzz.token_sort_ratio(resp, ans),
        fuzz.token_sort_ratio(resp, ans_bare),
        fuzz.ratio(squash(resp), squash(ans)),
        fuzz.ratio(squash(resp), squash(ans_bare)),
    )
    return max(scores) > 70
# print("Both O and X") # print("Average Precision:") # print("{0:<50f}{1:<50f}".format(ap, ap_old)) # past_avep += ap # past_avep_old += ap_old # Coverage Scoring all_recommendation_set = all_recommendation_set | set( top_5_recommendation_list) if query_item not in seen_query: seen_query.add(query_item) total_rec_num += len(top_5_recommendation_list) # Edit Distance Scoring edit_distance_query_to_rec = \ [fuzz.ratio(item_detail_map[query_item], item_detail_map[rec_item]) for rec_item in top_5_recommendation_list] ave_edit_distance_query_to_rec = sum( edit_distance_query_to_rec) / len(edit_distance_query_to_rec) total_ave_distance_query_to_rec += ave_edit_distance_query_to_rec print('=' * 20) print("Levenshtein Distance:") print("Average Distance (query to rec): {}".format( ave_edit_distance_query_to_rec)) edit_distance_rec_to_rec = \ [fuzz.ratio(item_detail_map[rec_item_first], item_detail_map[rec_item_second])\ for rec_item_first, rec_item_second in combinations(top_5_recommendation_list, 2)] ave_edit_distance_rec_to_rec = sum(edit_distance_rec_to_rec) / len( edit_distance_rec_to_rec) total_ave_distance_rec_to_rec += ave_edit_distance_rec_to_rec print("Average Distance (rec to rec): {}".format( ave_edit_distance_rec_to_rec))
def init_agents(population, string_length):
    """Create the initial population of random agents.

    BUG FIX: the original closed the parenthesis in the wrong place --
    ``Agent(string_length for _ in xrange(population))`` built ONE Agent
    around a generator instead of *population* Agents.
    """
    return [Agent(string_length) for _ in xrange(population)]


def fitness(agents):
    """Score each agent by fuzzy similarity of its string to the target in_str."""
    for agent in agents:
        # BUG FIX: the original wrote "agent,fitness = ..." (a tuple unpack
        # into two throwaway names) instead of assigning the attribute.
        agent.fitness = fuzz.ratio(agent.string, in_str)
    return agents


def selection(agents):
    """Sort agents by fitness (best first) and keep the top half."""
    agents = sorted(agents, key=lambda agent: agent.fitness, reverse=True)
    print(''.join(map(str, agents)))
    # I want half of the best agents to live.
    agents = agents[:int(0.5 * len(agents))]
    # BUG FIX: the original never returned the survivors (implicit None).
    return agents


def crossover(agents):
    """Refill the population by single-point crossover of random survivor pairs.

    Mixes the qualities of the current survivors into the next generation;
    since the fittest parents were just selected, the population average
    should improve.
    """
    babies = []
    # One pair of children per iteration, until the population is refilled.
    for _ in xrange((population - len(agents)) / 2):
        parent1 = random.choice(agents)
        parent2 = random.choice(agents)
        child1 = Agent(in_str_len)
        child2 = Agent(in_str_len)
        # Random split point for the single-point crossover.
        split = random.randint(0, in_str_len)
        child1.string = parent1.string[0:split] + parent2.string[split:in_str_len]
        child2.string = parent2.string[0:split] + parent1.string[split:in_str_len]
        babies.append(child1)
        babies.append(child2)
    agents.extend(babies)
    return agents


def mutation(agents):
    """Mutate each character of each agent's string with 10% probability."""
    for agent in agents:
        # BUG FIXES: "enemerate" typo, and the loop variable was named "ix"
        # while the body indexed with "idx".
        for idx, param in enumerate(agent.string):
            if random.uniform(0.0, 1.0) <= 0.1:
                agent.string = (agent.string[0:idx]
                                + random.choice(string.letters)
                                + agent.string[idx + 1:in_str_len])
    return agents


if __name__ == '__main__':
    in_str = 'AndyMiller'
    in_str_len = len(in_str)
    genetic_algorithm()
# NOTE(review): fragment of a pandas address-matching script.  master_data
# and new_data are DataFrames built outside this excerpt, and "i" on the
# first line comes from an enclosing loop over columns -- confirm against
# the full script.
master_data["Final_String_exist"]=master_data["Final_String_exist"]+" "+master_data[i].astype(str)
print(master_data["Final_String_exist"][0])
new_data["NAD Key"]=""
#output=pd.DataFrame()
#new_data["Final_String_new"]=new_data["Final_String_new"]
new_data["Final_String_exist"]=""
new_data["NAD Key"]=np.NaN
new_data["F_Ratio"]=np.NaN
new_data["Indices"]=np.NaN
output=pd.DataFrame()
output=new_data
count=1
# O(n*m) fuzzy comparison of every new row against every master row.
for i in range(0,len(new_data)):
    for j in range(0,len(master_data)):
        f_ratio=fuzz.ratio(new_data["Final_String_new"][i],master_data["Final_String_exist"][j])
        print(f_ratio)
        if f_ratio > 60:
#            new_data.loc[new_data.index[i],"Final_String_exist"]=master_data["Final_String_exist"][j]
            # NOTE(review): "is np.NaN" is an identity test -- it only works
            # because the column was initialised with that exact object above;
            # pd.isna() would be the robust check.
            if new_data.loc[new_data.index[i],"NAD Key"] is np.NaN:
                # First match for this row: fill it in place.
                new_data.loc[new_data.index[i],"F_Ratio"]=f_ratio
                new_data.loc[new_data.index[i],"NAD Key"]=master_data["NAD Key"][j]
                new_data.loc[new_data.index[i],"Final_String_exist"]=master_data["Final_String_exist"][j]
                new_data.loc[new_data.index[i],"Indices"]=i
            else:
                # Additional match: append a duplicate row for the same index.
                new_data=new_data.append({"NAD Key":master_data["NAD Key"][j],"F_Ratio":f_ratio,"Final_String_exist":master_data["Final_String_exist"][j],"Final_String_new":new_data["Final_String_new"][i],'Primise':new_data["Primise"][i], 'ThoroughFare No':new_data["ThoroughFare No"][i], 'ThoroughFare':new_data['ThoroughFare'][i], 'ThoroughFare Name':new_data['ThoroughFare Name'][i], 'Town':new_data['Town'][i], 'Postoutcode':new_data['Postoutcode'][i], 'PostIncode':new_data['PostIncode'][i], 'Country':new_data['Country'][i],'Indices':i},ignore_index=True)
#                new_data.loc[new_data.index[len(new_data)+1],"Final_String_exist"]=master_data["Final_String_exist"][j]
def load_clue(self, coords):
    """Run one Jeopardy clue selected by board coordinates.

    coords: two characters -- a column letter 'a'..'f' and a 1-based row
    digit.  Handles the daily-double wager flow and the normal
    guess/show/cancel flow, fuzzy-grades the guess, updates player points,
    deactivates the clue, and redraws the board (or the interstitial when
    no clues remain).

    NOTE(review): this is reconstructed from a whitespace-collapsed
    source; branch nesting (notably the placement of the 's' handler and
    the trailing bookkeeping) should be confirmed against the original.
    """
    # Invalid input falls through to redrawing the board.
    if len(coords) != 2:
        self.display_board()
    c = coords[0].lower()
    r = coords[1].lower()
    if c not in ['a', 'b', 'c', 'd', 'e', 'f']:
        self.display_board()
    current_round = self.rounds[self.current_round]
    # Categories are addressed by letter in sorted-key order.
    category = sorted(list(current_round.keys()))[ord(c) - ord('a')]
    # Collect every dollar value present in this round.
    all_values = set()
    for cat in current_round:
        for v in current_round[cat]:
            all_values.add(v)
    # NOTE(review): membership is tested on int(r)-1 but the sorted list is
    # indexed with the same expression -- looks inconsistent; verify.
    if (int(r) - 1) in all_values:
        val = sorted(all_values)[int(r) - 1]
    else:
        return self.display_board()
    if val not in current_round[category]:
        self.display_board()
    clue = current_round[category][val]
    if not clue['active']:
        self.display_board()
    # Human-readable "Name (idx)" labels for the player prompt.
    players = []
    for idx in range(1, len(self.players.keys()) + 1):
        players.append(f'{self.players[idx]["name"]} ({idx})')
    self.clear_screen()
    print('\n\n\n\n\n\n\n\n\n\n\n\n')
    if clue['daily_double']:
        # Daily double: pick a player, take a wager, grade a single guess.
        self.print_centered('Daily Double!')
        self.print_centered(category)
        self.print_centered('\n\n\n')
        player = ''
        while player not in self.players:
            player = self.prompt_centered(f'Who\'s guessing? {"; ".join(players)}', suffix='\n ')
            if player.isdigit():
                player = int(player)
            elif player == '':
                return
            if f' ({player})' not in ';'.join(players):
                player = ''
                continue
        self.print_centered('\n\n\n')
        wager = self.prompt_centered(
            f'How much are you wagering, {self.players[player]["name"]}?', suffix='\n '
        )
        self.print_centered(clue['text'], wrap=True)
        self.print_centered('\n\n')
        answer = clue['answer'].lower()
        guess = self.prompt_centered('What is ', suffix='\n ').lower()
        choice = ''
        print('\n\n')
        # Four fuzz scores; all four at 100 means an exact match.
        scores = [
            fuzz.ratio(guess, answer),
            fuzz.partial_ratio(guess, answer),
            fuzz.token_sort_ratio(guess, answer),
            fuzz.token_set_ratio(guess, answer)
        ]
        if min(scores) == 100:
            self.print_centered('Correct. The official answer is:')
            choice = 's'
        elif max(scores) >= 75 and min(scores) >= 50:
            self.print_centered('Likely correct')
        elif max(scores) >= 50 and min(scores) >= 35:
            self.print_centered('Not far off')
        else:
            self.print_centered('Unlikely to be right')
        input()
        print('\n\n')
        self.print_centered(textwrap.fill(f'What is {clue["answer"]}?'))
        # Host adjudicates; points move by the wagered amount.
        correct = ''
        while correct.lower() not in ['y', 'n']:
            correct = self.prompt_centered(f'Was {self.players[player]["name"]} right? Y/N', suffix='\n ')
        if correct.lower() == 'y':
            self.players[player]['points'] += int(wager)
        else:
            self.players[player]['points'] -= int(wager)
        self.rounds[self.current_round][category][val]['active'] = False
    else:
        # Normal clue: guess / show / cancel loop.
        self.print_centered(f'{category} for ${val}')
        self.print_centered('\n\n')
        self.print_centered(clue['text'], wrap=True)
        self.print_centered('\n\n')
        choice = ''
        while choice.lower() not in ['g', 's', 'c']:
            choice = self.prompt_centered('(G)uess, (S)how, (C)ancel', suffix='\n ')
            if choice.lower() == 'g':
                answer = clue['answer'].lower()
                guess = self.prompt_centered('What is ', suffix='\n ').lower()
                choice = ''
                print('\n\n')
                scores = [
                    fuzz.ratio(guess, answer),
                    fuzz.partial_ratio(guess, answer),
                    fuzz.token_sort_ratio(guess, answer),
                    fuzz.token_set_ratio(guess, answer)
                ]
                if min(scores) == 100:
                    # Perfect match auto-advances to showing the answer.
                    self.print_centered('Correct. The official answer is:')
                    choice = 's'
                elif max(scores) >= 75 and min(scores) >= 50:
                    self.print_centered('Likely correct')
                elif max(scores) >= 50 and min(scores) >= 35:
                    self.print_centered('Not far off')
                else:
                    self.print_centered('Unlikely to be right')
        if choice.lower() == 's':
            self.print_centered(textwrap.fill(f'What is {clue["answer"]}?'))
            self.record_guess_result(val, players)
        self.rounds[self.current_round][category][val]['active'] = False
    # Any clue still active? If not, move to the interstitial screen.
    active_clues = False
    for category in self.rounds[self.current_round]:
        for val in self.rounds[self.current_round][category]:
            if self.rounds[self.current_round][category].get(val, {}).get('active', False):
                active_clues = True
    if active_clues:
        self.display_board()
    else:
        self.display_interstitial()
def get_representatives(self, word, representatives, threshold=70):
    """Return the candidates whose fuzz.ratio against *word* meets *threshold*.

    threshold is the minimum 0-100 similarity score (default 70).
    """
    matches = []
    for candidate in representatives:
        if fuzz.ratio(word, candidate) >= threshold:
            matches.append(candidate)
    return matches
def find_component_match(self, title, body, template_data):
    '''Make a list of matching files for arbitrary text in an issue'''
    # Matching strategies, tried in order until one yields results:
    #   1. parse a Python traceback out of the body,
    #   2. exact component-map lookup on component_raw,
    #   3. n-gram component-map lookup (largest n-grams first),
    #   4. fuzzy (>=90) component-map lookup per line,
    #   5. heuristic match against repo file paths.
    # Example traceback inputs:
    # DistributionNotFound: The 'jinja2<2.9' distribution was not found and
    # is required by ansible
    # File
    # "/usr/lib/python2.7/site-packages/ansible/plugins/callback/foreman.py",
    # line 30, in <module>
    STOPWORDS = [u'ansible', u'core', u'plugin']
    STOPCHARS = [u'"', u"'", u'(', u')', u'?', u'*', u'`', u',']
    matches = []
    if u'Traceback (most recent call last)' in body:
        lines = body.split(u'\n')
        for line in lines:
            line = line.strip()
            if line.startswith(u'DistributionNotFound'):
                # Dependency failure: blame setup.py.
                matches = [u'setup.py']
                break
            elif line.startswith(u'File'):
                # Normalize the "File "/path/to/x.py", line N" path into a
                # repo-relative file name.
                fn = line.split()[1]
                for SC in STOPCHARS:
                    fn = fn.replace(SC, u'')
                if u'ansible_module_' in fn:
                    fn = os.path.basename(fn)
                    fn = fn.replace(u'ansible_module_', u'')
                    matches = [fn]
                elif u'cli/playbook.py' in fn:
                    fn = u'lib/ansible/cli/playbook.py'
                elif u'module_utils' in fn:
                    idx = fn.find(u'module_utils/')
                    fn = u'lib/ansible/' + fn[idx:]
                elif u'ansible/' in fn:
                    idx = fn.find(u'ansible/')
                    fn1 = fn[idx:]
                    if u'bin/' in fn1:
                        if not fn1.startswith(u'bin'):
                            idx = fn1.find(u'bin/')
                            fn1 = fn1[idx:]
                        if fn1.endswith(u'.py'):
                            # NOTE(review): rstrip strips characters, not a
                            # suffix -- "copy.py" would lose trailing y/p.
                            fn1 = fn1.rstrip(u'.py')
                    elif u'cli/' in fn1:
                        idx = fn1.find(u'cli/')
                        fn1 = fn1[idx:]
                        fn1 = u'lib/ansible/' + fn1
                    elif u'lib' not in fn1:
                        fn1 = u'lib/' + fn1
                    if fn1 not in self.files:
                        # Unknown file: drop into the debugger when enabled.
                        if C.DEFAULT_BREAKPOINTS:
                            logging.error(u'breakpoint!')
                            import epdb
                            epdb.st()
    if matches:
        return matches
    craws = template_data.get(u'component_raw')
    if craws is None:
        return matches
    # compare to component mapping
    matches = self._string_to_cmap_key(craws)
    if matches:
        return matches
    # do not re-process the same strings over and over again
    if craws.lower() in self.match_cache:
        return self.match_cache[craws.lower()]
    # make ngrams from largest to smallest and recheck
    blob = TextBlob(craws.lower())
    wordcount = len(blob.tokens) + 1
    for ng_size in reversed(xrange(2, wordcount)):
        ngrams = [u' '.join(x) for x in blob.ngrams(ng_size)]
        for ng in ngrams:
            matches = self._string_to_cmap_key(ng)
            if matches:
                self.match_cache[craws.lower()] = matches
                return matches
    # https://pypi.python.org/pypi/fuzzywuzzy
    # Fuzzy-match each line of component_raw against the component map keys;
    # accept the best key when its ratio is >= 90.
    matches = []
    for cr in craws.lower().split(u'\n'):
        ratios = []
        for k in self.CMAP.keys():
            ratio = fw_fuzz.ratio(cr, k)
            ratios.append((ratio, k))
        ratios = sorted(ratios, key=lambda tup: tup[0])
        if ratios[-1][0] >= 90:
            cnames = self.CMAP[ratios[-1][1]]
            matches += cnames
    if matches:
        self.match_cache[craws.lower()] = matches
        return matches
    # try to match to repo files
    if craws:
        clines = craws.split(u'\n')
        for craw in clines:
            # Tokenize, strip punctuation and stopwords, and turn each token
            # into a path fragment ("/token") to test path suffixes against.
            cparts = craw.replace(u'-', u' ')
            cparts = cparts.split()
            for idx, x in enumerate(cparts):
                for SC in STOPCHARS:
                    if SC in x:
                        x = x.replace(SC, u'')
                for SW in STOPWORDS:
                    if x == SW:
                        x = u''
                if x and u'/' not in x:
                    x = u'/' + x
                cparts[idx] = x
            cparts = [x.strip() for x in cparts if x.strip()]
            for x in cparts:
                for f in self.files:
                    # Skip file classes that need corroborating context in
                    # the issue body (tests, galaxy, inventory, contrib).
                    if u'/modules/' in f:
                        continue
                    if u'test/' in f and u'test' not in craw:
                        continue
                    if u'galaxy' in f and u'galaxy' not in body:
                        continue
                    if u'dynamic inv' in body.lower() and u'contrib' not in f:
                        continue
                    if u'inventory' in f and u'inventory' not in body.lower():
                        continue
                    if u'contrib' in f and u'inventory' not in body.lower():
                        continue
                    # Probe for undecodable file names before using them.
                    try:
                        f.endswith(x)
                    except UnicodeDecodeError:
                        continue
                    fname = os.path.basename(f).split(u'.')[0]
                    if f.endswith(x):
                        if fname.lower() in body.lower():
                            matches.append(f)
                            break
                    if f.endswith(x + u'.py'):
                        if fname.lower() in body.lower():
                            matches.append(f)
                            break
                    if f.endswith(x + u'.ps1'):
                        if fname.lower() in body.lower():
                            matches.append(f)
                            break
                    if os.path.dirname(f).endswith(x):
                        if fname.lower() in body.lower():
                            matches.append(f)
                            break
    logging.info(u'%s --> %s' % (craws, sorted(set(matches))))
    self.match_cache[craws.lower()] = matches
    return matches
# NOTE(review): fragment of a page-scraping routine.  soup, url, query,
# saveInFile and links_detected come from the enclosing scope.  The
# "except: continue" implies a try: whose exact placement lies outside this
# excerpt; it is reconstructed here around the per-link body -- confirm
# against the original.
for link in soup.findAll('a'):
    try:
        href = link['href']
        if not href in links_detected:
            if href.startswith('http'):
                # Filter
                if url.split('/')[2] in href:
                    # Same host as the page being scraped.
                    links_detected.append(href)
                # If requested data found in url
                elif query.lower() in href.lower():
                    print(Fore.GREEN + '--- Requested data found at link : ' + href)
                    links_detected.append(href)
                    if saveInFile:
                        with open(query + ".txt", "a") as file:
                            file.write(href + "\n")
                # If text in link and link location is similar
                elif fuzz.ratio(link.text, href) >= 60:
                    print(Fore.GREEN + '--- Text and link are similar : ' + href)
                    links_detected.append(href)
                    if saveInFile:
                        with open(query + ".txt", "a") as file:
                            file.write(href + "\n")
    except:
        # NOTE(review): bare except silently skips malformed <a> tags.
        continue
if links_detected == []:
    print(Fore.RED + '--- No data found')
#for s in links_detected: print(s)
def main():
    """KeyCrypt CLI entry point.

    Builds the argparse sub-command tree (add/delete/edit/find/login/see/
    backup/restore/settings/nuke), dispatches the parsed command, and maps
    the application's exceptions to user-facing colored messages.

    NOTE(review): two statements in the "add" branch were corrupted by an
    automated secret-redaction pass ("******"); they are reproduced
    verbatim below and flagged inline -- restore them from the project
    history before running this file.
    """
    parser = argparse.ArgumentParser(
        prog="KeyCrypt",
        description="Secure Password Manager With GPG Encryption",
        epilog="KeyCrypt Copyright (C) 2018 Akshay R. Kapadia")
    subparsers = parser.add_subparsers(dest="command")
    # Primary command
    add_subparser = subparsers.add_parser("add")
    delete_subparser = subparsers.add_parser("delete")
    edit_subparser = subparsers.add_parser("edit")
    find_subparser = subparsers.add_parser("find")
    login_subparser = subparsers.add_parser("login")
    see_subparser = subparsers.add_parser("see")
    backup_subparser = subparsers.add_parser("backup")
    restore_subparser = subparsers.add_parser("restore")
    settings_subparser = subparsers.add_parser("settings")
    nuke_subparser = subparsers.add_parser("nuke")
    # Add account parser
    add_subparser.add_argument("name", help="Name of the account", type=str)
    add_subparser.add_argument("-r", "--random-password",
                               help="Generates a random ASCII password of the specified length", type=int)
    # Delete account parser
    delete_subparser.add_argument("name", help="Name of the account", type=str)
    # Edit account parser
    edit_subparser.add_argument("name", help="Name of the account", type=str)
    edit_subparser.add_argument("-pv", "--password-visible",
                                help="Makes the password visible with the account data is shown", action="store_true")
    # Find account parser
    find_subparser.add_argument("name", help="Name of the account", type=str)
    find_subparser.add_argument("-pv", "--password-visible",
                                help="Makes the password visible with the account data is shown", action="store_true")
    # Autologin parser
    login_subparser.add_argument("name", help="Name of the account", type=str)
    # See category parser
    see_subparser.add_argument(
        "category", help="The category that you want to see", type=str)
    see_subparser.add_argument("-pv", "--password-visible",
                               help="Makes the password visible with the account data is shown", action="store_true")
    # Backup Parser
    backup_subparser.add_argument(
        "-d", "--delete", help="Deletes the original copy of the KeyCrypt data", action="store_true")
    backup_subparser.add_argument(
        "path", help="The path to the destination directory (Enter '?' to open the directory chooser", type=str)
    # Restore parser
    restore_subparser.add_argument(
        "-d", "--delete", help="Deletes the backed up copy of the KeyCrypt data", action="store_true")
    restore_subparser.add_argument(
        "-m", "--merge", help="Merges the accounts in the backup file with your current KeyCrypt", action="store_true")
    restore_subparser.add_argument(
        "path", help="The path to the directory where the backup is located (Enter '?' to open the directory chooser)", type=str)
    args = vars(parser.parse_args())
    keycrypt = KeyCrypt()
    try:
        if args["command"] == "nuke":
            # Destructive wipe: double confirmation, then shred the data file.
            confirmation = True if (str(input(colored(
                "Are You Sure You Want To Permanently Nuke The KeyCrypt (y/N): ", "red"))).lower() in ["y", "yes"]) else False
            if confirmation:
                confirmation_key = KeyCrypt.generate_password(
                    30, regenerate=False)
                typed_confirmation_key = str(
                    input(colored("Type ", "red") + colored(str(confirmation_key), "yellow") + colored(" To Nuke The KeyCrypt: ", "red")))
                if typed_confirmation_key == confirmation_key:
                    call(["shred", "-u", ".KeyCryptData.txt"])
                    print(colored("KeyCrypt Successfully Nuked", "green"))
                else:
                    print(colored("KeyCrypt Nuke Cancelled", "red"))
            else:
                print(colored("KeyCrypt Nuke Cancelled", "red"))
        else:
            # Every non-nuke command needs the GPG identity.
            keycrypt.gpg_name = str(input("Name Associated With GPG Key: ")
                                    ) if keycrypt.gpg_name is None else keycrypt.gpg_name
            if args["command"] == "backup":
                path = args["path"]
                if path == "?":
                    # "?" opens a Tk directory chooser instead of a typed path.
                    tk.Tk().withdraw()
                    path = askdirectory()
                if path == "":
                    raise tk.TclError
                else:
                    keycrypt.backup(path)
                    if args["delete"]:
                        call(["shred", "-u", ".KeyCryptData.txt"])
                    else:
                        keycrypt.save()
                    print(colored("KeyCrypt Successfully Backed Up", "green"))
            else:
                try:
                    if args["command"] is None:
                        # No sub-command: show every account.
                        banner()
                        KeyCrypt.update(keycrypt)
                        for account in keycrypt.accounts:
                            account.show_account(keycrypt.wifi_permission)
                    elif args["command"] == "restore":
                        path = args["path"]
                        if path == "?":
                            tk.Tk().withdraw()
                            path = askdirectory()
                        if path == "":
                            raise tk.TclError
                        else:
                            if args["merge"]:
                                # Merge: keep current accounts that are not
                                # duplicated in the backup.
                                old_accounts = keycrypt.accounts
                                keycrypt = KeyCrypt(path)
                                for account_x in old_accounts:
                                    duplicate = False
                                    for account_y in keycrypt.accounts:
                                        if account_x.equals(account_y):
                                            duplicate = True
                                    if not duplicate:
                                        keycrypt.add_account(account_x)
                            else:
                                keycrypt = KeyCrypt(path)
                            if args["delete"]:
                                call(["shred", "-u", path + "/KeyCryptDataBackup.txt.gpg"])
                            print(colored("KeyCrypt Successfully Restored", "green"))
                    elif args["command"] == "settings":
                        print(colored("Settings", "red"))
                        if keycrypt.wifi_permission:
                            print("Wifi Permission (Security Status & Autologin): " +
                                  colored(keycrypt.wifi_permission, "green"))
                        else:
                            print("Wifi Permission (Security Status & Autologin): " +
                                  colored(keycrypt.wifi_permission, "red"))
                        if keycrypt.passwords_visible:
                            print("Passwords Visible: " +
                                  colored(keycrypt.passwords_visible, "green"))
                        else:
                            print("Passwords Visible: " +
                                  colored(keycrypt.passwords_visible, "red"))
                        # Fuzzy-match the typed setting name to a known one.
                        setting = str(input("Setting: ")).lower().capitalize()
                        for defined_setting in ["Wifi Permission", "Passwords Visible"]:
                            if fuzz.partial_ratio(setting, defined_setting) >= 50:
                                setting = defined_setting
                        if setting not in ["GPG Name", "Wifi Permission", "Passwords Visible"]:
                            raise InvalidSettingError
                        if setting == "Wifi Permission":
                            keycrypt.wifi_permission = not keycrypt.wifi_permission
                            if keycrypt.wifi_permission:
                                print("Wifi Permission (Security Status & Autologin): " +
                                      colored(keycrypt.wifi_permission, "green"))
                            else:
                                print("Wifi Permission (Security Status & Autologin): " +
                                      colored(keycrypt.wifi_permission, "red"))
                        else:
                            keycrypt.passwords_visible = not keycrypt.passwords_visible
                            if keycrypt.passwords_visible:
                                print("Passwords Visible: " +
                                      colored(keycrypt.passwords_visible, "green"))
                            else:
                                print("Passwords Visible: " +
                                      colored(keycrypt.passwords_visible, "red"))
                    elif args["command"] == "add":
                        # NOTE(review): CORRUPTED LINE -- redaction destroyed
                        # the username/password/category prompts; reproduced
                        # verbatim, do not trust as runnable code.
                        username = str(input("Username: "******"random_password"]) if args["random_password"] is not None else getpass.getpass("Password: "******"Category (Email, Web, Social, Banking, Computer, Other): ")).lower().capitalize()
                        # Fuzzy-match the typed category to a known one.
                        for defined_category in ["Email", "Web", "Social", "Banking", "Computer", "Other"]:
                            if fuzz.ratio(category, defined_category) >= 70:
                                category = defined_category
                        if category not in ["Email", "Web", "Social", "Banking", "Computer", "Other"]:
                            raise InvalidCategoryError
                        url = str(
                            input("Url (Use Login Page For Autologin)(Start With 'https://'): "))
                        account = Account(args["name"], username, password, url, category, keycrypt)
                        keycrypt.add_account(account)
                        account.show_account(False)
                        print(
                            colored(args["name"] + " Account Successfully Created", "green"))
                        autologin = False if (
                            str(input("Configure Autologin (Y/n): ")) in ["n", "no"]) else True
                        if autologin:
                            if KeyCrypt.wifi_enabled(keycrypt.wifi_permission):
                                account.configure_autologin()
                            else:
                                account.autologin = False
                                raise NoInternetError
                            # Autologin only counts as configured when both
                            # form-field ids were captured.
                            if (account.username_id is None or account.password_id is None):
                                account.autologin = False
                            else:
                                account.autologin = True
                    elif args["command"] == "see":
                        if keycrypt.passwords_visible:
                            args["password_visible"] = True
                        args["category"] = (
                            args["category"].lower()).capitalize()
                        for defined_category in ["Email", "Web", "Social", "Banking", "Computer", "Other", "All"]:
                            if fuzz.ratio(args["category"], defined_category) >= 70:
                                args["category"] = defined_category
                        if args["category"] not in ["Email", "Web", "Social", "Banking", "Computer", "Other", "All"]:
                            raise InvalidCategoryError
                        if args["category"] == "All":
                            KeyCrypt.update(keycrypt)
                            for account in keycrypt.accounts:
                                account.show_account(keycrypt.wifi_permission, args["password_visible"])
                        else:
                            for account in keycrypt.accounts:
                                if account.category == args["category"]:
                                    account.update_security_status(keycrypt)
                                    account.show_account(keycrypt.wifi_permission, args["password_visible"])
                    else:
                        # Remaining commands all operate on one named account.
                        account = keycrypt.find_account(args["name"])
                        if args["command"] == "delete":
                            account.show_account(False, False)
                            confirmation = True if (str(input(colored("Are You Sure You Want To Permanently Delete Your " + account.name + " Account (y/N): ", "red"))).lower() in ["y", "yes"]) else False
                            if confirmation:
                                confirmation_key = KeyCrypt.generate_password(
                                    15, False)
                                typed_confirmation_key = str(
                                    input(colored("Type ", "red") + colored(str(confirmation_key), "yellow") + colored(" To Delete Your " + account.name + " Account: ", "red")))
                                if typed_confirmation_key == confirmation_key:
                                    keycrypt.delete_account(account)
                                    print(colored(account.name + " Account Deleted", "green"))
                                else:
                                    print(
                                        colored("Account Deletion Cancelled", "red"))
                            else:
                                print(colored("Account Deletion Cancelled", "red"))
                        elif args["command"] == "edit":
                            if keycrypt.passwords_visible:
                                args["password_visible"] = True
                            account.update_security_status(keycrypt)
                            account.show_account(
                                keycrypt.wifi_permission, args["password_visible"])
                            attribute = str(input("Attribute: ")
                                            ).lower().capitalize()
                            for defined_attribute in ["Name", "Username", "Password", "Url", "Category", "Autologin"]:
                                if fuzz.ratio(attribute, defined_attribute) >= 70:
                                    attribute = defined_attribute
                            if attribute not in ["Name", "Username", "Password", "Url", "Category", "Autologin"]:
                                raise InvalidAttributeError
                            account.edit_account(attribute, keycrypt)
                            account.update_security_status(keycrypt)
                            account.show_account(
                                keycrypt.wifi_permission, args["password_visible"])
                            print(
                                colored(attribute + " Successfully Edited", "green"))
                        elif args["command"] == "login":
                            if KeyCrypt.wifi_enabled(keycrypt.wifi_permission):
                                if (account.autologin and account.username_id is not None and account.password_id is not None):
                                    print(colored("Logging Into Your " + account.name + " Account...", "red"))
                                    account.login()
                                    print(
                                        colored("Successfully Entered Login Information", "green"))
                                else:
                                    raise AccountNotConfiguredError(
                                        "Account Is Not Configured For Autologin", account)
                            else:
                                raise NoInternetError
                        elif args["command"] == "find":
                            if keycrypt.passwords_visible:
                                args["password_visible"] = True
                            for account in keycrypt.find_account(args["name"], True):
                                account.update_security_status(keycrypt)
                                account.show_account(keycrypt.wifi_permission, args["password_visible"])
                except InvalidCategoryError:
                    print(colored("Invalid Category", "red"))
                    print(colored(
                        "Categories: Web, Social, Computer, Banking, Email, Other (, All)", "red"))
                except InvalidAttributeError:
                    print(colored("Invalid Account Attribute", "red"))
                    print(
                        colored("Attributes: Name, Username, Password, Url, Category", "red"))
                except InvalidSettingError:
                    print(colored("Invalid Setting", "red"))
                    print(
                        colored("Settings: GPG Name, Wifi Permission, Passwords Visible", "red"))
                except tk.TclError:
                    print(colored("Invalid Directory", "red"))
                except NoInternetError:
                    print(colored("No Internet Connection", "red"))
                except WebDriverException:
                    # NOTE(review): tests str(WebDriverException) -- the
                    # class, not the caught instance; looks like a bug.
                    if "gecko" in str(WebDriverException):
                        print(colored("'geckodriver' Not Installed", "red"))
                    else:
                        print(colored("Incorrect Account Information", "red"))
                finally:
                    # Selenium's log may contain secrets: shred, then persist.
                    if isfile("geckodriver.log"):
                        call(["shred", "-u", "geckodriver.log"])
                    keycrypt.save()
    except FileNotFoundError:
        print(colored("File Not Found", "red"))
    except ValueError:
        print(colored("Invalid Input, Try Again", "red"))
# NOTE(review): fragment of a Pokédex lookup script.  conn, choice, crsr and
# the body of the final for-loop live outside this excerpt (the excerpt is
# truncated mid-loop).  "crsr" vs "cur"/"c" naming should be checked -- it
# may be an undefined name here.
conn.row_factory = lambda cursor, row: row[0]
c = conn.cursor()
conn.row_factory = sqlite3.Row
cur = conn.cursor()
cur.execute("SELECT * FROM pokemons;")
schema = cur.fetchone()
names = c.execute("SELECT Nom FROM pokemons;").fetchall()
request = ""
if choice == "A":
    # Lookup by (accent-folded, title-cased) name, with a fuzzy fallback.
    npt = input("\nEntrer le nom du pokémon\n> ")
    npt = unidecode(npt.title().strip())
    if npt not in names:
        for n in names:
            if fuzz.ratio(npt, n) > 80:
                # SECURITY NOTE: user-influenced value interpolated straight
                # into SQL via f-string -- use a parameterized query instead.
                request = f"SELECT * FROM pokemons WHERE Nom = '{n}';"
                break
    else:
        request = f"SELECT * FROM pokemons WHERE Nom = '{npt}';"
else:
    # Lookup by zero-padded three-digit ID.
    npt = input("\nEntrer l'ID du pokémon\n> ")
    while len(npt) != 3:
        npt = "0" + npt
    request = f"SELECT * FROM pokemons WHERE ID = '{npt}';"
print("\n")
try:
    data = crsr.execute(request).fetchall()
    if data:
        for field, d in zip(schema.keys(), data[0]):
from nltk import ngrams
from fuzzywuzzy import fuzz

##### Creating required variables #####
# Parallel lists: each generated n-gram string and its similarity score.
stringlist = []
scorelist = []

##### Setting up review text and string value #####
text = "DON'T BUY THIS LAPTOP. THEY WILL SEND YOU A DEFECTIVE PIECE. I'M REGRETTING BIG TIME. I purchased this on 1st July 2020 and\
within 15 days the keyboard just stopped working Engineer suspected hardware issue. I demanded replacement as for a fact they sent a defective\
piece but amazon and lenovo both denied. Lenovo should be banned from our country. I regret for not saving enough money and buying another\
brand. I regret I got carried away with the new launch. This laptop doesn't even deserve one star. Amazon has also lost my trust when\
it comes to buying electronic items online. And lenovo can't even comment on their cheap quality product. DO NOT BUY.. "
str1 = 'keyboard stopped working'

##### Breaking text in required ngrams #####
n = 4
phrase = ngrams(text.lower().split(), n)

##### Joining n-gram tuples as strings, and then comparing with the string2 #####
for grams in phrase:
    str2 = ' '.join(grams)
    x = fuzz.ratio(str2, str1.lower())
    stringlist.append(str2)
    scorelist.append(x)
def ratio(a, b):
    """Case-insensitive fuzzy score between *a* and *b*.

    Multi-word *a* uses partial_ratio (substring-tolerant); single words
    use the plain ratio.
    """
    a_low = a.lower()
    b_low = b.lower()
    if contains_multiple_words(a):
        return fuzz.partial_ratio(a_low, b_low)
    return fuzz.ratio(a_low, b_low)
# NOTE(review): fragment of a generated-sentence novelty check.  args,
# sample_sentences, full_sentence_list and closest_sentences come from the
# enclosing script; the excerpt ends inside an unterminated debug string.
# The placement of the closest_sentences append (inside the improvement
# branch) is reconstructed -- confirm against the original.
similarities = []
with open(args.outputname, 'w') as f:
    i = 0
    # for sentence in tqdm(sample_sentences):
    for sentence in sample_sentences:
        print(i, "/", len(sample_sentences))
        closest_sentences[i] = []
        max_so_far = -1.0
        closest_sentence = ""
        for close_sentence in full_sentence_list:
            # fuzz.ratio is 0..100; rescale to 0..1.
            r = float(fuzz.ratio(sentence, close_sentence)) / 100.0
            # SUPER SLOW >> r = (difflib.SequenceMatcher(None, sentence, close_sentence).ratio())
            if r >= max_so_far:
                max_so_far = r
                closest_sentence = close_sentence
                closest_sentences[i].append([r, closest_sentence])
        similarities.append(max_so_far)
        ###
        """ if max_so_far > 0.8:
            print(i)
            print("Generated sentence ",i," \"", sentence, "\" has closest:")
            print(max_so_far,":",closest_sentence)
            print("-----------------")
nickname = Jackaroo Jack -- a, r, o, o <- edit distance = 4 J,a,c,k J,a,c,k,a,r,o,o <- total characters = 12 <- 8 of 12 match <- fuzzy string matching = 66.67% '''
# NOTE(review): the line above is the TAIL of a triple-quoted explanatory
# string opened before this excerpt (its internal line breaks were lost in
# collapsing); no comments may be inserted inside it without changing the
# string's content.
print('The Calculation is: {}'.format(calculation))
print('Edit Distance is:')
print(edit_distance(name, nickname))
print('\n')
print('Fuzzy Matching Percentage is: \n')
print('{}'.format(fuzz.ratio(name, nickname)), '%')
print('\n')
# NOTE(review): the literal below is left unterminated by the excerpt --
# the closing quotes lie beyond this view.
sents = '''It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity, it was the season of Light, it was the season of Darkness, it was the spring of hope, it was the winter of despair, we had everything before us, we had nothing before us, we were all going direct to Heaven,
firstLineList = [] ##file location where you want to place the concatenated txt files with open("uf" + "/" + book + ".txt", 'w') as outfile: for txtfile in textfiles: print(txtfile) fullName = path + "/" + book + "/" + txtfile headerStatus = False with codecs.open(fullName, "r", encoding='utf-8', errors='ignore') as infile: lineNum = 0 for line in infile: if (lineNum == 0): for firstLine in firstLineList: if (fuzz.ratio(firstLine, line) > 60 and len(line) > 5): headerStatus = True print(line) break firstLineList.append(line) try: if (lineNum == 0): if (headerStatus == False): outfile.write(line) else: outfile.write(line) except UnicodeEncodeError: if (lineNum == 0): if (headerStatus == False):
def fitness(agents):
    """Assign each agent its fitness: fuzz.ratio of its string to the target in_str."""
    for member in agents:
        member.fitness = fuzz.ratio(member.string, in_str)
    return agents
# NOTE(review): fragment of a per-row record-linkage scorer.  `row` comes
# from an enclosing DataFrame iteration; the dict literal is truncated by
# the excerpt ('city2' is the last visible key).  Each *Score column is a
# fuzzywuzzy similarity rescaled to 0..1.
matchScore = matchScore.append(pd.DataFrame({
    'partyid1':row['PARTY_ID_1'],
    'partyid2':row['PARTY_ID_2'],
    #'full_name1':row['FULL_NAME_1'],
    #'full_name2':row['FULL_NAME_2'],
    #'nameScore':fuzz.token_set_ratio(str(row['FULL_NAME_1']).upper(),str(row['FULL_NAME_2']).upper())/100 ,
    'first_name1':row['FIRSTNAME_1'],
    'first_name2':row['FIRSTNAME_2'],
    'firstnameScore':fuzz.token_set_ratio(str(row['FIRSTNAME_1']).upper(),str(row['FIRSTNAME_2']).upper())/100 ,
    #'nameScore':sorted_levenshtein_rate(str(row['FULL_NAME_1']).upper(),str(row['FULL_NAME_2']).upper()) ,
    'last_name1':row['LASTNAME_1'],
    'last_name2':row['LASTNAME_2'],
    'lastnameScore':fuzz.token_set_ratio(str(row['LASTNAME_1']).upper(),str(row['LASTNAME_2']).upper())/100 ,
    'mobile1':row['MOBILE_1'],
    'mobile2':row['MOBILE_2'],
    'mobileScore': fuzz.ratio(str(row['MOBILE_1']),str(row['MOBILE_2']))/100,
    'private1':row['PRIVATE_1'],
    'private2':row['PRIVATE_2'],
    'privateScore': fuzz.ratio(str(row['PRIVATE_1']),str(row['PRIVATE_2']))/100,
    'work1':row['WORK_1'],
    'work2':row['WORK_2'],
    'workScore': fuzz.ratio(str(row['WORK_1']),str(row['WORK_2']))/100,
    'email1':row['ELECTRONIC_ADDRESS_1'],
    'email2':row['ELECTRONIC_ADDRESS_2'],
    'emailScore': fuzz.partial_ratio(str(row['ELECTRONIC_ADDRESS_1']).upper(),str(row['ELECTRONIC_ADDRESS_2']).upper())/100,
    'addressLine1':row['ST_ADDRESS_LINE_1'],
    'addressLine2':row['ST_ADDRESS_LINE_2'],
    'addressLineScore': fuzz.token_sort_ratio(str(row['ST_ADDRESS_LINE_1']).upper(),str(row['ST_ADDRESS_LINE_2']).upper())/100,
    'city1':row['ST_CITY_1'],
    'city2':row['ST_CITY_2'],
def process_venue(i, l, db, curId):
    """Insert a Ticketmaster venue record into city.venues unless it already exists.

    Existence is checked first by exact normalized address, then by fuzzy
    name match (fuzz.ratio > 80, or ratio > 60 combined with
    partial_ratio > 90) against all stored venue names.

    :param i: position in the caller's venue list (used for progress logging)
    :param l: raw venue dict from the Ticketmaster API (keys like "id",
              "name", "address", "location", "postalCode")
    :param db: database handle exposing .query()/.insert() (pg-style API)
    :param curId: highest venue id assigned so far
    :return: (venue dict, number of rows inserted — 0 or 1)
    NOTE(review): indentation reconstructed from a whitespace-mangled source;
    verify the branch nesting against the original before relying on it.
    """
    inserted_venue = 0
    # Get Venue Information
    venue = {}
    venue["tm_venue_id"] = l["id"]
    venue["venue_name"] = l["name"].replace("\'", "")
    if i % 50 == 0:
        logger.info('Processing venue #%s', i + 1)
    logger.info('Venue: %s, id: %s', venue["venue_name"],
                venue["tm_venue_id"])
    exist = []
    if "address" in l.keys():
        if bool(l["address"]):
            # the address dict is assumed to hold a single entry; popitem
            # pulls it out — TODO confirm against the API payload
            (dummy, venue["venue_address"]) = l['address'].popitem()
            venue["venue_address"] = format_address(venue["venue_address"])
        else:
            venue["venue_address"] = None
        if venue["venue_address"] is not None:
            # first dedup pass: exact match on the normalized address
            exist = db.query(
                "SELECT * FROM city.venues where venue_add_comp = $1",
                venue["venue_address"]).getresult()
    else:
        exist = []
        venue["venue_address"] = None
    venue["venue_add_comp"] = venue["venue_address"]
    if exist == [] and venue["venue_name"] is not None:
        # second dedup pass: fuzzy name match against every stored venue
        names = db.query("SELECT venue_name FROM city.venues").getresult()
        for name in names:
            n = name[0].replace("\'", "")
            # `and` binds tighter than `or`: match if ratio > 80, OR if
            # ratio > 60 while partial_ratio > 90
            if (fuzz.ratio(n, venue["venue_name"]) > 80
                    or fuzz.ratio(n, venue["venue_name"]) > 60
                    and fuzz.partial_ratio(n, venue["venue_name"]) > 90):
                exist = db.query(
                    "SELECT * FROM city.venues where venue_name = $1",
                    n).getresult()
    if exist == []:
        if (venue['venue_name'].find('TBA') > 0
                or venue['venue_name'].find('Vary By') > 0):
            # placeholder venues ("TBA", "Vary By ...") share the fixed id 2
            venue["id"] = 2
        else:
            logger.info('INSERT VENUE: %s', venue['venue_name'])
            logger.debug('Address: %s', venue["venue_address"])
            curId = curId + 1
            venue["id"] = curId
            if "location" in l.keys() and "postalCode" in l.keys():
                logger.debug('Postal Code: %s', l["postalCode"])
                lat = l["location"].get("latitude", 0)
                lon = l["location"].get("longitude", 0)
                logger.debug('Coords: (%s, %s)', lat, lon)
                if venue["venue_address"] is not None and lat != 0 and lon != 0:
                    # Toronto-specific address assembly — hard-coded city
                    add = venue["venue_address"] + ", Toronto, ON "
                    add += l["postalCode"] + ", Canada"
                elif int(lat) != 0 and int(lon) != 0:
                    coord = str(lat) + ',' + str(lon)
                    try:
                        (venue["venue_add_comp"], add) = rev_geocode(coord)
                    except AddressParserException as ape:
                        # NOTE(review): if rev_geocode raises, `add` stays
                        # unbound and the assignment below will NameError
                        logger.error(ape)
                elif venue["venue_address"] is not None:
                    (add, lat, lon) = geocode(venue["venue_address"])
                else:
                    add = None
            else:
                (add, lat, lon) = geocode(venue["venue_address"])
            venue["venue_address"] = add
            venue["lat"] = lat
            venue["lon"] = lon
            venue["capacity"] = None
            db.insert('city.venues', venue)
            inserted_venue += 1
    else:
        # reuse the first integer column of the matched row as the venue id
        # (iterating a row tuple and binding into venue["id"] directly)
        for venue["id"] in exist[0]:
            if type(venue["id"]) == int:
                break
    return venue, inserted_venue
async def _(bot: Bot, event: Event):
    """Bot handler: look up warframe.market sell orders for a named item.

    The message is "<item name>[,<min mod rank>]". The item is fuzzy-matched
    against the WF_Sale catalogue, the cheapest in-game sell orders are
    fetched and sorted, and the result is sent back as a forward message
    (one summary card plus one detailed card per order).
    NOTE(review): indentation reconstructed from a whitespace-mangled source.
    """
    try:
        item = event.get_message().__str__().strip()
        if item == "":
            await wm.send("参数不能为空!")
            return
        # split the item name from the requested minimum mod rank
        # (normalise fullwidth comma to ASCII first)
        item = item.replace("，", ",")
        mod_rank = 0
        if "," in item:
            mod_rank = item.split(",")[1]
            if not mod_rank.isdigit():
                # NOTE(review): no `return` here — a non-numeric rank still
                # falls through to int() below and raises, landing in the
                # generic except. Confirm whether an early return was intended.
                await wm.send("参数错误")
            mod_rank = int(mod_rank)
            item = item.split(",")[0]
        # fuzzy similarity matching against the sale catalogue; if the query
        # has no spaces, compare against the space-stripped catalogue name
        index = 0
        similar = 0
        for i in range(len(WF_Sale)):
            if " " in item:
                s = fuzz.ratio(item, WF_Sale[i]["zh"])
            else:
                s = fuzz.ratio(item, WF_Sale[i]["zh"].replace(" ", ""))
            if s > similar:
                similar = s
                index = i
        if similar < 50:
            await wm.send("未找到该物品,请缩小范围!")
            return
        await wm.send("正在查询 [{0}] 价格,请稍等".format(WF_Sale[index]["zh"]))
        response = requests.get(
            url="https://api.warframe.market/v1/items/{0}/orders".format(
                WF_Sale[index]["code"]))
        data = json.loads(response.text)
        # filter to visible, in-game, sell orders meeting the rank floor,
        # then insertion-sort by platinum price, keeping at most 10
        rank = []
        for i in data["payload"]["orders"]:
            if not i["visible"] or i["user"]["status"] != "ingame" or i[
                    "order_type"] != "sell" or "mod_rank" in i.keys(
                    ) and i["mod_rank"] < mod_rank:
                continue
            if not len(rank):
                rank.append(i)
                continue
            # walk from the most expensive end towards the cheapest,
            # inserting this order at its price position
            for j in range(len(rank)):
                if i["platinum"] <= rank[len(rank) - 1 - j]["platinum"]:
                    if (len(rank) - 1 - j) == 0:
                        rank.insert(len(rank) - 1 - j, i)
                        break
                    else:
                        continue
                else:
                    rank.insert(len(rank) - j, i)
                    break
            if len(rank) > 10:
                del rank[len(rank) - 1]
        nodes = []
        node = {
            "type": "node",
            "data": {
                "uin": f"{bot.self_id}",
                "name": "ZANUKA"
            }
        }
        # header card: item name and data-source link
        node["data"][
            "content"] = "查价物品: {0} ({1})\n数据来源: https://warframe.market/items/{2}".format(
                WF_Sale[index]["zh"], WF_Sale[index]["en"],
                WF_Sale[index]["code"])
        nodes.append(copy.deepcopy(node))
        # summary card: all orders on one card; once any order carries a
        # mod_rank, the rank column is shown for the remainder
        is_mod = False
        content = "——————————————————————————————"
        for i in rank:
            if "mod_rank" in i.keys():
                is_mod = True
            if is_mod:
                content += "\n—单价: {0} —数量: {1} —等级: {2} —卖家: {3} —声誉: {4}".format(
                    int(i["platinum"]), i["quantity"], i["mod_rank"],
                    i["user"]["ingame_name"], i["user"]["reputation"])
            else:
                content += "\n—单价: {0} —数量: {1} —卖家: {2} —声誉: {3}".format(
                    int(i["platinum"]), i["quantity"],
                    i["user"]["ingame_name"], i["user"]["reputation"])
        content += "\n——————————————————————————————"
        node["data"]["content"] = content
        nodes.append(copy.deepcopy(node))
        # detailed card per order: price, seller, timestamps (UTC converted
        # to UTC+8 for display) and a ready-to-paste in-game whisper line
        for i in rank:
            content = "——————————————————————————————"
            if is_mod:
                content += "\n—单价: {0} —数量: {1} —等级: {2}".format(
                    int(i["platinum"]), i["quantity"], i["mod_rank"])
            else:
                content += "\n—单价: {0} —数量: {1}".format(
                    int(i["platinum"]), i["quantity"])
            content += "\n—卖家: {0} —声誉: {1} —地区: {2}".format(
                i["user"]["ingame_name"], i["user"]["reputation"],
                i["user"]["region"])
            utcTime = datetime.strptime(i["creation_date"],
                                        "%Y-%m-%dT%H:%M:%S.%f+00:00")
            localTime = (
                utcTime +
                timedelta(hours=8)).strftime("%Y 年 %m 月 %d 日 %H:%M:%S")
            content += "\n—创建时间: " + localTime
            utcTime = datetime.strptime(i["last_update"],
                                        "%Y-%m-%dT%H:%M:%S.%f+00:00")
            localTime = (
                utcTime +
                timedelta(hours=8)).strftime("%Y 年 %m 月 %d 日 %H:%M:%S")
            content += "\n—上次更新: " + localTime
            utcTime = datetime.strptime(i["user"]["last_seen"],
                                        "%Y-%m-%dT%H:%M:%S.%f+00:00")
            localTime = (
                utcTime +
                timedelta(hours=8)).strftime("%Y 年 %m 月 %d 日 %H:%M:%S")
            content += "\n—上次来看: " + localTime
            content += "\n——————————————————————————————"
            if is_mod:
                content += "\n/w {0} Hi! I want to buy: {1} (rank {2}) for {3} platinum. (warframe.market)".format(
                    i["user"]["ingame_name"], WF_Sale[index]["en"],
                    i["mod_rank"], i["platinum"])
            else:
                content += "\n/w {0} Hi! I want to buy: {1} for {2} platinum. (warframe.market)".format(
                    i["user"]["ingame_name"], WF_Sale[index]["en"],
                    i["platinum"])
            node["data"]["content"] = content
            nodes.append(copy.deepcopy(node))
        await bot.send_group_forward_msg(group_id=event.group_id,
                                         messages=nodes)
    except Exception as e:
        print(e)
        await wm.finish("获取失败,请重试!")
import time

# Typing-speed game: display a fixed phrase, time the user's attempt, and
# report fuzzy-match accuracy (fuzz.ratio) plus elapsed seconds. Repeats
# until the user answers 'n' at the retry prompt; pressing only Enter at
# the ready prompt also ends the game (silently, without 'Bye!').
phrase = 'She sells seashells by the seashore'
ans = True
while ans:
    print(
        '\nInstructions: You will be shown a phrase, and you have to type it as fast and accurate as possible!'
    )
    print('\nAre you ready? (Press any key to continue)')
    ans = input('>> ')
    if not ans:
        continue
    print(phrase)
    t_start = time.time()
    attempt = input('>> ')
    t_end = time.time()
    print('Your accuracy was ' + str(fuzz.ratio(phrase, attempt)) + '%!')
    print('Your time was ' + "{:.2f}".format(t_end - t_start) + ' seconds!')
    # retry prompt: loop until a valid y/n answer is given
    while True:
        print('\nDo you want to try again? (Y/N)')
        reply = input('>> ')
        if reply.lower() == 'n':
            ans = False
            print('Bye!')
            break
        elif reply.lower() == 'y':
            ans = True
            break
        else:
            print('Invalid answer. Please try again.')
def fuzzy_match(word1, word2):
    """Return the fuzz.ratio similarity of the two words scaled to [0, 1]."""
    similarity = fuzz.ratio(word1, word2)
    return similarity / 100
def handle(self, *args, **options):
    # Django management command (Python 2: print statements, raw_input,
    # u"" literals). Interactively reviews every pair of Record rows whose
    # names are fuzzy-similar (fuzz.ratio >= 75), excluding pairs that share
    # a person_id or differ in country/gender. For each pair the operator
    # can (a)dd or (r)emove a follow_ups relation, (s)kip, or (p)ause —
    # pausing prints the index to resume from via options['start'].
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source; in particular the per-record "Status" print is assumed to sit
    # at the outer-loop level.
    rx = Record.objects.all()
    all = rx.count()  # shadows the builtin `all`
    cnt = 0
    print "Iterating over " + str(
        all) + " database records, starting at " + str(options['start'])
    for i, r1 in enumerate(rx):
        # Obey start position argument
        if i < options['start']:
            continue
        for j, r2 in enumerate(rx):
            # visit each unordered pair exactly once
            if j <= i:
                continue
            ratio = fuzz.ratio(r1.name, r2.name)
            if ratio < 75:
                continue
            if r1.person_id == r2.person_id:
                continue
            if r1.country != r2.country:
                continue
            if r1.gender != r2.gender:
                continue
            # Print leftovers:
            print ""
            print u"Score: {0:3d} {1:30}{2}".format(
                ratio, r1.name, r2.name)
            print u"Person-ID: {1:30}{2}".format(
                ratio, r1.person_id, r2.person_id)
            print u"Follow-up: {0!r:<30}{1}".format(
                r1.follow_up_case, r2.follow_up_case)
            print u"Date intervention: {0:30}{1}".format(
                str(r1.date_intervention), str(r2.date_intervention))
            print u"Issue area: {0:30}{1}".format(
                r1.issue_area, r2.issue_area)
            print u"Activities: {0:30}{1}".format(
                r1.relevant_activities, r2.relevant_activities)
            if Record.objects.filter(pk=r1.pk,
                                     follow_ups__pk=r2.pk).exists():
                print u"Relation exists? ************** YES ****************"
            else:
                print u"Relation exists? .............. NO ................"
            # prompt until a valid single-letter choice is entered
            while True:
                data = str(
                    raw_input(
                        "(a)dd, (r)emove relation, (s)kip or (p)ause: "))
                if data.lower() not in ('a', 'r', 's', 'p'):
                    print("Not an appropriate choice.")
                else:
                    break
            if data == "a":
                r1.follow_ups.add(r2)
                r1.save()
            elif data == "r":
                r1.follow_ups.remove(r2)
                r1.save()
            elif data == "s":
                continue
            elif data == "p":
                # pause: tell the operator which index to restart from
                print "Restart with argument: " + str(i)
                self.stdout.write(self.style.SUCCESS('Paused at %i' % i))
                return
            cnt += 1
        print "Status: {:2.1f}".format((100.0 * i) / all)
    self.stdout.write(
        self.style.SUCCESS('Successfully edited all fuzzy relations'))
def match_check(x, checklist, fuzzy_thresh=70):
    """Return True if `x` fuzzy-matches any entry of `checklist`.

    An entry matches when fuzz.ratio(x, entry) is strictly greater than
    `fuzzy_thresh` (default 70). Short-circuits on the first match, like
    the original any(map(...)) form.
    """
    for candidate in checklist:
        if fuzz.ratio(x, candidate) > fuzzy_thresh:
            return True
    return False
def process_text(str_cmp, str_exact):
    """Return the fuzz.ratio score (0-100) between the exact string and the candidate."""
    score = fuzz.ratio(str_exact, str_cmp)
    return score
def matching1(tagged, shas, i, j, k, index, daf, amud, strings=15, ratio=False):
    # Recursively anchors the first `strings` words of `tagged` to a unique
    # line of shas[index] via fuzz.partial_ratio > 80. If no line matches,
    # retries with one word fewer; if several match, retries with the
    # stricter fuzz.ratio > 60 filter. A unique hit appends a link(talmud,
    # roash) to the module-level `links`; failures are written to the
    # module-level `log`/`longlog` files.
    # NOTE(review): Python 2 source (print statements, ur'' literal);
    # `bingo` is only bound inside the matching branch — a stale count with
    # no final hit would NameError. Indentation reconstructed.
    short = 0
    fuzzed = 0
    if len(tagged) >= 15:
        string = " ".join(tagged[0:strings])
    else:
        short += 1
        string = " ".join(tagged[0:len(tagged) - 1])
    #print string, daf, amud
    # strip bracket/markup characters and digits before matching
    string = re.sub(ur'[\[\]\*#@[0-9]', "", string)
    found = 0
    for counter, line in enumerate(shas[index], start=1):
        if fuzz.partial_ratio(string, line) > 80:
            bingo = counter
            if ratio is True:
                # second pass: require the stricter full-ratio match too
                if fuzz.ratio(string, line) > 60:
                    fuzzed += 1
                    found += 1
                    #print "ratio", string, daf, strings, fuzz.ratio(string, line)
            else:
                found += 1
    #if fuzzed > 0:
    #print "fuzzed", fuzzed
    if found < 1 and strings != 0:
        # nothing matched: retry with a shorter prefix
        strings -= 1
        matching1(tagged, shas, i, j, k, index, daf, amud, strings)
        return
    elif found > 1:
        if ratio is True:
            # ambiguous even under the strict filter: log and give up
            error = "found too much, " + str(found) + "," + "on," + str(
                daf) + amud + "," + " ".join(
                    tagged[0:15]).encode('utf-8') + "\n"
            log.write(error)
            longlog.write(error)
        else:
            # ambiguous: retry with the stricter ratio filter enabled
            matching1(tagged, shas, i, j, k, index, daf, amud, strings, True)
            return
    elif found == 1:
        roash = "Rosh on %s." % masechet + str(k + 2) + "." + str(
            i + 1) + "." + str(j + 1)
        talmud = "%s." % masechet + str(daf) + amud + "." + str(bingo)
        links.append(link(talmud, roash))
        print roash, talmud
        succes = "found" + ", " + string.encode('utf-8') + str(
            daf) + amud + "," + str(strings) + "\n"
        # print succes
        longlog.write(succes)
    elif strings == 0:
        error = "did not find on daf," + str(daf) + amud + "," + " ".join(
            tagged[0:15]).encode('utf-8') + "\n"
        log.write(error)
        longlog.write(error)
# Input/output locations for the transcript-vs-subtitle matching pass.
input_file_name = "t_sample_80.txt"
input_file_name2 = "smi_superset1.txt"
output_file_name = "sample_result_ver_2.txt"

# Load the transcript lines and the pickled subtitle superset. Each subtitle
# entry is indexed as entry[0] = text, entry[1]/entry[2] = timing fields.
with open(input_file_name) as t, open(input_file_name2, 'rb') as s:
    t_content = [y.strip() for y in t.readlines()]
    # NOTE: pickle.load executes arbitrary code from untrusted files — only
    # run this on data produced by this pipeline.
    s_content = pickle.load(s)

# Report every transcript/subtitle pair whose fuzz.ratio exceeds 80.
# `with` guarantees the report is flushed and closed (the original left
# f_out open for the life of the process).
with open(output_file_name, 'w') as f_out:
    f_out.write("ver_2.0 \n ===============\n")
    for idx, transcript_line in enumerate(t_content):
        for smi_entry in s_content:
            # compute the score once instead of twice per matching pair
            score = fuzz.ratio(transcript_line, smi_entry[0])
            if score > 80:
                f_out.write("%d\n" % idx)
                f_out.write("transcript : " + transcript_line + "\n")
                f_out.write("smi : " + smi_entry[0] + "\n")
                f_out.write("%d %d" % (smi_entry[1], smi_entry[2]))
                f_out.write("\n")
                f_out.write("score : " "%d" % score + "\n")
                f_out.write("===================\n")
# print idx
# print "transcript :" + t_content[i]
# print "smi :" + s_content[t_best][0]
# print s_content[t_best][1], "- ", s_content[t_best][2]
# print "score", + best_score
# print "------------"
def fuzzymatching(target, threshold):
    # Clusters near-duplicate values of `target` (a pandas Series) using
    # fuzz.ratio within a forward window of 10 sorted neighbours, merges
    # overlapping clusters, and maps every member to its cluster's most
    # frequent value. Returns a DataFrame with columns 'original_com'
    # (raw value) and 'clean_com' (canonical replacement).
    # NOTE(review): under Python 3 `filter(None, t)` returns a one-shot
    # iterator, so the later second pass over `total` would see nothing —
    # this function appears to assume Python 2. Confirm the target runtime.
    cell = []
    for t in target:
        if t not in cell:
            cell.append(t)
    com = sorted(cell)
    comcom = []
    similarity = []
    # for each unique value, collect the window-neighbours above threshold
    for i in range(len(com)):
        second = []
        third = []
        if i + 10 <= len(com):
            for j in range(i, i + 10):
                t = fuzz.ratio(com[i], com[j])
                if t > threshold:
                    second.append(com[j])
                    third.append(t)
        else:
            for j in range(i, len(com)):
                t = fuzz.ratio(com[i], com[j])
                if t > threshold:
                    second.append(com[j])
                    third.append(t)
        comcom.append(second)
        similarity.append(third)
    # union-merge overlapping neighbour sets into clusters
    t = []
    newrow = []
    lst = []
    col = 0
    for i in range(len(com)):
        found_flag = False
        for row, lst in enumerate(t):
            for col, sim_company in enumerate(lst):
                if com[i] == sim_company:
                    found_flag = True
                    newrow = list(set(t[row] + comcom[i]))
                    t[row] = newrow
        if found_flag == False:
            t.append(comcom[i])
    total = filter(None, t)  # drop empty clusters
    size = []
    for i in total:
        size.append(len(i))
    # occurrence counts per raw value (empty keys dropped)
    counts = target.value_counts()
    mail_count = counts.to_dict()
    mail_count = {k: v for k, v in mail_count.items() if k}
    tail_num = []
    for i in total:
        tt = []
        for j in i:
            t = mail_count.get(j)
            tt.append(t)
        tail_num.append(tt)
    my = {'maillist': total, 'size': tail_num, 'num': size}
    mymail = pd.DataFrame(my)  # built but unused beyond this point
    # canonical value per cluster = the member with the highest count,
    # repeated once per cluster member so it aligns with comcom below
    pop_com = []
    for cell in range(len(tail_num)):
        m = tail_num[cell]
        r = max(m)
        t = m.index(r)
        n = [total[cell][t]]
        x = size[cell]
        y = n * x
        for comcell in y:
            pop_com.append(comcell)
    comcom = []
    for cell_i in total:
        for cell_j in cell_i:
            comcom.append(cell_j)
    md = {'original_com': comcom, 'clean_com': pop_com}
    myDF = pd.DataFrame(md)
    return myDF
def fuzzy_match(s1, s2):
    """Return True when the two strings are near-identical (fuzz.ratio above 95)."""
    return fuzz.ratio(s1, s2) > 95
def get_matching_node_st(query, relation_dict):
    """
    :param query: search disease in ont
    :return: tupel, where first is the similarity score and second pos. is the dict entry

    Normalises the query (lower-case, strip '-', tokenize, lemmatize), then
    scans every ontology term plus its related and exact synonyms with
    fuzz.ratio, keeping the best-scoring candidate whose first character
    matches the query. Returns [score, term, matched_synonym_or_""].
    NOTE(review): indentation reconstructed from a whitespace-mangled source.
    """
    cell_query = query.lower()
    clean_string = []
    for word in cell_query.split(" "):
        # NOTE(review): `isalnum() or e != "-"` keeps every character EXCEPT
        # '-'; if "alphanumeric only" was intended, the operator should
        # probably be `and`. Verify before changing.
        e = ''.join(e for e in word if e.isalnum() or e != "-")
        if e != "":
            clean_string.append(e)
    cell_query = " ".join(clean_string)
    wnl = WordNetLemmatizer()
    tokens = [token.lower() for token in word_tokenize(cell_query)]
    lemmatized_words = [wnl.lemmatize(token) for token in tokens]
    cell_query = " ".join(lemmatized_words)
    best_match = [0, "", ""]  # [score, ontology term, matching synonym]
    tissue_in_ont = ""
    if cell_query in relation_dict:
        # exact hit: perfect score, no synonym involved
        return [100, cell_query, ""]
    else:
        for disease, value in relation_dict.items():
            # if the first char does not match continue
            # (queries shorter than 3 chars are never scored)
            try:
                if disease[0].lower() != cell_query[0].lower() or len(
                        cell_query) < 3:
                    partial = 0
                else:
                    partial = fuzz.ratio(disease, cell_query)
            except BaseException:
                partial = 0
            for syn in value["hasRelatedSynonym"]:
                try:
                    if syn[0].lower() != cell_query[0].lower() or len(
                            cell_query) < 3:
                        partial_related_syn = 0
                    else:
                        partial_related_syn = fuzz.ratio(cell_query, syn)
                except BaseException:
                    partial_related_syn = 0
                # print("FuzzyWuzzy Ratio: ", fuzz.ratio(tissue, text), tissue)
                # print("FuzzyWuzzy Ratio_PARTIAL: ", partial, tissue)
                # accept the term itself only if it is not more than twice
                # the query's length
                if best_match[0] < partial:
                    if disease.lower()[0] == cell_query[
                            0] and len(cell_query) * 2 > len(disease):
                        best_match = [partial, disease, ""]
                if best_match[0] < partial_related_syn:
                    if syn.lower()[0] == cell_query[0]:
                        best_match = [partial_related_syn, disease, syn]
            for syn in value["hasExactSynonym"]:
                try:
                    if syn[0].lower() != cell_query[0].lower() or len(
                            cell_query) < 3:
                        partial_related_syn = 0
                    else:
                        partial_related_syn = fuzz.ratio(cell_query, syn)
                except BaseException:
                    partial_related_syn = 0
                # print("FuzzyWuzzy Ratio: ", fuzz.ratio(tissue, text), tissue)
                # print("FuzzyWuzzy Ratio_PARTIAL: ", partial, tissue)
                if best_match[0] < partial:
                    if disease.lower()[0] == cell_query[
                            0] and len(cell_query) * 2 > len(disease):
                        best_match = [partial, disease, ""]
                if best_match[0] < partial_related_syn:
                    if syn.lower()[0] == cell_query[0]:
                        best_match = [partial_related_syn, disease, syn]
            # final per-term fallback without the length restriction
            # (placement reconstructed — presumably inside the term loop;
            # TODO confirm against the original)
            if best_match[0] < partial:
                if disease.lower()[0] == cell_query[0]:
                    best_match = [partial, disease, ""]
        return best_match
def process_call_2(path):
    # Parses an earnings-call transcript file into a list of Q&A dicts:
    # each dict maps 'question_<n>' to {'ask_name', 'question', 'ans_name',
    # 'answer'}. Speaker lines are classified as question ('q'), answer
    # ('a') or operator ('o') by fuzzy-matching against the executives /
    # analysts rosters parsed from the transcript header; several fallback
    # passes relax or tighten the roster lists when the q/a balance looks
    # wrong. Returns None when the transcript has no Q&A session.
    # NOTE(review): indentation reconstructed from a whitespace-mangled
    # source; verify the branch nesting before relying on it.
    a_orig = open(path, encoding="utf-8", errors='ignore').read()
    a_orig = a_orig.replace('–', '-')
    a_orig = re.split('\r|\n', a_orig)
    a = [aa.lower() for aa in a_orig]
    if 'no q&a session for this event' in ''.join(a):
        return None
    x = a.index('executives')  # locate the executives roster
    y = a.index('analysts')  # could a transcript lack this roster entirely?
    z = a.index('operator')
    time = ''.join(a[:x])  # shadows the stdlib `time` module name
    # answerers: executives plus a catch-all placeholder
    anss = a[x + 1:y] + ['unidentified company representative']
    anss_short = [re.sub(' - .+', '', anss_1) for anss_1 in anss]
    # askers: analysts plus catch-all placeholders
    askk = a[y + 1:z] + ['unidentified analyst - unidentified company'
                         ] + ['unidentified analyst']
    askk_short = [re.sub(' - .+', '', ask) for ask in askk]
    opt = ['operator']
    # print('ans', anss)
    # print('ask', askk)
    # locate where the Q&A section begins
    # NOTE(review): with `or`, the loop only stops when BOTH variants score
    # >= 80 on the same line — `and` was probably intended; a transcript
    # matching neither header would also run past the end of `a`. Confirm.
    i = 0
    while (fuzz.ratio(a[i], 'Question-and-Answer-Session'.lower()) <
           80) or (fuzz.ratio(a[i], 'Question-&-nswer-Session'.lower()) <
                   80):
        i += 1
    b = [x for x in a[i + 1:] if x != '']
    b_orig = [x for x in a_orig[i + 1:] if x != '']
    # pass 1: classify each non-empty line as q / a / o
    statu_list = []
    idx_list = []
    ques_count = 0
    for b_idx, bb in enumerate(b):
        #b_idx = 0
        ask_name_check = match_check(bb, askk)
        ans_name_check = match_check(bb, anss_short)
        opt_name_check = match_check(bb, opt)
        if ask_name_check == True:
            statu_list.append('q')
            idx_list.append(b_idx)
        elif ans_name_check == True:
            statu_list.append('a')
            idx_list.append(b_idx)
        elif opt_name_check == True:
            statu_list.append('o')
            idx_list.append(b_idx)
    # pass 2 fallback: too few answers — use full answerer names
    if statu_list.count('a') * 2 < statu_list.count('q'):
        statu_list = []
        idx_list = []
        ques_count = 0
        for b_idx, bb in enumerate(b):
            #b_idx = 0
            ask_name_check = match_check(bb, askk)
            ans_name_check = match_check(bb, anss)
            opt_name_check = match_check(bb, opt)
            if ask_name_check == True:
                statu_list.append('q')
                idx_list.append(b_idx)
            elif ans_name_check == True:
                statu_list.append('a')
                idx_list.append(b_idx)
            elif opt_name_check == True:
                statu_list.append('o')
                idx_list.append(b_idx)
    # pass 3 fallback: too few questions — use short names on both sides
    if statu_list.count('q') * 2 < statu_list.count('a'):
        statu_list = []
        idx_list = []
        ques_count = 0
        for b_idx, bb in enumerate(b):
            #b_idx = 0
            ask_name_check = match_check(bb, askk_short)
            ans_name_check = match_check(bb, anss_short)
            opt_name_check = match_check(bb, opt)
            if ask_name_check == True:
                statu_list.append('q')
                idx_list.append(b_idx)
            elif ans_name_check == True:
                statu_list.append('a')
                idx_list.append(b_idx)
            elif opt_name_check == True:
                statu_list.append('o')
                idx_list.append(b_idx)
    # pass 4 fallback: nothing classified — short askers, full answerers
    if ('q' not in statu_list) and ('a' not in statu_list):
        statu_list = []
        idx_list = []
        ques_count = 0
        for b_idx, bb in enumerate(b):
            #b_idx = 0
            ask_name_check = match_check(bb, askk_short)
            ans_name_check = match_check(bb, anss)
            opt_name_check = match_check(bb, opt)
            if ask_name_check == True:
                statu_list.append('q')
                idx_list.append(b_idx)
            elif ans_name_check == True:
                statu_list.append('a')
                idx_list.append(b_idx)
            elif opt_name_check == True:
                statu_list.append('o')
                idx_list.append(b_idx)
    # stitch consecutive q/a markers into question/answer spans
    res_list = []
    for idx in np.arange(len(statu_list)):
        # print(idx)
        if statu_list[idx] == 'q':
            #once get a question
            if (idx - 1 >= 0) and statu_list[idx - 1] == 'q':
                # continuation of the same question block — already absorbed
                pass
            else:
                ques_count += 1
                res_dict = dict()
                res_dict['question_' + str(ques_count)] = dict()
                res_dict['question_' +
                         str(ques_count)]['ask_name'] = b[idx_list[idx]]
                #q_asker = b[idx_list[idx]]
                que_start_index = idx_list[idx] + 1
                #que_end_index = idx_list[idx+1]
                idx_next_que = idx + 1
                #print('q_idxnext', idx_next_que)
                # NOTE(review): `&` evaluates both operands, so when
                # idx_next_que runs past the end this raises IndexError
                # (the answer branch below uses short-circuiting `and`).
                while ((idx_next_que <= len(idx_list) - 1)
                       & (statu_list[idx_next_que] == 'q')):
                    res_dict['question_' +
                             str(ques_count)]['ask_name'] += ', ' + b[
                                 idx_list[idx_next_que]]
                    idx_next_que += 1
                que_end_index = idx_list[idx_next_que]
                res_dict['question_' + str(ques_count)]['question'] = b_orig[
                    que_start_index:que_end_index]
                q_text = b[que_start_index:que_end_index]
                #res_dict['question_'+str(ques_count)] = {“”}
        if statu_list[idx] == 'a':
            assert ques_count >= 1
            if 'ans_name' in res_dict['question_' + str(ques_count)].keys():
                #res_dict['question_'+str(ques_count)] ['ans_name'] += b[idx_list[idx]]
                pass
            else:
                res_dict['question_' +
                         str(ques_count)]['ans_name'] = b[idx_list[idx]]
            ans_start_index = idx_list[idx] + 1
            #ans_end_index = idx_list[idx+1]
            idx_next_ans = idx + 1
            #print('a_idxnext', idx_next_ans)
            #print('test',statu_list[idx_next_ans])
            while ((idx_next_ans <= len(idx_list) - 1)
                   and (statu_list[idx_next_ans] == 'a')):
                res_dict['question_' +
                         str(ques_count)]['ans_name'] += ', ' + b[
                             idx_list[idx_next_ans]]
                idx_next_ans += 1
            if idx_next_ans > len(idx_list) - 1:
                ans_end_index = len(b) - 1
            else:
                ans_end_index = idx_list[idx_next_ans]
            # print('question index',ans_start_index, ' ', ans_end_index)
            # print('question num', ques_count)
            res_dict['question_' + str(ques_count)]['answer'] = b_orig[
                ans_start_index:ans_end_index]
            # NOTE(review): appended once per 'a' marker — consecutive
            # answer lines can append the same res_dict repeatedly; confirm
            # whether dedup happens downstream.
            res_list.append(res_dict)
    return res_list
def fuzzy_ratio_similarity(str1, str2):
    """Return the fuzz.ratio similarity score (0-100) for the two strings."""
    score = fuzz.ratio(str1, str2)
    return score