def _get_best_match(self, company_name, df):
    similar = difflib.get_close_matches(company_name, [name for name in df.company_name])
    if similar:
        # compute the top fuzzy match once instead of calling extract twice
        best_name, best_score = process.extract(company_name, similar)[0]
        if best_score > 80:
            for i, zoominfo_profile in df.iterrows():
                if zoominfo_profile['company_name'] == best_name:
                    return zoominfo_profile.to_dict()
    return "not found"
def findSongsInMyCollection(reference_file, matched_store, toCheck):
    '''
    reference_file : the good-file prepared by dump_mp3_songs_info()
    matched_store  : json file holding already-mapped results
    toCheck        : json containing a list of {'TIT2', 'TALB'} dicts

    returns (found, notfound), each containing ({'TIT2', 'TALB'}, path)
    tuples; path is a single result in found and a (possibly empty)
    list in notfound
    '''
    with open(matched_store, 'r') as fd:
        matched_store_info = json.load(fd)
    reference_info = load_good_file_info(reference_file)
    all_reference_titles = reference_info['TIT2'].keys()
    all_reference_albums = reference_info['TALB'].keys()
    found = []
    notfound = []
    count = 0
    for track in toCheck:
        count += 1
        title = track['TIT2']
        album = track['TALB']
        if album in matched_store_info:
            if title in matched_store_info[album]:
                print("{}. {}, {} is already matched".format(count, title, album))
                continue
        title_results = process.extract(title, all_reference_titles)
        albums_results = process.extract(album, all_reference_albums)
        title_results = [i[0] for i in title_results if i[1] > 80]
        albums_results = [i[0] for i in albums_results if i[1] > 80]
        # Count how often each path is hit by a matching title or album;
        # the path with the most hits is the best candidate.
        all_paths = collections.defaultdict(int)
        for i in title_results:
            for path in reference_info['TIT2'][i]:
                all_paths[path] += 1
        for i in albums_results:
            for path in reference_info['TALB'][i]:
                all_paths[path] += 1
        if not all_paths:
            notfound.append((track, []))
            continue
        max_occurrence = max(all_paths.values())
        max_keys = [k for k in all_paths if all_paths[k] == max_occurrence]
        if len(max_keys) == 1:
            print("{}. {}, {} matched to {}".format(count, title, album, max_keys[0]))
            found.append((track, max_keys[0]))
        else:
            print("{}. {}, {} didn't match unambiguously; candidates: {}".format(count, title, album, max_keys))
            notfound.append((track, max_keys))
    return found, notfound
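# A minimal, illustrative driver for findSongsInMyCollection; the file names
# and the sample track below are assumptions, not taken from the original project:
if __name__ == '__main__':
    tracks_to_check = [{'TIT2': 'Yellow Submarine', 'TALB': 'Revolver'}]
    found, notfound = findSongsInMyCollection(
        'good_file.json',      # produced by dump_mp3_songs_info()
        'matched_store.json',  # JSON of already-mapped results
        tracks_to_check,
    )
    print('found:', found)
    print('not found:', notfound)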
def extract_matches(site_description_only, site_name_only2):
    choices1 = site_description_only
    choices2 = site_name_only2
    # Change this string to change the search criteria
    extractstring = "Javorski"
    print(choices1)
    print(choices2)
    print()
    print()
    fuzz_items1 = process.extract(extractstring, choices1, limit=2)
    fuzz_items2 = process.extract(extractstring, choices2, limit=1)
    print(fuzz_items1[0])
    print(fuzz_items2[0])
def nameVars(dat, name=None, numMatch=25):
    '''
    Get variants on a name by Levenshtein distance.
    dat = name data set
    name = name string
    numMatch = number of variants to return
    '''
    name = name[0].upper() + name[1:].lower()
    names = dat.Name.unique()
    ndf = process.extract(name, names, limit=numMatch)
    ndf = pd.DataFrame(data=ndf, columns=['Name', 'Sim'])
    nsum = dat.groupby(['Name', 'Gender'])['Count'].sum()
    dat = dat.sort_values(['Name', 'Count'], ascending=[1, 0])  # was df.sort_values; df is undefined here
    dat = dat.groupby(['Name', 'Gender']).first()
    dat.drop('Id', axis=1, inplace=True)
    dat.columns = ['max_year', 'max_year_count']
    dat['total_count'] = nsum
    dat.reset_index(inplace=True)
    ndf = pd.merge(ndf, dat, on='Name', how='inner')
    return ndf
def get(self, input_statement, statement_list, current_conversation=None):
    """
    Takes a statement string and a list of statement strings.
    Returns the closest matching statement from the list.
    """
    # Check if the list is empty
    if not statement_list:
        raise EmptyDatasetException

    # Get the text of each statement
    text_of_all_statements = []
    for statement in statement_list:
        text_of_all_statements.append(statement.text)

    # Check if an exact match exists
    if input_statement.text in text_of_all_statements:
        return input_statement

    # Get the closest matching statement from the database
    closest_match = process.extract(
        input_statement.text,
        text_of_all_statements,
        limit=1
    )[0][0]

    return next((s for s in statement_list if s.text == closest_match), None)
def fuzzy(self, tag, threshold=80):
    """Get a tuple of existing tags that fuzzily match a given one.

    Parameters
    ----------
    tag : str or list
        Tag or tags to get fuzzy matches for.
    threshold : int
        Lowest match score to return. Setting to 0 will return every
        tag, while setting to 100 will return only exact matches.

    Returns
    -------
    matches : tuple
        Tuple of tags that match.
    """
    if isinstance(tag, string_types):
        tags = [tag]
    else:
        tags = tag

    matches = []
    for tag in tags:
        matches += [i[0] for i in process.extract(tag, self, limit=None)
                    if i[1] > threshold]
    return tuple(matches)
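# Hedged usage sketch: attach fuzzy() to a list-like tag container. TagSet and
# the tags below are illustrative; string_types (from six) and process must be
# imported as the snippet assumes.
class TagSet(list):
    fuzzy = fuzzy  # reuse the function defined above as a method

tags = TagSet(['python', 'pythonic', 'jython', 'ruby'])
print(tags.fuzzy('python'))                # e.g. ('python', 'pythonic')
print(tags.fuzzy('python', threshold=0))   # every tag, ordered by score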
def get(self, input_statement, statement_list=None):
    """
    Takes a statement string and a list of statement strings.
    Returns the closest matching statement from the list.
    """
    statement_list = self.get_available_statements(statement_list)

    if not statement_list:
        if self.has_storage_context:
            # Use a randomly picked statement
            return 0, self.context.storage.get_random()
        else:
            raise EmptyDatasetException

    # Get the text of each statement
    text_of_all_statements = []
    for statement in statement_list:
        text_of_all_statements.append(statement.text)

    # Check if an exact match exists
    if input_statement.text in text_of_all_statements:
        return 1, input_statement

    # Get the closest matching statement from the database
    closest_match, confidence = process.extract(
        input_statement.text,
        text_of_all_statements,
        limit=1
    )[0]

    return confidence, next(
        (s for s in statement_list if s.text == closest_match), None
    )
def fuzzy_dropbox(project):
    directory_listing = os.listdir(dropbox_path)
    dir_temp = process.extract(project, directory_listing, limit=1)
    dropbox_output_directory = dir_temp[0][0]  # (match, score) -> match
    print('This is the Dropbox directory name for that post share: ' + dropbox_output_directory)
    return dropbox_output_directory
def fuzzy_project(project):
    directory_listing = os.listdir(server_directory)
    dir_temp = process.extract(project, directory_listing, limit=1)
    server_directory_exact = dir_temp[0][0]  # (match, score) -> match
    print('This is the server directory name for that project: ' + server_directory_exact)
    return server_directory_exact
def speech(self, message):
    responses = {
        "Hello": ["Hi there!", "Hi!", "Welcome!", "Hello, {name}!"],
        "Hi there": ["Hello!", "Hello, {name}!", "Hi!", "Welcome!"],
        "Hi!": ["Hi there!", "Hello, {name}!", "Welcome!", "Hello!"],
        "Welcome": ["Hi there!", "Hi!", "Hello!", "Hello, {name}!"],
        "How are you?": ["I'm fine!", "Status: Working...", "I'm doing great."],
        "Good bye": ["Bye, {name}!"],
        "What time is it?": ["Adventure Time!", "{date} UTC"],
    }
    stickers = {
        "adventure_time": "BQADAgADeAcAAlOx9wOjY2jpAAHq9DUC",
    }

    leven = process.extract(message.get("text", ""), responses.keys(), limit=1)[0]
    if leven[1] < 75:
        self.send(text="I cannot understand you")
    else:
        response = choice(responses[leven[0]]).format(
            name=message['chat'].get("first_name", ""),
            date=time.ctime(int(message.get("date")))
        )
        if response == "Adventure Time!":
            self.send(sticker=stickers['adventure_time'])
        else:
            self.send(text=response)
def findEligibleStops(self, searchTerm):
    possibleStops = []
    # Number matching
    if len(searchTerm) <= 4 and searchTerm.isdigit():
        stopID = "0" * (4 - len(searchTerm)) + str(searchTerm)
        if stopID in self.application.stopIDs:
            shortAddress = self.application.stopIDs[stopID]["Description"]
            possibleStops.append({"value": shortAddress, "id": stopID})
            print("number exists")
        else:
            print("Number doesn't exist")
    else:
        # Fuzzy matching
        fuzzySearchResults = process.extract(searchTerm, self.application.humanAddressList, limit=7)
        for humanAddress in fuzzySearchResults:
            # Take the first element (the second is the match score)
            stopObject = self.application.humanAddressDictionary[humanAddress[0]]
            stopID = stopObject["Stop"]
            shortAddress = self.application.stopIDs[stopID]["Description"]
            possibleStops.append({"value": shortAddress, "id": stopID})
    if len(possibleStops) == 0:
        possibleStops.append({"value": "No stops found", "id": "0000"})
    return possibleStops
def main():
    # TODO: get file names as inputs here later
    file = xlrd.open_workbook("WSnotof.xls")
    sheet = file.sheets()[0]
    WSnotof = sheet.col_values(2, 1)

    file = xlrd.open_workbook("Catalog.xlsx")
    sheet = file.sheets()[0]
    WScatalog = sheet.col_values(2, 1)

    new_dict = OrderedDict()
    for ws in WSnotof:
        if ws not in new_dict:
            matchList = process.extract(ws, WScatalog, limit=3)
            new_dict[ws] = matchList

    file = Workbook()
    sheet = file.add_sheet('Map')
    row = 0
    for short_WS, matchList in new_dict.items():
        col = 1
        sheet.write(row, 0, short_WS)
        for match in matchList:
            sheet.write(row, col, match[0])
            col = col + 1
            sheet.write(row, col, str(match[1]))
            col = col + 1
        row = row + 1
    file.save('final_choices.xls')
    print("Finished!")
def get_filter_link(link_choice, goal=None, min_score=None, max_limit=4, type=0):
    """
    Get the relevant links from a list of links.
    """
    if min_score:
        min_score = int(min_score)
    else:
        min_score = 60
    scored_link_list = []
    scored_link_list_raw = process.extract(goal, link_choice, limit=max_limit)
    logger.info("Score details for goal {0} with statistics {1}. minimum score {2}".format(goal, scored_link_list_raw, min_score))
    try:
        if scored_link_list_raw:
            for i in list(scored_link_list_raw):
                link = i[0]
                if int(type) != 1:
                    score = i[1]
                    if int(score) >= min_score:
                        scored_link_list.append(link)
                        logger.info("PARTIAL MATCH : Final score is {0} of url {1} for goal {2}".format(score, link, goal))
                else:
                    score = fuzz.token_set_ratio(goal, link)
                    logger.info("EXACT MATCH : Final score is {0} of url {1} for goal {2}".format(score, link, goal))
                    if int(score) >= min_score:
                        scored_link_list.append(link)
    except Exception:
        logger.exception("Error occurred in get_filter_link() function")
    return scored_link_list
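# Illustrative call of get_filter_link (the URLs and goal are made up). With
# type != 1 the raw process.extract scores gate the links; with type == 1 each
# link is re-scored with fuzz.token_set_ratio before the cutoff is applied.
# A module-level logger must exist, e.g.:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

links = [
    'https://example.com/pricing',
    'https://example.com/contact-us',
    'https://example.com/about',
]
print(get_filter_link(links, goal='contact us', min_score=70))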
def landmark_check(self):
    tmp = self.item.strip()
    # Name standardization:
    tmp_list = re.sub('[' + string.punctuation + ']', '', tmp).split()
    std = StandardName(tmp_list, False).output
    # Don't match on 'the' if it's the first word
    try:
        tmp = ' '.join(std[1:]) if std[0].upper() in ('THE', 'TEH') else ' '.join(std)
    except Exception:
        tmp = tmp.upper()
    # Fuzzy matching:
    try:
        first_letter = tmp[0]
    except Exception:
        first_letter = ''
    landmark_dict = self.list_landmarks(first_letter)
    landmark_list = list(landmark_dict.keys())
    results = process.extract(tmp, landmark_list, limit=3)
    results = sorted(results, key=lambda r: r[1], reverse=True)
    try:
        # Discard the results on a tie between the top two candidates
        results = [] if results[0][1] == results[1][1] else results
        lname = results[0][0]
        landmark_addresses = landmark_dict[lname]
        # Currently only handle uniquely named landmarks
        # landmark_address = landmark_addresses[0] if results[0][1] > 89 and len(landmark_addresses) == 1 else ''
        landmark_address = landmark_addresses[0] if results[0][1] > 89 else ''
        self.is_landmark = True if landmark_address else False
        self.landmark_address = landmark_address
        self.landmark_name = lname
    except Exception:
        pass
def results():
    if request.method == 'POST':
        qry = request.json["query"]
        print(qry)
        dbresults = filestable.query.filter_by(ownerhostel=current_user.hostel).search(str(qry)).all()
        print('HELLO ' + str(dbresults))
        if len(dbresults) < 15:
            print("Inside")
            much = 15 - len(dbresults)
            dbresults += filestable.query.search(str(qry)).limit(much).all()
        print(len(dbresults))
        if len(dbresults) > 0:
            print('dbresults is : ' + str(type(dbresults)))
            print('Type of dbresults[0] is : ' + str(type(dbresults[0])))
            dbresultsname = []
            print('AND NOW ' + str(dbresults[0].ownerhostel))
            for i in range(len(dbresults)):
                print(str(type(dbresults[i])))
                dbresultsname.append(str(dbresults[i].name.replace("_", " ")))
            fuzzyResults = process.extract(str(qry), dbresultsname, limit=5)
            print('AND HI ' + str(fuzzyResults))
        else:
            fuzzyResults = ""
            print("Sorry No results")
        return jsonify(result=fuzzyResults)
    else:
        return redirect(url_for('search'))
def search(s_term='', s_users=False, s_groups=False, s_lists=False):
    # Merge all requested user, group and list objects into one list
    all_search_obj = []
    if s_users:
        all_users = object_manager.allUsers()
        all_search_obj.extend(all_users)
    if s_groups:
        all_groups = object_manager.allPublicGroups()
        all_search_obj.extend(all_groups)
    if s_lists:
        all_lists = object_manager.getAllExerciseLists()
        all_search_obj.extend(all_lists)

    # Make a dict of each object keyed by itself, valued by its searchString
    all_search = {obj: obj.searchString() for obj in all_search_obj}

    if s_term:
        # Fuzzy search; extracting from a dict yields (value, score, key) tuples
        results = process.extract(s_term, all_search, limit=10)
        # Search results have to have at least an 80% match
        filtered = [r[2] for r in reversed(sorted(results, key=lambda e: e[1])) if r[1] >= 80]
        return filtered
    else:
        return sorted(all_search_obj, key=lambda e: e.name())
def fuzzy_post_share(project, server_path):
    directory_listing = os.listdir(server_path)
    dir_temp = process.extract(project, directory_listing, limit=1)
    post_share_exact = dir_temp[0][0]  # (match, score) -> match
    print('This is the server post share directory name for that project: ' + post_share_exact)
    return post_share_exact
def fuzzy(project):
    direct_list = os.listdir(dropbox_path)
    dir_temp = process.extract(project, direct_list, limit=1)
    directory = dir_temp[0][0]  # (match, score) -> match
    print(directory)
    return directory
def matchHeard(heard, results, lookingFor='label'):
    located = None
    heard_minus_the = remove_the(heard)
    print('Trying to match: ' + heard)
    sys.stdout.flush()

    heard_list = set([x for x in heard.split() if x not in STOPWORDS])
    strip_punctuation = str.maketrans('', '', string.punctuation)  # Python 3 replacement for translate(None, ...)

    for result in results:
        # Strip out non-ascii symbols and lowercase it
        ascii_name = result[lookingFor].encode('ascii', 'replace').decode('ascii')
        result_name = ascii_name.lower().translate(strip_punctuation)

        # Direct comparison
        if heard == result_name:
            print('Simple match on direct comparison')
            located = result
            break

        # Remove 'the'
        if remove_the(result_name) == heard_minus_the:
            print('Simple match minus "the"')
            located = result
            break

        # Remove parentheses
        removed_paren = re.sub(r'\([^)]*\)', '', ascii_name).rstrip().lower().translate(strip_punctuation)
        if heard == removed_paren:
            print('Simple match minus parentheses')
            located = result
            break

    if not located:
        print('Simple match failed, trying fuzzy match...')
        sys.stdout.flush()
        fuzzy_result = process.extract(str(heard), [d[lookingFor] for d in results], limit=1, scorer=fuzz.QRatio)
        if fuzzy_result[0][1] > 75:
            print('Fuzzy match %s%%' % (fuzzy_result[0][1]))
            located = next(item for item in results if item[lookingFor] == fuzzy_result[0][0])
        else:
            heard = replaceDigits(heard)
            fuzzy_result = process.extract(str(heard), [d[lookingFor] for d in results], limit=1, scorer=fuzz.QRatio)
            if fuzzy_result[0][1] > 75:
                print('Fuzzy match %s%%' % (fuzzy_result[0][1]))
                located = next(item for item in results if item[lookingFor] == fuzzy_result[0][0])

    return located
def give(user, amount, reason):
    app = _application_init()
    fullname = lambda u: " ".join([u[x] for x in ('first_name', 'last_name')])
    found = process.extract(user, app.settings.user_cache, processor=fullname)
    top = found[0][0]
    email = top['email']
    fullname = "%s %s" % (top['first_name'], top['last_name'])
    click.echo("Giving %s %s, is this correct?" % (fullname, amount))
def busca(self, query):
    query = normaliza_string(query)
    amostra = self.cria_amostra()
    scorer = seleciona_scorer(query)
    resultado = process.extractBests(query, amostra, limit=10, scorer=scorer, score_cutoff=65)
    if scorer == fuzz.token_set_ratio:
        resultado = process.extract(query, lapida_extracao(resultado), limit=20, scorer=fuzz.partial_ratio)
    return lapida_extracao(resultado)
def closest_locstrings(self, query, threshold=90, max_results=5):
    # List everything matching the location
    with open(self.locstrings_filename) as csvfile:
        r = csv.reader(csvfile)
        # Each choice is a CSV row; match on its first column. Note that
        # fuzzywuzzy applies the processor to the query as well, so guard
        # against slicing the query string down to its first character.
        result = process.extract(
            query, list(r),
            processor=lambda x: x[0] if isinstance(x, (list, tuple)) else x,
            limit=max_results)
    return [(x[0][1], x[1]) for x in result if x[1] > threshold]
def match_functions_argument_name(self):
    """
    This function populates self.matched_functions_name with tuples of
    (stub_functions, match_ratio) ordered by match_ratio.

    :return: None
    """
    self.matched_functions_name = process.extract(
        self.task_comment,
        self.functions.keys(),
        limit=len(self.functions.keys())
    )
def work_on_entry(final, toLook, refList, refMainDict):
    if toLook:
        results = process.extract(toLook, refList)
        results = [i[0] for i in results if i[1] > 70]
        for i in results:
            for path in refMainDict[i]:
                final[path] += 1
    return None
def main():
    switch = 1
    while switch == 1:
        usercommand = input()
        choices = ["wash dishes", "making bed", "taking out trash", "vacuuming",
                   "cooking food", "doing laundry", "dusting", "lawn mowing"]
        results = process.extract(usercommand, choices, limit=8)
        print(results)
        match = results[0]
        print("\n")
        print("Your input matches this command:")
        print(match[0])
        print("\n" + "Exit program? (y / n)")
        userexit = input()  # was raw_input(); Python 3 uses input()
        if userexit == "y":
            switch = 0
    return
def get_closest_possible_matches(input_arg):
    VALID_MATCH_PERCENTAGE = 60
    MATCH_PERCENTAGE_INDEX = 1
    MATCH_FUNC_INDEX = 0
    available_matches = dispatch_func.keys()
    possible_matches = process.extract(input_arg, available_matches)
    indexed_matches_dict = {
        index + 1: entry[MATCH_FUNC_INDEX]
        for index, entry in enumerate(possible_matches)
        if entry[MATCH_PERCENTAGE_INDEX] > VALID_MATCH_PERCENTAGE
    }
    return indexed_matches_dict
def items(self, name):
    """
    Return the fuzzy matches for `name` across all stored item keys.
    """
    data = process.extract(name, self._items.keys())
    return data
def find_movie(self, query):
    """
    Find the movie in the library that best matches the given query.

    :param query: Query to search for in the library
    :return: The name of the movie whose title best matches the given query
    :rtype: str
    """
    results = process.extract(query, self.list_movies(), limit=1)
    return results[0][0]
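# Hedged sketch of find_movie in use, with a minimal stand-in library object
# (the class and titles are illustrative, not from the original):
class MovieLibrary:
    def list_movies(self):
        return ['The Matrix', 'The Matrix Reloaded', 'Inception']
    find_movie = find_movie  # reuse the function defined above as a method

print(MovieLibrary().find_movie('matrix'))  # best fuzzy match, e.g. 'The Matrix'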
def get_instructors(search_string):
    instructor_names = [instructor.name for instructor in Instructor.query.all()]
    instructors = [
        instructor
        for instructor in process.extract(search_string, instructor_names, limit=100)
        if instructor[1] > 60
    ]
    instructor_data = [
        get_less_instructor_json(Instructor.query.filter_by(name=instructor[0]).first())
        for instructor in instructors
    ]
    return json_response({"status": "success", "data": instructor_data}, 200)
def username_lookup(username):
    if not wf.cached_data_fresh('usernameList', max_age=UPDATE_INTERVAL) or wf.cached_data('hostIdList') is None:
        main.update_caches()
    match_found = localCache.get(str(username))
    if not match_found:  # was `match_found < 1`, which raises TypeError for None in Python 3
        results = process.extract(username, localCache.keys(), limit=3)
        fuzzy_match(results, username)
    else:
        exact_match(match_found, username)
# print(delimitedlocation[authors][location])
countrytext = delimitedlocation[authors][location]
if countrytext == 'NA':
    countrydata.append('NA')
    citystatedata.append('NA')
    break
if '(United States)' in countrytext:
    country = 'United States'
    citystate = countrytext.replace('(United States)', '')
    citystate = citystate.replace(',', '')
elif '(France)' in countrytext:
    country = 'France'
    citystate = countrytext.replace('(France)', '')
    citystate = citystate.replace(',', '')
else:
    country1, score, index = process.extract(
        delimitedlocation[authors][location], countrylist, limit=1)[0]
    if country1 in delimitedlocation[authors][location]:
        country = country1
    else:
        country = 'No Country'
    citystate = countrytext.replace(country1, '')
    countrycap = country1.upper()
    citystate = citystate.replace(countrycap, '')
    citystate = citystate.replace(',', '')
    citystate = citystate.replace('-', '')
    citystate = citystate.lower()
    citystate = citystate.title()
# data = process.extract(delimitedlocation[authors][location], combinedcitystatelist, limit=5)
# citystates = []
# for i in range(5):
#     citystates.append(data[i][0])
ROM 2
"""
fileName = "stringMatch"
df = pd.read_csv("./data/" + fileName + ".txt", sep='\t', header=0)
df["strlen"] = df["str"].str.len()
df = df[df["strlen"] > 1]
# If you don't have a count of the words then you can comment this line
dftopList = df[df["counts"] > 4]
dftopListWords = dftopList["str"].tolist()
words = df["str"].tolist()

i = 0
matchAnalysis = []
for word in dftopListWords:
    for matchWord, matchval in process.extract(word, words, limit=20):
        if word != matchWord:
            levnDist = jellyfish.levenshtein_distance(word, matchWord)
            # cf. fuzz.token_sort_ratio
            seqMatch = round(SequenceMatcher(None, word, matchWord).ratio() * 100)
            matchAnalysis.append([word, matchWord, matchval, levnDist, seqMatch, len(word)])
    # if i == 2:
    #     break
    # else:
    #     i += 1
else:
    dfMatch = pd.DataFrame.from_records(matchAnalysis,
import os
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "geonode.settings")

# start of fuzzy look-up
from geodb.models import AfgPplp, Glofasintegrated
from fuzzywuzzy import process
import datetime

# f = AfgPplp.objects.all().filter(dist_na_en='Qala-e-Naw').values('name_en','dist_na_en','prov_na_en')
f = AfgPplp.objects.all().values('name_en', 'dist_na_en', 'prov_na_en')

choices = []
for i in f:
    choices.append(i['name_en'].lstrip() + ';' + i['dist_na_en'] + ';' + i['prov_na_en'])

x = process.extract("BAGHBAN HA;Qala-I- Naw;Badghis", choices)

t = 1
for i in x:
    print(t, i)
    t = t + 1
# end of fuzzy look-up
def fetch_youtube_results(searchterm):
    # searchterm = request.args.get('searchterm')
    matches = [a for a, b in process.extract(searchterm, yt['title'].tolist())]
    results = yt.loc[yt['title'].isin(matches)]
    return results.reset_index(drop=True).T.to_dict()
def get_matches(query, choices, limit=1):
    results = process.extract(query, choices, limit=limit)
    return results
def _email_search(self, email, api_key=""): try: person = clearbit.Person.find(email=email, stream=True) except: person = None data = { "pattern": None, "name": None, "email": email, "domain": email.split("@")[-1], "crawl_source": "email_hunter" } if person: pattern = EmailGuessHelper()._find_email_pattern( person["name"]["fullName"], email) if pattern: data = { "pattern": pattern, "name": person["name"]["fullName"], "email": email, "domain": email.split("@")[-1], "crawl_source": "email_hunter" } elif not person or not pattern: person = FullContact()._person_from_email(email) print person try: person = person["contactInfo"]["fullName"] fullcontact_person = True except: fullcontact_person = False if fullcontact_person: person = person["contactInfo"]["fullName"] pattern = EmailGuessHelper()._find_email_pattern(person, email) data = { "pattern": pattern, "name": person, "email": email, "domain": email.split("@")[-1], "crawl_source": "email_hunter" } print pattern else: _email = email.replace(".", " ").replace("-", " ").replace("_", " ") _email = _email.replace("@", " ") g = Google().search("{0} site:linkedin.com/pub".format(_email)) g1 = Google().search("{0} site:linkedin.com/pub".format( _email.split(" "[0]))) g2 = Google().search( "{0} site:linkedin.com/pub".format(_email).split(" ")[-1]) g = pd.concat([g, g1, g2]) choices = [i.split(" |")[0] for i in g.link_text] person = process.extract(_email, choices, limit=1) try: person = person[0][0] except: ''' ''' pattern = EmailGuessHelper()._find_email_pattern(person, email) print "google search pattern", pattern if pattern: data = { "pattern": pattern, "name": person, "email": email, "domain": email.split("@")[-1], "crawl_source": "email_hunter" } else: data = { "pattern": None, "name": None, "email": email, "domain": email.split("@")[-1], "crawl_source": "email_hunter" } #data = pd.DataFrame([data]) conn = r.connect(host="localhost", port=28015, db="triggeriq") r.table('email_pattern_crawls').insert(data).run(conn) #CompanyEmailPatternCrawl()._persist(data, "emailhunter", api_key) # persist to rethinkdb print "person", person
def movie_bot_final(title):
    form = SearchForm(request.form)

    # IDENTIFY THE TITLE THAT WAS PASSED IN
    titleloc = movies.loc[movies['tmdbId'] == int(title)]
    movieTitle = titleloc['title'].iloc[0]

    # GET THE DESCRIPTION OF THE MOVIE THAT WAS PASSED IN
    tmdb_desc = requests.get(f'https://api.themoviedb.org/3/movie/{title}?api_key={api_key}')
    desc_data = tmdb_desc.json()
    description = desc_data.get("overview", '')  # avoid an unbound name when no overview exists

    # GET THE YOUTUBE TRAILER LINK FOR THE ID THAT WAS PASSED IN
    tmdb_trailer = requests.get(f'https://api.themoviedb.org/3/movie/{title}/videos?api_key={api_key}')
    trailer_response = tmdb_trailer.json()['results']
    if not trailer_response:
        trailer_url = 'None'
    else:
        trailer_path = trailer_response[0].get('key')
        trailer_url = f'https://www.youtube.com/watch?v={trailer_path}'

    # FORM SUBMISSION
    if request.method == 'POST':
        form_cont = form.autocomp.data
        str2Match = form_cont
        strOptions = movie_list
        Ratios = process.extract(str2Match, strOptions)
        highest = process.extractOne(str2Match, strOptions)
        fuzzyresult = highest[0]
        movie_index = movies.loc[movies['title'] == fuzzyresult]
        movieID = str(movie_index['tmdbId'].iloc[0])
        # IF THE STRING IS AN EXACT MATCH, THERE IS NO NEED TO GO TO THE SEARCH PAGE.
        # IF THE INPUT IS NOT LONGER THAN ONE CHARACTER, DO NOTHING.
        if form_cont == fuzzyresult:
            return redirect('../rec/' + movieID)
        elif len(form_cont) > 1:
            return redirect('../results/' + form_cont)

    titles = movies['title']
    indices = pd.Series(movies.index, index=movies['title'])
    idx = indices[movieTitle]

    # -----------------------------
    # ML BASED ON THE MOVIE GENRE
    # -----------------------------
    genre_sim_scores = list(enumerate(genre_cosine_sim[idx]))
    genre_sim_scores = sorted(genre_sim_scores, key=lambda x: x[1], reverse=True)
    genre_sim_scores = genre_sim_scores[1:21]
    genre_movie_indices = [i[0] for i in genre_sim_scores]
    # RETURNS THE MOST SIMILAR MOVIES BY GENRE
    genre_df = titles.iloc[genre_movie_indices].head(13).to_frame()

    # ----------------------------
    # ML BASED ON THE MOVIE CAST
    # ----------------------------
    cast_sim_scores = list(enumerate(cast_cosine_sim[idx]))
    cast_sim_scores = sorted(cast_sim_scores, key=lambda x: x[1], reverse=True)
    cast_sim_scores = cast_sim_scores[1:21]
    cast_movie_indices = [i[0] for i in cast_sim_scores]
    # RETURNS THE MOST SIMILAR MOVIES BY CAST
    cast_df = titles.iloc[cast_movie_indices].head(13).to_frame()

    # -----------------------------------
    # ML BASED ON THE MOVIE DESCRIPTION
    # -----------------------------------
    desc_sim_scores = list(enumerate(desc_cosine_sim[idx]))
    desc_sim_scores = sorted(desc_sim_scores, key=lambda x: x[1], reverse=True)
    desc_sim_scores = desc_sim_scores[1:21]
    desc_movie_indices = [i[0] for i in desc_sim_scores]
    # RETURNS THE MOST SIMILAR MOVIES BY DESCRIPTION
    desc_df = titles.iloc[desc_movie_indices].head(13).to_frame()

    # ------------------------------------------------------------------
    # REMOVING SEARCH TITLE FROM RESULTS AND RETURNING 12 RECS
    # ------------------------------------------------------------------
    genre_df = genre_df[genre_df.title != movieTitle].head(12)
    cast_df = cast_df[cast_df.title != movieTitle].head(12)
    desc_df = desc_df[desc_df.title != movieTitle].head(12)

    # ------------------------------------------------------------------
    # PROCESSING RESULTS AND CREATING ONE LARGE DATAFRAME
    # ------------------------------------------------------------------
    mv = pd.concat([genre_df, cast_df, desc_df]).reset_index(drop=True)
    cols = ['title']
    temp_df = mv.join(movies.set_index(cols), on=cols)

    # GETTING MOVIE INFORMATION
    moviename = []
    url1 = []
    movCastin = titleloc['cast'].iloc[0]
    movCastOut = movCastin.replace("'", "").strip("][").split(', ')
    topCast = movCastOut[:3]

    # PULLS THE IMAGE URL FROM THE MOVIES DF AND APPENDS IT TO THE URL PREFIX FOR THE
    # MOVIE POSTERS; PASSES THE MOVIE POSTER URL INTO THE RECS.HTML PAGE
    titleurl = str("https://image.tmdb.org/t/p/original/" + titleloc['poster_path'].iloc[0])
    backdropPath = str(desc_data['backdrop_path'])
    bgurl = "https://image.tmdb.org/t/p/original/" + backdropPath
    runtime = str(desc_data['runtime'])

    for film in temp_df.tmdbId:
        moviename.append(film)
    for poster in temp_df.poster_path:
        url1.append("http://image.tmdb.org/t/p/w185" + str(poster))

    return render_template('recs.html', moviename=moviename, url1=url1, topCast=topCast,
                           movieTitle=movieTitle, titleurl=titleurl, bgurl=bgurl, form=form,
                           description=description, runtime=runtime, trailer_url=trailer_url)
def golden_source_merge(df_list, key, threshold=80, limit=1):
    """
    Fuzzy-merge a list of dataframes on the given key columns, treating the
    first dataframe as the golden source and folding in unmatched rows.
    """
    matching_dict = {}
    df_1 = df_list.pop(0)
    df_1 = df_1[key]  # drop all other columns
    df_1.drop_duplicates(subset=key, inplace=True)  # drop duplicates
    for df_2 in df_list:
        df_2 = df_2[key]  # drop all other columns
        df_1['match_key'] = ''
        df_2['match_key'] = ''
        df_2.drop_duplicates(subset=key, inplace=True)  # drop duplicates
        # combine the list of column inputs into a single string for matching
        for value in key:
            df_1['match_key'] = df_1['match_key'].map(str) + ' ' + df_1[value].map(str)
        for value in key:
            df_2['match_key'] = df_2['match_key'].map(str) + ' ' + df_2[value].map(str)
        # remove periods for abbreviated names
        df_1['match_key'] = df_1['match_key'].map(lambda x: x.strip(".,!"))
        df_2['match_key'] = df_2['match_key'].map(lambda x: x.strip(".,!"))
        # apply lower case and remove common words like "college" and "the"
        df_1['match_key'] = df_1['match_key'].apply(format_match_string)
        df_2['match_key'] = df_2['match_key'].apply(format_match_string)
        # the match process: dump the match keys to lists, match, then save them in the match column
        r = df_1['match_key'].tolist()
        s = df_2['match_key'].tolist()
        m = df_1['match_key'].apply(lambda x: process.extract(
            x, s, limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio))
        df_1['match'] = m
        df_2_matches = df_2['match_key'].apply(lambda x: process.extract(x, r, limit=limit, scorer=fuzzywuzzy.fuzz.token_sort_ratio))\
            .apply(lambda x: [i[1] for i in x if i[1] < threshold])\
            .apply(lambda x: 1 if x else 0)  # 0 if empty list
        df_2 = df_2.merge(df_2_matches.rename('not_matched'), left_index=True, right_index=True)
        matching_dict.update(return_matching_dictionary(df_2, df_1, key, threshold))
        df_2 = df_2.loc[df_2['not_matched'] == 1]
        # drop the score value and only keep the match words
        m2 = df_1['match'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
        df_1['match'] = m2
        # merge based on the matches, suffixes to drop the columns later
        temp_df = df_1.merge(df_2, left_on='match', right_on='match_key', suffixes=['', '_y'])
        # add back in df1 values that were dropped
        df_1 = pd.concat([df_1, temp_df]).drop_duplicates(key)
        # add in df2 values that weren't matched
        df_1 = pd.concat([df_1, df_2]).drop_duplicates(key)
        # drop the matching name columns since this is a left join
        df_1 = df_1[df_1.columns.drop(list(df_1.filter(regex='_y')))]
        df_1 = df_1[key]
        df_1.reset_index(drop=True, inplace=True)
    return df_1, matching_dict
def get_matches(query, choices):
    results = process.extract(query, choices)
    return results
def get_best_plugin_name_match(self, plugin_name):
    choices = [p.replace('_', ' ') for p in self.bot.get_plugins_names()]
    plugin_name = plugin_name.replace('_', ' ')
    result = process.extract(plugin_name, choices, scorer=fuzz.token_sort_ratio)
    result = [(r[0].replace(' ', '_'), r[1]) for r in result]
    return result[0][0] if result[0][1] > 65 else None
# Append deduplicated census_b to census_a
full_census = census_a.append(unique_b)

# end timer
end = time.time()

### fuzzywuzzy
# Minimum Edit Distance (MED) is the least possible number of steps needed to
# transform one string into another. MED is calculated using only four
# operations: insertion, deletion, substitution, and transposing consecutive
# characters.
from fuzzywuzzy import fuzz

# The output returns a percentage between 0 and 100, 0 being not similar at
# all and 100 being identical:
fuzz.WRatio('Python', 'Cython')

# There are four other functions to compute string similarity:
fuzz.ratio
fuzz.partial_ratio
fuzz.token_sort_ratio
fuzz.token_set_ratio

# Extract the best matches to a string from a list of options
from fuzzywuzzy import process
string_to_match = 'Mercedez-Benz'
options = ['Ford', 'Mustang', 'mersedez benz', 'MAZDA', 'Mercedez']
process.extract(query=string_to_match, choices=options, limit=3)
# You can adjust the scoring method by setting scorer=fuzz.ratio
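# A small runnable illustration of the scorer parameter mentioned above
# (exact scores depend on the fuzzywuzzy version, so they are not shown):
from fuzzywuzzy import fuzz, process

options = ['Ford', 'Mustang', 'mersedez benz', 'MAZDA', 'Mercedez']
default_scores = process.extract('Mercedez-Benz', options, limit=3)
ratio_scores = process.extract('Mercedez-Benz', options, limit=3, scorer=fuzz.ratio)
print(default_scores)  # WRatio (the default) is lenient about case and word order
print(ratio_scores)    # the plain Levenshtein-style ratio is stricter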
from fuzzywuzzy import process

cities = ['Karachi', 'Lahore', 'Islamabad', 'Rawalpindi', 'Quetta',
          'Peshawar', 'Gawadar', 'Multan', 'Hyderabad', 'Faisalabad',
          'Gujranwala', 'Rahim Yar Khan']

# limit = 3
result = process.extract('hk', cities, limit=3)
print(result)
file_glossary = "glossary.txt" file_compare=sys.argv[1] comparision_results={} # For now we just do fuzzywuzzy compare and report highest score for each word in file_compare with open(file_glossary) as filedata_glossary: lines_glossary = [line.rstrip('.md\n') for line in filedata_glossary] with open(file_compare) as filedata_compare: lines_compare = [line.rstrip('\n') for line in filedata_compare] for item_in_compare in lines_compare: comparision_results = process.extract(item_in_compare, lines_glossary) print(comparision_results) # Workflow: turn document into a list of words and word count (data.csv) # Compare to glossary, results.csv with word, count of word in document, then top 3 matches? # have a cut off percentage? # # How to handle: # 1) exact matches (100) of primary file and long term of alias # 2) really close matches (perentage? additional string compares? do a simple poor mans test as well?) of primary file and long term of alias # 3) stuff with no match of primary file and long term of alias? # 4) how to handle the stop list, we'll need to put that data in as well # # We'll generate a CSV output file with the file_compare entries and then:
    names=['c', 'n'],
    index_col='c',
)

## Construct dictionary for country/region names
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_dict.html
c_names = df_results.to_dict()['n']
c_names_inv = {v: k for k, v in c_names.items()}

## Country names fuzzy match
from fuzzywuzzy import process

choice = []
for i, c_name_Web in enumerate(list_country_names_Web):
    # found_candidates = [x for x in c_names_inv.keys() if fuzzy_match(x, c_name_Web)]
    found_candidate = process.extract(c_name_Web, c_names_inv.keys(), limit=1)
    found_candidate_c = c_names_inv[found_candidate[0][0]]
    choice_item = [i, c_name_Web, found_candidate, found_candidate_c]
    # print(choice_item)
    choice.append(choice_item)

import ast

done = False
while not done:
    try:
        # Note: Python 2.x users should use raw_input, the equivalent of 3.x's input
        prn = [repr(x) for x in choice]
        print("\n\r".join(prn))
        i = int(input("Please enter your corrections: Serial no (-1:None): "))
        if i == -1:
            print("Done!")
async def newquotes(ctx, phrase, source):
    titre = "Recherche en cours veuillez patienter"
    msg = await ctx.send(titre)
    if len(source) >= 5000:
        list_s = split_dict_to_multiple(source, round(len(source) / 10))
    else:
        list_s = [source]
    results = {}
    i = 1
    full = "█"
    empty = "░"
    for x in list_s:
        Ratios = process.extract(phrase, list(x))
        print(f"{i}/{len(list_s)}")
        if i % 2 == 0:
            await msg.edit(content=f"{titre}\n{full*i*2}{empty*((len(list_s)-i)*2)}")
        for r in Ratios:
            if r[1] < 87:
                break
            results[r[0]] = r[1]
        i += 1
    results = list(dict(sorted(results.items(), key=lambda item: item[1], reverse=True)))
    print(results)
    answer = ""
    used = {}  # initialized up front so the clean-up below can't hit a NameError
    if len(results) != 1:
        txt = "**__LISTE DES QUOTES TROUVEE__**\n"
        emoji = ["1️⃣", "2️⃣", "3️⃣", "4️⃣", "5️⃣", "6️⃣", "7️⃣", "8️⃣", "9️⃣", "🔟"]
        i = 0
        for x in results:
            txt += f"{emoji[i]} {x} -> {source[x]['title']}, à {source[x]['time']}s\n"  # ({dict[x]['ep']}&t={dict[x]['time']})
            used[emoji[i]] = i
            i += 1
        await msg.edit(content=f"{txt}")
        for x in used:
            await msg.add_reaction(x)

        def check(reaction, user):
            return user == ctx.author and str(reaction.emoji) in used

        try:
            reaction, user = await bot.wait_for('reaction_add', timeout=40, check=check)
        except asyncio.TimeoutError:
            await msg.edit(content=f"Temps écoulé pour \"{phrase}\"")
            try:
                for x in used:
                    await msg.clear_reaction(x)
            except Exception:
                print("pas les bons roles")
            return
        answer = results[used[reaction.emoji]]
    else:
        answer = results[0]
    await msg.edit(content=f"{source[answer]['ep']}&t={source[answer]['time']}")
    try:
        for x in used:
            await msg.clear_reaction(x)
    except Exception:
        print("pas les bons roles")
def get_phone_name(self, phone_name):
    return process.extract(phone_name, self.phone_list, limit=1)[0]
def handleQuery(query) -> list:
    """Hook that is called by albert with *every new keypress*."""  # noqa
    results = []

    if query.isTriggered:
        try:
            # be backwards compatible with v0.2
            if "disableSort" in dir(query):
                query.disableSort()

            results_setup = setup(query)
            if results_setup:
                return results_setup

            query_str = query.string

            # new behavior
            tokens = query_str.split()
            if len(tokens) >= 1 and tokens[0] == "new":
                name = tokens[1] if len(tokens) > 1 else ""
                desc = " ".join(tokens[2:]) if len(tokens) > 2 else ""
                results.append(
                    v0.Item(
                        id=__prettyname__,
                        icon=icon_path,
                        text=f"New abbreviation: {name}",
                        subtext=f"Description: {desc}",
                        actions=[
                            v0.FuncAction(
                                "Save abbreviation to file",
                                lambda name=name, desc=desc: save_abbr(name, desc),
                            )
                        ],
                    ))
                return results

            # re-read the abbreviations file only when its hash changes
            curr_hash = hash_file(abbreviations_path)
            global abbr_latest_hash, abbr_latest_d, abbr_latest_d_bi
            if abbr_latest_hash != curr_hash:
                abbr_latest_hash = curr_hash
                with open(abbreviations_path) as f:
                    conts = f.readlines()
                abbr_latest_d = make_latest_dict(conts)
                abbr_latest_d_bi = abbr_latest_d.copy()
                abbr_latest_d_bi.update({v: k for k, v in abbr_latest_d.items()})

            if not abbr_latest_d:
                results.append(
                    v0.Item(
                        id=__prettyname__,
                        icon=icon_path,
                        text=f'No lines split by "{split_at}" in the file provided',
                        actions=[
                            v0.ClipAction(
                                "Copy provided filename",
                                str(abbreviations_path),
                            )
                        ],
                    ))
                return results

            # do fuzzy search on both the abbreviations and their descriptions
            matched = process.extract(query_str, abbr_latest_d_bi.keys(), limit=10)
            for m in [elem[0] for elem in matched]:
                if m in abbr_latest_d.keys():
                    results.append(get_abbr_as_item((m, abbr_latest_d[m])))
                else:
                    results.append(get_abbr_as_item((abbr_latest_d_bi[m], m)))

        except Exception:  # user to report error
            if dev_mode:  # let exceptions fly!
                print(traceback.format_exc())
                raise

            results.insert(
                0,
                v0.Item(
                    id=__prettyname__,
                    icon=icon_path,
                    text="Something went wrong! Press [ENTER] to copy error and report it",
                    actions=[
                        v0.ClipAction(
                            f"Copy error - report it to {__homepage__[8:]}",
                            f"{traceback.format_exc()}",
                        )
                    ],
                ),
            )

    return results
def create_entity_node_relationships(df, entity_name, global_id_counter, levenshtein_thresh=None):
    raw_entity_df = get_entity_df(df, entity_name.upper())
    entity_counts = raw_entity_df.name.str.lower().value_counts()
    if "" in entity_counts:
        del entity_counts[""]

    if levenshtein_thresh:
        most_common = entity_counts[:levenshtein_thresh]
        most_common_set = set(most_common.index)
        raw_to_resolved = {}
        resolved_entity_counts = defaultdict(int)
        for name in most_common.index:
            resolved_entity_counts[name] += most_common[name]
        for name in entity_counts.index:
            if name in most_common_set:
                continue
            (candidate_name, candidate_score), = process.extract(name, most_common_set, limit=1, scorer=fuzz.ratio)
            if candidate_score > 89:
                raw_to_resolved[name] = candidate_name
                resolved_entity_counts[candidate_name] += entity_counts[name]
            else:
                resolved_entity_counts[name] += entity_counts[name]
    else:
        raw_to_resolved = {}
        resolved_entity_counts = entity_counts

    resolved_entity_count_df = pd.DataFrame({"mentions": resolved_entity_counts})
    resolved_entity_count_df["id"] = [str(next(global_id_counter)) for _ in range(len(resolved_entity_count_df))]
    resolved_entity_count_df["name"] = resolved_entity_count_df.index
    resolved_entity_count_df = resolved_entity_count_df.set_index("id", drop=False)

    entity_df_n4j = pd.DataFrame({
        "entity{entity}Id:ID".format(entity=entity_name.capitalize()): resolved_entity_count_df.id,
        "name": resolved_entity_count_df.name,
        "mentions:int": resolved_entity_count_df.mentions,
        ":LABEL": "Entity_{entity}".format(entity=entity_name.capitalize())
    })
    save_node = "neo4j-csv/entity_{entity}.csv".format(entity=entity_name)
    entity_df_n4j.to_csv(save_node, index=False)

    raw_entity_df = raw_entity_df.drop_duplicates()
    raw_entity_df["name_format"] = raw_entity_df.name
    raw_entity_df["name_lower"] = raw_entity_df.name.str.lower()
    raw_entity_df["name"] = raw_entity_df.name.str.lower().apply(
        lambda n: raw_to_resolved[n] if n in raw_to_resolved else n)

    relationship_df = pd.merge(resolved_entity_count_df, raw_entity_df, on='name')
    mentions_n4j = pd.DataFrame({
        ":START_ID": relationship_df.emailId,
        ":END_ID": relationship_df.id,
        "as": relationship_df.name_format,
        ":TYPE": "MENTION"
    })
    save_relationship = "neo4j-csv/mentions_{entity}.csv".format(entity=entity_name)
    mentions_n4j.to_csv(save_relationship, index=False)
def search(list_to_search: list, value, key, cutoff=5, return_key=False, strict=False):
    """Fuzzy-searches a list for an object; the result can be either an
    object or a list of objects.

    :param list_to_search: The list to search.
    :param value: The value to search for.
    :param key: A function defining what to search for.
    :param cutoff: The scorer cutoff value for fuzzy searching.
    :param return_key: Whether to return the key of the object that matched or the object itself.
    :param strict: If True, will only search for exact matches.
    :returns: A two-tuple (result, strict)
    """
    # there is nothing to search
    if len(list_to_search) == 0:
        return [], False

    # full match, return result
    exact_matches = [a for a in list_to_search if value.lower() == key(a).lower()]
    if not (exact_matches or strict):
        partial_matches = [a for a in list_to_search if value.lower() in key(a).lower()]
        if len(partial_matches) > 1 or not partial_matches:
            names = [key(d).lower() for d in list_to_search]
            fuzzy_map = {key(d).lower(): d for d in list_to_search}
            fuzzy_results = [r for r in process.extract(value.lower(), names, scorer=fuzz.ratio) if r[1] >= cutoff]
            fuzzy_sum = sum(r[1] for r in fuzzy_results)
            fuzzy_matches_and_confidences = [(fuzzy_map[r[0]], r[1] / fuzzy_sum) for r in fuzzy_results]

            # display the results in order of confidence
            weighted_results = []
            weighted_results.extend((match, confidence) for match, confidence in fuzzy_matches_and_confidences)
            weighted_results.extend((match, len(value) / len(key(match))) for match in partial_matches)
            sorted_weighted = sorted(weighted_results, key=lambda e: e[1], reverse=True)

            # build results list, unique
            results = []
            for r in sorted_weighted:
                if r[0] not in results:
                    results.append(r[0])
        else:
            results = partial_matches
    else:
        results = exact_matches

    if len(results) > 1:
        if return_key:
            return [key(r) for r in results], False
        else:
            return results, False
    elif not results:
        return [], False
    else:
        if return_key:
            return key(results[0]), True
        else:
            return results[0], True
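# Hedged usage sketch for search(); the item records and the cutoff of 50 are
# illustrative, not from the original:
items = [{'name': 'Longsword'}, {'name': 'Shortbow'}, {'name': 'Longbow'}]
result, exact = search(items, 'long bow', key=lambda it: it['name'], cutoff=50)
print(result, exact)  # one candidate -> (item, True); several -> (list of items, False)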
def scrape():
    browser = init_browser()

    receivingURL = 'https://nextgenstats.nfl.com/stats/receiving#yards'
    browser.visit(receivingURL)
    html = browser.html
    soup = bs(html, 'html.parser')
    receivingtable = pd.read_html(str(soup.find_all('table')))
    recdf = pd.DataFrame(receivingtable[1])
    reccolumnlist = receivingtable[0].values.tolist()[0]
    reccolumnlist.pop()
    recdf.columns = reccolumnlist
    recdf = recdf.rename(index=str, columns={"+/-Avg .YAC Above Expectation": "+/-Avg YAC Above Expectation"})
    recdf_dict = recdf.to_dict(orient='records')

    rushingURL = 'https://nextgenstats.nfl.com/stats/rushing#yards'
    browser.visit(rushingURL)
    html = browser.html
    soup = bs(html, 'html.parser')
    rushingtable = pd.read_html(str(soup.find_all('table')))
    rushdf = pd.DataFrame(rushingtable[1])
    rushcolumnlist = rushingtable[0].values.tolist()[0]
    rushcolumnlist.pop()
    rushdf.columns = rushcolumnlist
    rushdf_dict = rushdf.to_dict(orient='records')

    passingURL = 'https://nextgenstats.nfl.com/stats/passing#yards'
    browser.visit(passingURL)
    html = browser.html
    soup = bs(html, 'html.parser')
    passingtable = pd.read_html(str(soup.find_all('table')))
    passdf = pd.DataFrame(passingtable[1])
    passcolumnlist = passingtable[0].values.tolist()[0]
    passcolumnlist.pop()
    passdf.columns = passcolumnlist
    passdf_dict = passdf.to_dict(orient='records')

    # List of each week number
    weeks = list(range(1, 18))

    # API URL
    base_url = "http://api.fantasy.nfl.com/v1/players/stats?statType=weekStats&season=2018&week={}&form=json"
    temp_final_df = pd.DataFrame()
    for week in weeks:
        target_url = base_url.format(week)
        temp = requests.get(target_url).json()['players']
        temp_df = pd.DataFrame(temp)
        temp_df = temp_df.drop(columns='stats')
        temp_df['week'] = week
        temp_final_df = temp_final_df.append(temp_df)

    team_names = temp_final_df.teamAbbr.unique()
    temp_dict = temp_final_df.to_dict(orient='records')

    data = pd.read_csv(r'C:\\Users\\rirvi\Documents\\NFLETL\\2018_Schedule_City.csv')
    data['away_abrev'] = data['Away'].str[0:4]
    data['home_abrev'] = data['Home'].str[0:4]

    # Fuzzy-match the four-letter prefixes to the API's team abbreviations
    away_abrev = []
    for awy in data['away_abrev']:
        away_abrev.append(process.extract(awy, team_names)[0][0])
    home_abrev = []
    for hme in data['home_abrev']:
        home_abrev.append(process.extract(hme, team_names)[0][0])

    data['away_abrev'] = away_abrev
    data['home_abrev'] = home_abrev
    schedule_dict = data.to_dict(orient='records')

    return passdf_dict, rushdf_dict, recdf_dict, temp_dict, schedule_dict
def albertson_main(file, pages):
    temp_directory = tempfile.TemporaryDirectory(dir=document_location)
    input_pdf_location = f'{temp_directory.name}/input_pdf.pdf'
    final_dict = {}
    overall_content_list = []
    input_pdf = get_smb_or_local(file, input_pdf_location)
    pdfplumber_pdf = pdfplumber.open(input_pdf)
    for page in pages.split(','):
        print(f'{page}')
        if int(page) - 1 in range(len(pdfplumber_pdf.pages)):
            page_dict = {}
            tables = camelot.read_pdf(input_pdf, pages=page, flavor='stream', row_tol=12, edge_tol=500)
            no_of_tables = len(tables)
            chunked_df = {}
            for table_no in range(no_of_tables):
                chunk_index_list = []
                chunk = {}
                df = tables[table_no].df
                rows, columns = df.shape
                # Mark the rows where a known section title appears
                for column in range(columns):
                    for row in range(rows):
                        search_query = df[column][row]
                        for title, regex_pattern in title_card_dict.items():
                            if re.search(r"{}".format(regex_pattern), search_query, re.I):
                                chunk_index_list.append({'title': title, 'index': row})
                            else:
                                chunk_index_list.append({'title': 'NO TITLE', 'index': 0})
                # Deduplicate the markers and sort them by row index
                chunk_index_list = sorted(list({
                    frozenset(list_element.items()): list_element
                    for list_element in chunk_index_list
                }.values()), key=lambda d: d['index'])
                # Turn consecutive markers into [start, end) row ranges
                for index, title_index_dict in enumerate(chunk_index_list):
                    try:
                        chunk[title_index_dict['title']] = [title_index_dict['index'], chunk_index_list[index + 1]['index']]
                    except Exception:
                        chunk[title_index_dict['title']] = [title_index_dict['index']]
                for title, chunk_list in chunk.items():
                    try:
                        temp_df = df.loc[chunk_list[0]:chunk_list[1] - 1]
                        temp_df = temp_df.reset_index(drop=True)
                        rows, columns = temp_df.shape
                        if columns > 2:
                            out = is_certification_or_nutrition(temp_df)
                            if out not in ['None']:
                                title = out
                        chunked_df[title] = temp_df
                    except Exception:
                        # The last chunk has no end index; take everything to the bottom
                        temp_df = df.loc[chunk_list[0]:]
                        temp_df = temp_df.reset_index(drop=True)
                        rows, columns = temp_df.shape
                        if columns > 2:
                            out = is_certification_or_nutrition(temp_df)
                            if out not in ['None']:
                                title = out
                        chunked_df[title] = temp_df
            for df_title, dataframe in chunked_df.items():
                if df_title not in ["COMPANY CONTACT INFORMATION", "NUTRITION", "NUTRITION_SERVING", "CERTIFICATIONS"]:
                    rows, columns = dataframe.shape
                    if columns >= 2 and rows > 1:
                        # dataframe preprocessing
                        df_pro = dataframe.applymap(cleaning_unwanted_titles)
                        df_pro = cleaning_unwanted_headers(df_pro)
                        df_pro = noise_removal_1(df_pro)
                        df_pro = df_pro.applymap(cleaning_sub_texts)
                        df_pro = noise_removal_2(df_pro)
                        df_pro = df_pro.applymap(check_nan)
                        df_pro = df_pro.dropna(axis=1, how='all')
                        df_pro = df_pro.dropna(axis=0, how='all')
                        df_pro.columns = range(df_pro.shape[1])
                        df_pro = df_pro.reset_index(drop=True)
                        df_pro = ffill_block_strategy(df_pro)
                        df_pro[0].fillna(method='ffill', axis=0, inplace=True)
                        df_pro = df_pro.applymap(convert_for_bfill_strategy)
                        df_pro[0].fillna(method='bfill', axis=0, inplace=True)
                        df_pro[0].fillna(method='ffill', axis=0, inplace=True)
                        df_pro.fillna('', inplace=True)
                        content_dict, content_list = normal_content_processing(df_pro)
                        overall_content_list.extend(content_list)
                        if page != '1':
                            plumber_content_list = get_overall_content(input_pdf, page)
                            plumber_content_list = list(set(plumber_content_list))
                            unmapped_element = []
                            for plumber_content in plumber_content_list:
                                _, plumb_score = process.extract(
                                    plumber_content.lower(), overall_content_list,
                                    scorer=fuzz.partial_token_set_ratio)[0]
                                _, plumb_score1 = process.extract(
                                    plumber_content.lower(), overall_content_list,
                                    scorer=fuzz.ratio)[0]
                                if (plumb_score < 90) or (plumb_score > 90 and plumb_score1 < 70):
                                    unmapped_element.append(plumber_content)
                            for content in unmapped_element:
                                output = base('general', model_location).prediction(content)
                                print(output)
                                if output['output'] in ['ingredients']:
                                    if 'INGREDIENTS_DECLARATION' in content_dict:
                                        content_dict['INGREDIENTS_DECLARATION'].append({'en': content})
                                    else:
                                        content_dict['INGREDIENTS_DECLARATION'] = [{'en': content}]
                                else:
                                    if 'unmapped' in content_dict:
                                        content_dict['unmapped'].append({'en': content})
                                    else:
                                        content_dict['unmapped'] = [{'en': content}]
                        page_dict = {**page_dict, **content_dict}
                else:
                    rows, columns = dataframe.shape
                    if columns >= 2 and rows >= 1:
                        if "NUTRITION" in df_title:
                            nutrition_data = nutrition_processing(dataframe)
                            try:
                                if 'serving size' in page_dict:
                                    page_dict['serving size'].append({'en': nutrition_data['serving size'][0]})
                                else:
                                    page_dict['serving size'] = [{'en': nutrition_data['serving size'][0]}]
                                nutrition_data.pop('serving size', None)
                            except Exception:
                                pass
                            try:
                                if 'varied' in page_dict:
                                    page_dict['varied'].append({'en': nutrition_data['varied'][0]})
                                else:
                                    page_dict['varied'] = [{'en': nutrition_data['varied'][0]}]
                                nutrition_data.pop('varied', None)
                            except Exception:
                                pass
                            if nutrition_data:
                                if 'NUTRITION_FACTS' in page_dict:
                                    page_dict['NUTRITION_FACTS'].append(nutrition_data)
                                else:
                                    page_dict['NUTRITION_FACTS'] = [nutrition_data]
                        elif df_title == "CERTIFICATIONS":
                            df_pro = dataframe.applymap(cleaning_unwanted_titles)
                            df_pro = cleaning_unwanted_headers(df_pro)
                            df_pro = noise_removal_1(df_pro)
                            df_pro = df_pro.applymap(cleaning_sub_texts)
                            df_pro = noise_removal_2(df_pro)
                            df_pro = df_pro.applymap(check_nan)
                            df_pro = df_pro.dropna(axis=1, how='all')
                            df_pro = df_pro.dropna(axis=0, how='all')
                            df_pro.columns = range(df_pro.shape[1])
                            df_pro = df_pro.reset_index(drop=True)
                            df_pro[0].fillna(method='ffill', axis=0, inplace=True)
                            df_pro[0].fillna(method='bfill', axis=0, inplace=True)
                            df_pro.fillna('', inplace=True)
                            certification_data = certifications_processing(df_pro)
                            page_dict = {**page_dict, **certification_data}
            final_dict[page] = page_dict
    try:
        temp_directory.cleanup()
    except Exception:
        pass
    return final_dict
def search_columns(self, colname, limit=5):
    from fuzzywuzzy import process
    extracted = process.extract(colname, self.all_columns.keys(), limit=limit)
    return [x[0] for x in extracted]
correct_with_rating = []

# Initial CSVs are read in here; the madris dataframe is the one most likely to be altered
df_madris = pd.read_csv("madris.csv")
df_harvard = pd.read_csv("my_harvard.csv")

df_madris['DEGREE DESCR'] = 'X' + df_madris['DEGREE DESCR'].astype(str)
df_madris['DEGREE DESCR'] = df_madris['DEGREE DESCR'].str.upper()

df_temp_madris = df_madris['DEGREE DESCR'].values
df_temp_harvard = df_harvard['Degree'].values

# Using extract rather than extractOne shows the first and second best fits to the tested data
for row in df_temp_madris:
    x = process.extract(row, df_temp_harvard, limit=2)
    correct_with_rating.append({
        "Correct_Degree_1(my.harvard)": x[0],
        "Tested_Degree(madris)": row.strip(" "),
        "Correct_Degree_2(my.harvard)": x[1],
        "Rating_best_fit": x[0][1]
    })

df_processed_data = pd.DataFrame(correct_with_rating)
df_processed_data = df_processed_data.sort_values(by="Rating_best_fit", ascending=False)
# columnsTitles = ["Correct_Degree(my.harvard)", "Tested_Degree(madris)", "Rating"]
# df_processed_data = df_processed_data.reindex(columns=columnsTitles)
print(df_processed_data)
layers45 = []
for key in pdk45_csv.keys():
    if key != "name":
        layers45.append(key.split('LAYER')[0])

layers15 = []
for key in pdk15_csv.keys():
    if key != "name":
        layers15.append(key.split('LAYER')[0])

final_pairs = []
list1 = layers45
list2 = layers15
while True:
    pairs = []
    for key in list1:
        result = process.extract(key, list2, limit=2, scorer=scorer)
        match = result[0][0]
        score = result[0][1]
        pairs.append((key, match, score))
        # print(key, ':', match, score)
    max_score = 50
    perfect_match = False
    best_pair = []
    for pair in pairs:
        key, match, score = pair
        if score == 100:
            perfect_match = True
            list1.remove(key)
            list2.remove(match)
            print("matching", key, match, score)
def company_name(content_and_scores):
    content = content_and_scores[0]
    sentiment = content_and_scores[1]
    wordtoke = nltk.word_tokenize(content)
    wordtag = nltk.pos_tag(wordtoke)
    nouns = findtags("NN", wordtag)
    main_noun = nouns['NNP'][0][0]

    # Collect every proper noun (singular and plural) as a candidate
    poscomp = []
    pronouns = nouns['NNP']
    try:
        pronouns_s = nouns['NNPS']
        for pronoun in pronouns:
            poscomp.append(pronoun[0])
        for p in pronouns_s:
            poscomp.append(p[0])
    except Exception:
        for pronoun in pronouns:
            poscomp.append(pronoun[0])

    path1 = "/Users/Master Soe/webscrap/Stock Companies/A-Z_companies.csv"
    f = open(path1, newline='')
    reader = csv.reader(f)
    data = [row for row in reader]

    path2 = "/Users/Master Soe/webscrap/Stock Companies/companynames.txt"
    k = open(path2, newline='')
    text = k.read()
    k.close()

    company = poscomp
    badwordlist = [
        "Corporation", "Co.", "Incorporated", "Inc.", "Company",
        "Communications", "Fund",  # a missing comma here used to fuse these two strings
        "Trust", "Investment", "Associates", "NYSE", "NASDAQ", "Stock",
        "Securities", "Bloomberg"
    ]
    # try:
    #     for comp in poscomp:
    #         if re.search(comp, text):
    #             company.append(comp)
    #             raise StopIteration
    # except StopIteration:
    #     pass

    scores1 = []
    companylist = []
    TICKERlist = []
    # Split the CSV rows into company names and ticker symbols
    for companydetails in data:
        companylist.append(companydetails[1])
        TICKERlist.append(companydetails[0])

    # Drop candidates that are just common stock-market words
    for badword in badwordlist:
        for fx in company:
            if re.search(fx, badword):
                company.remove(fx)

    # Remove backslash-containing tokens (done twice to catch items skipped
    # while removing from the list being iterated)
    for x in company:
        if re.search('\\\\', x):
            company.remove(x)
    for x in company:
        if re.search('\\\\', x):
            company.remove(x)

    print("SENTIMENT SCORE", sentiment)
    print("List of possible companies")
    print(company)

    # Makes permutations for the candidate words (disabled)
    # _gen = (itertools.permutations(company, i + 1) for i in range(len(company)))
    # all_permutations_gen = itertools.chain(*_gen)
    # results = [x for x in all_permutations_gen]
    # k = []
    # for x in results:
    #     j = ""
    #     for y in x:
    #         j = j + y + " "
    #     k.append(j)

    # Check the match score of each candidate against a company name or ticker
    for x in company:
        if re.search('[a-z]+', x) is None:
            possible_company_score = process.extract(x, TICKERlist, limit=3)
        else:
            possible_company_score = process.extract(x, companylist, limit=3)
        for x in possible_company_score:
            scores1.append(x)

    c = Counter(scores1)
    guess_company = c.most_common()
    i = []
    for g in guess_company:
        i.append(g[0])
    print(" ")

    def custom_sort(t):
        return t[1]

    i.sort(key=custom_sort, reverse=True)
    print("The stock company(s) is", i[0:5])
    return guess_company
def fuzzy_wuzzy(m):
    movie_title = process.extract(m, movie_titles)[0][0]
    return movie_title
def didyoumean(input_command):
    return process.extract(input_command, command_lib.keys(), scorer=fuzz.partial_ratio, limit=1)[0][0]
cou2 = country.sort_values('country_txt')
# Remove the separator spaces from the country names in the original data
cou2['country_txt'] = cou2['country_txt'].str.replace(' ', '')
cou = pd.merge(cou1, cou2, left_on=['国家'], right_on=['country_txt'], how='outer')

# Fuzzy-match the names that could not be matched directly
a = cou[cou.isnull().any(axis=1)]
a1 = a['国家'].dropna().values.tolist()
a2 = a['country_txt'].dropna().values.tolist()
dic = dict()
for t in a2:
    x = process.extract(t, a1, limit=2)
    if (x[0][1] >= 68 or x[0][1] == 50) and (x[0][1] != 72):
        dic[t] = x[0][0]
t = [a for a, b in enumerate(cou['country_txt']) if b in dic]
cou.iloc[t, 0] = list(dic.values())
cou = cou.iloc[:, :2].dropna()
cou['country'] = cou['country'].astype(int)

# Merge the country codes into the eco economic dataset
eco1 = pd.merge(eco, cou, on=['国家'], how='right')
eco1.rename(columns={'年份': 'iyear'}, inplace=True)
eco1 = eco1.drop('国家', axis=1)

# Join the economic data into the terrorist-attack table by country code and year
eco1 = eco1.sort_values(['iyear', 'country'])
data4 = pd.merge(data3, eco1, on=['country', 'iyear'], how='left')
# Flask search endpoint over the contract/clause CSV collections. Assumed
# imports: math; pandas as pd; from flask import request, jsonify; from
# fuzzywuzzy import fuzz, process. The CORS helpers are defined elsewhere.
def search():
    if request.method == "OPTIONS":  # CORS preflight
        return _build_cors_prelight_response()
    elif request.method == 'GET':
        totalpages = 0
        content = request.get_json()
        df = pd.read_csv("./contractCollection.csv", error_bad_lines=False)
        df = df.fillna("False")
        try:
            cont_type = content["cont_type"]
            clause_category = content["clause_category"]
            tag = content["tag"]
            text = content["text"]
            pages = content["page"]
        except (KeyError, TypeError):
            return _corsify_actual_response(jsonify("parameters error"))

        # Narrow the contract collection by contract type and/or clause
        # category; fall back to a non-empty placeholder list so the clause
        # search below still runs.
        if cont_type != "" and clause_category != "":
            contain_values = df[df['name'].str.contains(cont_type)]
            result = contain_values[
                contain_values['ClausesCategories'].str.contains(clause_category)]
        elif cont_type != "":
            result = df[df['name'].str.contains(cont_type)]
        elif clause_category != "":
            result = df[df['ClausesCategories'].str.contains(clause_category)]
        else:
            result = []
        if len(result) == 0:
            result = [1, 2]

        if len(result) > 0:
            df2 = pd.read_csv("./ClausesCategoriesCollection.csv",
                              error_bad_lines=False).fillna("False")
            claid = df2["_id"].tolist()
            claname = df2["name"].tolist()

            def load_clauses():
                return pd.read_csv("./ClauseCollection.csv",
                                   error_bad_lines=False).fillna("False")

            def row_to_entry(row):
                # Resolve the clause's category name through its clauseID.
                return {
                    "description": row["description"],
                    "clause_type": row["name"],
                    "category": claname[claid.index(row["clauseID"])],
                    "tag": row["tags"],
                }

            def page_rows(rows):
                # 'pages' is used as a row offset into the filtered frame;
                # at most ten entries are returned per page.
                total = math.ceil(len(rows) / 10)
                entries = [row_to_entry(row)
                           for _, row in rows.iloc[pages:].iterrows()][:10]
                return total, entries

            def text_search(df3):
                # Fuzzy-match the free-text query against clause descriptions
                # and map each hit back to its row by position.
                entities = df3['name'].tolist()
                descriptions = df3['description'].tolist()
                tagss = df3['tags'].tolist()
                claidss = df3['clauseID'].tolist()
                results = process.extract(text, descriptions,
                                          scorer=fuzz.token_sort_ratio)
                entries = []
                for match, score in results[:10]:
                    idx = descriptions.index(match)
                    entries.append({
                        "description": match,
                        "clause_type": entities[idx],
                        "category": claname[claid.index(claidss[idx])],
                        "tag": tagss[idx],
                    })
                return 1, entries

            def respond(total, entries):
                return _corsify_actual_response(
                    jsonify([{"pages": total}, {"data": entries}]))

            if clause_category != "":
                contain_values = df2[df2['name'].str.contains(clause_category)]
                if len(contain_values) > 0:
                    # The clause category exists: filter by tag + category,
                    # falling back to category only if nothing matched.
                    ids = int(contain_values["_id"])
                    df3 = load_clauses()
                    rows = df3.loc[(df3['tags'] == tag)
                                   & (df3['clauseID'] == ids)]
                    totalpages, data = page_rows(rows)
                    if len(data) == 0:
                        totalpages, data = page_rows(
                            df3.loc[df3['clauseID'] == ids])
                    return respond(totalpages, data)
                elif tag != "":
                    # The category is not on record, but a tag was given.
                    df3 = load_clauses()
                    totalpages, data = page_rows(df3.loc[df3['tags'] == tag])
                    return respond(totalpages, data)
                elif text != "":
                    # No tag either: fall back to full-text fuzzy search.
                    return respond(*text_search(load_clauses()))
                else:
                    return _corsify_actual_response(jsonify({}))
            elif tag != "":
                df3 = load_clauses()
                totalpages, data = page_rows(df3.loc[df3['tags'] == tag])
                return respond(totalpages, data)
            elif text != "":
                return respond(*text_search(load_clauses()))
            else:
                return _corsify_actual_response(jsonify({}))
        else:
            return _corsify_actual_response(jsonify("error"))
def _get_matches(self, name, match_limit):
    # Rank self.friends against the query with the Unicode-aware
    # weighted ratio scorer.
    return process.extract(name, self.friends,
                           scorer=fuzz.UWRatio, limit=match_limit)
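# Hypothetical usage of _get_matches, assuming self.friends holds display
# names:
#   bot._get_matches("jon", match_limit=3)
#   -> up to three (name, score) pairs, best matches first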