def test_dedupe(self):
    """We should be able to use a list-like object for contains_dupes"""
    # Test 1
    contains_dupes = [
        "Frodo Baggins",
        "Tom Sawyer",
        "Bilbo Baggin",
        "Samuel L. Jackson",
        "F. Baggins",
        "Frody Baggins",
        "Bilbo Baggins",
    ]
    result = process.dedupe(contains_dupes)
    self.assertTrue(len(result) < len(contains_dupes))

    # Test 2
    contains_dupes = ["Tom", "Dick", "Harry"]
    # we should end up with the same list since it contains no duplicates
    # (i.e. the original list is returned)
    deduped_list = ["Tom", "Dick", "Harry"]
    result = process.dedupe(contains_dupes)
    self.assertEqual(result, deduped_list)
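# A minimal, standalone sketch (not part of the test suite) of what
# process.dedupe does with the first list above, assuming fuzzywuzzy's
# defaults (threshold=70, scorer=fuzz.token_set_ratio): near-matches such as
# "Frodo Baggins" / "F. Baggins" / "Frody Baggins" collapse into a single
# representative, which dedupe picks as the longest (then alphabetically
# first) member of each duplicate group.
from fuzzywuzzy import process

names = ["Frodo Baggins", "Tom Sawyer", "Bilbo Baggin", "Samuel L. Jackson",
         "F. Baggins", "Frody Baggins", "Bilbo Baggins"]
print(process.dedupe(names, threshold=70))  # fewer entries than the input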
def fuzzy_search(self, file):
    expected_org_list = ['High Top Brewing', 'Holiday Inn Hotel Washington']
    df = pd.read_csv(file)
    print(df.head(-1))
    print('')
    org_data = [row[0] for row in df.values]
    # dedupe returns a new list; keep the result (the original call
    # discarded it, leaving org_data unchanged)
    org_data = process.dedupe(org_data, threshold=80)
    print("The expected companies: " + str(expected_org_list) + "\n")
    print('The companies are matched as follows.')
    for query in expected_org_list:
        # scorer could be ratio, partial_ratio, token_sort_ratio or token_set_ratio
        # result = process.extract(query, org_data, scorer=fuzz.partial_ratio, limit=2)
        result = process.extractBests(query, org_data, scorer=fuzz.partial_ratio,
                                      score_cutoff=70, limit=2)
        print(result)
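# A hedged illustration of the scorer choices mentioned in the comment above.
# The two strings are made up for demonstration; the four scorers are the
# real fuzzywuzzy ones.
from fuzzywuzzy import fuzz

a, b = "Holiday Inn Hotel Washington", "Washington Holiday Inn"
print(fuzz.ratio(a, b))             # strict character-level similarity
print(fuzz.partial_ratio(a, b))     # best-matching substring
print(fuzz.token_sort_ratio(a, b))  # order-insensitive token comparison
print(fuzz.token_set_ratio(a, b))   # also forgives tokens missing from one side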
def merge_school(resume, ent):
    a = extract_university_google(resume, ent)
    tokenizer = RegexpTokenizer(r'\w+')
    resume_token = tokenizer.tokenize(resume)
    resume_token_lower = [item.lower() for item in resume_token]

    # Load the university name lists for China, India and the US, all of
    # which live under www/Parsing relative to this module.
    parsing_dir = os.path.abspath(
        os.path.join(os.path.dirname(__file__), '..', '..', 'www', 'Parsing'))
    university_combined = []
    for filename in ('China_University.xlsx', 'India_University.xlsx',
                     'US_University.xlsx'):
        df = pandas.read_excel(os.path.join(parsing_dir, filename))
        university_combined += [item.lower() for item in df['Universities'].values]

    b = extract_university(resume_token_lower, university_combined)
    c = [item.lower() for item in a]
    d = list(dedupe(b + c))
    return d
def remove_dupe_neighbours(self, neighbours):
    """Removes duplicates from list of restaurant neighbours"""
    # Remove names that are very similar - use fuzzy comparison
    name_list = [
        self.restaurant_name_df['name'][self.restaurant_name_df.index == x[0]].values[0]
        for x in neighbours
    ]
    name_list_no_dupes = list(process.dedupe(name_list, threshold=90))
    elim = list(set(name_list) - set(name_list_no_dupes))  # names that were eliminated

    index_list = [x[0] for x in neighbours]
    subset = self.restaurant_name_df.loc[index_list]
    subset = subset[~subset['name'].isin(elim)]

    # Remove exact duplicates
    exact_dupe_names = subset[subset['name'].duplicated(keep=False)]
    exact_dupe_names = exact_dupe_names.sort_values(by=['name', 'stars'],
                                                    ascending=True)
    # Drop match with fewest stars
    exact_dupe_names = exact_dupe_names.iloc[::2, :]
    subset.drop(index=exact_dupe_names.index, inplace=True)

    neighbours_no_dupes = [
        neighbour_tuple for neighbour_tuple in neighbours
        if neighbour_tuple[0] in subset.index
    ]
    return neighbours_no_dupes
def filter_text(text):
    # Keep only sentences longer than 40 characters; a dict is used so that
    # exact duplicates are dropped while insertion order is preserved.
    filtered_text = dict()
    for sentence in text:
        if len(sentence) > 40:
            filtered_text[sentence] = None
    filtered_text = list(filtered_text.keys())
    # then remove near-duplicates with fuzzy matching
    deduped_text = list(dedupe(filtered_text))
    return deduped_text
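# The dict in filter_text drops exact duplicates while preserving order.
# A more idiomatic equivalent for that step, shown as a sketch (the sample
# sentences below are made up):
sentences = [
    "This is a sentence that is comfortably longer than forty characters.",
    "Short one.",
    "This is a sentence that is comfortably longer than forty characters.",
]
unique_in_order = list(dict.fromkeys(s for s in sentences if len(s) > 40))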
def find_exp_header(resume, experience_list):
    exp_header_list = []
    for word in experience_list:
        if resume.find(word) != -1:
            exp_header_list.append(word)
    # remove duplicates of experience header
    exp_header = list(dedupe(exp_header_list))
    return exp_header
def get_name_list_per_issue(ann_file):
    """Remove duplicate names in the name lists per issue"""
    with open(ann_file, encoding='utf8') as f:
        data = f.read().splitlines()
    new_data = []
    for line in data:
        line = line.replace('\t', ' ')
        new = line.split(' ')
        new_data.append(new)
    # keep PER (person) annotations; the name text starts at the 5th field
    results = [' '.join(t[4:]) for t in new_data if t[1] == 'PER']
    results_dedupe = list(process.dedupe(results, threshold=80))
    return results_dedupe
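# Hedged sketch of the per-line format this parser appears to expect:
# brat-style .ann fields, where field 1 is the entity type and the entity
# text starts at field 4. The sample line below is invented for illustration.
sample = "T1\tPER 100 110\tJohn Smith"
fields = sample.replace('\t', ' ').split(' ')
assert fields[1] == 'PER'
assert ' '.join(fields[4:]) == 'John Smith'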
def fuzzy_movie_search(self, file):
    df = pd.read_csv(file)
    print(df.head(-1))
    print('')
    misspelled_film_list = [row[0] for row in df.values]
    movie_data = [row[1] for row in df.values]
    # dedupe returns a new list; keep the result (calling it without
    # assignment has no effect)
    movie_data = process.dedupe(movie_data, threshold=80)
    print("The misspelled movies: " + str(misspelled_film_list) + "\n")
    print('The movies are matched as follows.')
    for query in misspelled_film_list:
        # scorer could be ratio, partial_ratio, token_sort_ratio or token_set_ratio
        result = process.extractBests(query, movie_data, scorer=fuzz.partial_ratio,
                                      score_cutoff=70, limit=2)
        print("'" + query + "' matches with the films with scores: " + str(result))
def dedupe(self, threshold=95):
    """dedupe(threshold=95)

    Produces a fuzzily de-duplicated version of the candidate addresses,
    using :code:`fuzzywuzzy.process.dedupe`.

    Note:
        See
        https://github.com/seatgeek/fuzzywuzzy/blob/master/fuzzywuzzy/process.py
        for detail on the deduplication algorithm implementation. This
        method does not modify the :code:`Address.addresses` property.

    Kwargs:
        threshold (int): the numerical value (0,100) point at which you
            expect to find duplicates. Defaults to 95 out of 100, which is
            higher than the fuzzywuzzy default (70); this higher threshold
            is used by default since addresses are more sensitive to small
            changes (e.g. "250 Main Street" and "150 Main Street" have a
            small edit distance when considered as strings, but may have a
            reasonably large physical distance when considered as physical
            addresses).

    Returns:
        A list of deduplicated address strings (essentially a filtered
        version of the original set).
    """
    return fuzzyprocess.dedupe([str(a) for a in self.addresses], threshold)
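# A hedged illustration of the threshold reasoning in the docstring above:
# with fuzzywuzzy's default scorer (fuzz.token_set_ratio), the two example
# addresses score in the low 90s (roughly 93, though the exact value may
# vary by version), so threshold=95 keeps them distinct while the library
# default of 70 would merge them.
from fuzzywuzzy import fuzz, process

a, b = "250 Main Street", "150 Main Street"
print(fuzz.token_set_ratio(a, b))            # roughly 93
print(process.dedupe([a, b], threshold=95))  # both kept: score < 95
print(process.dedupe([a, b], threshold=70))  # merged into one entry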
def extract_major(majors_minors_all):
    # remove near-duplicate majors/minors from the combined list
    # (the original immediately overwrote the parameter with
    # updated_majors1..4, which are undefined here; use the argument instead)
    majors_minors_final_list = list(dedupe(majors_minors_all))
    return majors_minors_final_list
# Fragment: items, source_list, target_list, new_list2 and correct_road
# are defined earlier in the original script.
element = items.split()
if len(element) == 1:
    target_list.append(items)

# drop from target_list any token that already appears in source_list
for element in source_list:
    for vale in element:
        if vale in target_list:
            target_list.remove(vale)

list3 = source_list + target_list

# combining the strings
for index, value in enumerate(list3):
    if isinstance(value, list):
        list3[index] = " ".join(value)
print("step 3: displaying list3 combined", list3)

# Applying fuzzy again
for ind, varn in enumerate(list3):
    new_list2.insert(ind, correct_road(list3, varn, ind))
flist = set(new_list2)
print("step 4: displaying new_list2", flist)

listx = process.dedupe(flist, threshold=40)
print(list(listx))
def execute_extract(req: ExtractRequest, ctx) -> ExtractResponse:
    summary = ctx['services'].extractive_summarizer.extract_summary(req.text)
    summary_sents = sent_tokenize(summary)
    # drop near-duplicate sentences from the extractive summary
    deduped_summary_sents = list(process.dedupe(summary_sents))
    return ExtractResponse(sentences=deduped_summary_sents)
def create():
    venue_id = request.json.get('venue_id')
    access_token = request.json.get('access_token')

    global sp
    sp = spotipy.Spotify(auth=access_token)
    # get username from current user after auth
    username = sp.me()['id']
    print(f'\nSpotify User: {username}')
    global USERNAME
    USERNAME = username

    print("\n>>> ShowQuester started.")
    print(f"*****************{venue_name}*******************")
    #venue_id = get_venue_id(venue_name, venue_city)
    if venue_id:
        event_list = get_venue_events(venue_id)
        shows = events_df(event_list)
        artist_list = list(shows.artist)
        # remove near-exact duplicates (threshold=99) from the list of artists
        artist_list = list(process.dedupe(artist_list, threshold=99,
                                          scorer=fuzz.token_sort_ratio))

        my_playlists = get_my_public_playlists(USERNAME)
        # search playlists for a ShowQuester venue playlist
        venue_playlist = [(name, uri) for name, uri in my_playlists.items()
                          if venue_name in name]
        if venue_playlist:
            playlist_name, playlist_uri = venue_playlist[0]
            print(f'Found ShowQuester playlist for {venue_name} named "{playlist_name}"')
        else:
            # if venue playlist missing, create a new ShowQuester playlist
            print(f'No playlist found for "{venue_name}"')
            playlist_name, playlist_uri = create_sq_playlist(venue_name, venue_city,
                                                             venue_state)
        # derive playlist_id from playlist_uri
        playlist_id = playlist_uri.split(':')[2]

        # retrieve all artist objects for performing artists
        artist_obj = []
        print("...SEARCHING Spotify for ARTISTS...")
        for artist in artist_list:
            artist_obj.append(get_artist(artist))

        # pull one top track per artist to be added to playlist
        tracks_to_add = []
        print("...SEARCHING Spotify for TOP TRACKS...")
        for artist in artist_obj:
            if artist is not None:
                artist_uri = artist['uri']
                track = get_top_track(artist_uri)
                tracks_to_add.append(track)
        # filter out empty strings where no track was found
        tracks_to_add = list(filter(None, tracks_to_add))

        # batch tracks into 100's to respect Spotify Web API limits
        track_batches = list(chunks(tracks_to_add, 100))
        print('...UPDATING SHOWQUESTER PLAYLIST TRACKS...')
        for batch_num, track_uris in enumerate(track_batches):
            if batch_num == 0:
                # first batch replaces the playlist's existing tracks
                result = sp.user_playlist_replace_tracks(USERNAME, playlist_id,
                                                         track_uris)
            else:
                # subsequent batches are appended
                results = sp.user_playlist_add_tracks(USERNAME, playlist_id,
                                                      track_uris)

        print('...UPDATING SHOWQUESTER PLAYLIST DESCRIPTION...')
        playlist_descr = build_playlist_description(venue_name, venue_url,
                                                    venue_city, venue_state)
        results = update_playlist_details(playlist_id, playlist_name, playlist_descr)
        print(f"COMPLETED: {venue_name}\n\n")
    return "Done"