def fuzzy_address_matcher(fuzzy_list, clean_list, thresh=0.5):
    """Map every entry of ``fuzzy_list`` to its closest entry in ``clean_list``.

    A ``FuzzySet`` index is built from ``clean_list`` and queried once per
    fuzzy entry. Candidates whose first ``'_'``-separated token (the state,
    per the original author's note) equals that of the query are preferred;
    if none share it, the overall best candidate is used instead.

    Parameters
    ----------
    fuzzy_list : list of str or pandas.Series
        Entries to be matched. A Series is converted to a list.
    clean_list : list of str or pandas.Series
        Reference entries. A Series is de-duplicated and converted to a list.
    thresh : float, optional
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    list
        One element per entry of ``fuzzy_list``, in order: the matched clean
        entry, or ``None`` when the index returns no candidate at all.
    """
    if isinstance(fuzzy_list, pd.Series):
        fuzzy_list = fuzzy_list.tolist()
    if isinstance(clean_list, pd.Series):
        clean_list = clean_list.unique().tolist()

    # A near-zero relative cutoff keeps low-scoring candidates in the result,
    # so the first-token filter below has something to choose from.
    index = FuzzySet(rel_sim_cutoff=0.000001)
    for clean in clean_list:
        index.add(clean)

    out_list = []
    for fuzzy in fuzzy_list:
        # Query once; the same result serves both the filtered and the
        # unfiltered choice.
        candidates = index.get(fuzzy)
        if not candidates:
            # Nothing to pick from: keep the output aligned with the input.
            out_list.append(None)
            continue

        first_word = fuzzy.split('_')[0]
        same_first_word = [
            cand for cand in candidates
            if cand[1].split('_')[0] == first_word
        ]  # match state at least
        best = same_first_word[0] if same_first_word else candidates[0]
        out_list.append(best[1])  # element 1 of a candidate is the matched string
    return out_list
def base_check(self, dict_list, word_list, lem=True):
    """
    Fuzzy-match each word against a dictionary and optionally lemmatize
    the best match

    Arguments
    ---------
    dict_list: list
        list containing pre-loaded language dictionary
    word_list: list
        list containing the passwords to check
    lem: Boolean
        Select whether or not to use lemmatization, disable this for
        matching countries

    Yields
    ------
    word, score, lem_word: tuple
        The cleaned word, the raw result of the fuzzy lookup on its
        lower-cased form, and the lemma of the best match. The lemma is
        an empty string when ``lem`` is false or when taking/lemmatizing
        the best match raises a TypeError.
    """
    matcher = FuzzySet(dict_list)
    lemmatizer = WordNetLemmatizer()

    for raw_word in word_list:
        cleaned = self.cleaner(raw_word)
        matches = matcher.get(cleaned.lower())

        lemma = ""
        if lem:
            try:
                # matches[0][1] is the string of the top-ranked match
                lemma = lemmatizer.lemmatize(matches[0][1])
            except TypeError:
                lemma = ""

        yield (cleaned, matches, lemma)
# read through game-play data
# For every season, scan each game file in the season directory and split the
# shot plays into two CSV files (made / missed) with the columns:
# player, position, shot_type, x, y
seasons = ['2006-2007', '2007-2008', '2008-2009', '2009-2010']
for season in seasons:
    dir_path = './' + season + '.regular_season'
    game_files = [f for f in listdir(dir_path) if isfile(join(dir_path, f))]

    # `with` closes both season files even when a game file fails to parse,
    # so partially written output is flushed and no handle is leaked.
    with open(season + '.made_shots.csv', 'w') as season_makes_file, \
            open(season + '.missed_shots.csv', 'w') as season_misses_file:
        for game in game_files:
            with open(dir_path + "/" + game, 'r') as in_file:
                content = in_file.readlines()

            # content[0] is skipped (first line of every game file)
            for play in content[1:]:
                play = play.rstrip().split(',')
                if len(play) < 32:
                    continue
                play_type = play[13]
                if play_type != 'shot' or play[30] == '' or play[31] == '':
                    continue

                # we don't have to keep points
                # if result=='made' and play_type=='3pt', then 3 points.
                player = play[23]
                result = play[27]
                shot_type = play[29]
                x_coord = int(play[30])
                y_coord = int(play[31])
                # Best fuzzy match of the lower-cased player name -> position.
                # NOTE(review): raises if the fuzzy set returns no match.
                position = player_to_position_map[
                    player_fuzzy_set.get(player.lower())[0][1]]

                row = ",".join([player, position, shot_type,
                                str(x_coord), str(y_coord)]) + "\n"
                if result == 'made':
                    season_makes_file.write(row)
                else:
                    season_misses_file.write(row)
def _interactive_test():
    """Build a FuzzySet from ``cities.gz`` and answer lookups typed at a prompt.

    Python 2 variant (uses ``raw_input``). Every stripped line of the gzip
    file located one directory above ``here`` becomes an entry of the set;
    the function then loops forever, printing the lookup result for each
    town name entered.
    """
    cities_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(cities_path) as city_lines:
        matcher = FuzzySet(entry.strip() for entry in city_lines)

    while True:
        query = raw_input("Enter town name: ")
        # Parenthesised single-argument form prints the same text as the
        # print statement on Python 2.
        print(matcher.get(query))
class GridLookup:
    """Look up institutions of the GRID dataset by exact or fuzzy name.

    On construction the dataset is downloaded and extracted into
    ``GRID_DATA_ROOT`` if the CSV is missing, converted to a dictionary keyed
    by institution name, and cached as a pickle so later runs skip the CSV
    parsing. With ``use_fuzzy_matching`` enabled, names that have no exact
    entry are resolved through a ``FuzzySet`` built from all known names.
    """

    GRID_DATASET_ZIP_NAME = 'grid.zip'
    GRID_DATASET_URL = 'https://digitalscience.figshare.com/ndownloader/files/22091379'
    GRID_DIR = dirname(realpath(__file__))
    GRID_DATA_ROOT = join(GRID_DIR, 'data')
    GRID_DATA_CSV = 'grid.csv'
    GRID_DATA_DICT = 'grid_dict.pkl'

    def __init__(self, use_fuzzy_matching=True):
        """Prepare the lookup tables, downloading the dataset when needed.

        Parameters
        ----------
        use_fuzzy_matching : bool
            When true, build a fuzzy index over all institution names and use
            it as a fallback in :meth:`get_institution`.

        Raises
        ------
        Exception
            If the dataset CSV is missing and cannot be downloaded.
        """
        self.country_lookup = CountryCodeLookup()

        if not isdir(self.GRID_DATA_ROOT):
            mkdir(self.GRID_DATA_ROOT)

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)):
            success = self.__download_dataset()
            if not success:
                raise Exception('Failed downloading grid dataset from https://www.grid.ac/')

        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT)):
            # No cached dictionary yet: build it from the CSV and cache it.
            csv_path = join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)
            data = self.__load_csv(csv_path)
            self.data_dict = self.__get_dict_from_pd(data)
            self.__save_dict(self.data_dict)
        else:
            self.data_dict = self.__load_dict()

        self.use_fuzzy_matching = use_fuzzy_matching
        if use_fuzzy_matching:
            self.fuzzy_set = FuzzySet()
            for institution_name in self.data_dict:
                self.fuzzy_set.add(institution_name)

    def __download_dataset(self):
        """Download and unpack the dataset zip; return True on success, False on any error."""
        try:
            zip_file = join(self.GRID_DATA_ROOT, self.GRID_DATASET_ZIP_NAME)
            download_file(self.GRID_DATASET_URL, zip_file)
            self.__extract_zip(zip_file)
            remove(zip_file)
            return True
        except Exception:
            # Best effort: the caller turns False into a descriptive error.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed.
            return False

    def __extract_zip(self, zip_file):
        """Extract every member of ``zip_file`` into ``GRID_DATA_ROOT``."""
        with ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(self.GRID_DATA_ROOT)

    def __load_csv(self, path):
        """Read the dataset CSV at ``path`` into a pandas DataFrame."""
        return pd.read_csv(path)

    def __get_dict_from_pd(self, data):
        """Convert the DataFrame into ``{name: {'Name', 'Country', 'Code'}}``.

        Reads the ``Name`` and ``Country`` columns; ``Code`` is the country
        code from ``self.country_lookup`` or ``'undefined'`` when the lookup
        returns ``None``. Later rows overwrite earlier rows with the same name.
        """
        data_dict = dict()
        for name, country in zip(data['Name'], data['Country']):
            code = self.country_lookup.get_country_code(country)
            data_dict[name] = {
                'Name': name,
                'Country': country,
                # TODO: Fix missing country codes (e.g. South Korea)
                'Code': code if code is not None else 'undefined',
            }
        return data_dict

    def __save_dict(self, grid_dict):
        """Pickle ``grid_dict`` to the cache file inside ``GRID_DATA_ROOT``."""
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'wb') as f:
            pickle.dump(grid_dict, f, pickle.HIGHEST_PROTOCOL)

    def __load_dict(self):
        """Load the cached dictionary written by ``__save_dict``."""
        # NOTE(security): pickle.load can execute arbitrary code; this file is
        # a local cache written by this class and must not come from an
        # untrusted source.
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'rb') as f:
            return pickle.load(f)

    def __fuzzy_match_institution(self, name):
        """Return the best fuzzy-matched name if its score exceeds 0.90, else None."""
        result = self.fuzzy_set.get(name)
        if not result:
            return None
        score, match = result[0]
        return match if score > 0.90 else None

    def get_institution(self, name):
        """Return the record for ``name``, or None when it cannot be resolved.

        An exact dictionary hit wins; otherwise, when fuzzy matching is
        enabled, the best sufficiently similar name is used. ``None`` as
        input yields ``None``.
        """
        if name is None:
            return None

        institution = self.data_dict.get(name)
        if self.use_fuzzy_matching and institution is None:
            matched_name = self.__fuzzy_match_institution(name)
            if matched_name is None:
                return None
            return self.data_dict.get(matched_name)
        return institution

    def get_all_institutions(self):
        """Return a view of all known institution names."""
        return self.data_dict.keys()
def _interactive_test():
    """Build a FuzzySet from ``cities.gz`` and answer lookups typed at a prompt.

    Every stripped line of the gzip file located one directory above ``here``
    becomes an entry of the set. The function then prompts for town names and
    prints the lookup result for each, until end-of-input or Ctrl-C.
    """
    cities_path = os.path.join(here, '..', 'cities.gz')
    # Open in text mode: a binary gzip stream yields bytes on Python 3, which
    # would not be comparable with the str queries returned by input().
    # NOTE(review): assumes cities.gz is UTF-8 encoded - confirm.
    with gzip.open(cities_path, 'rt', encoding='utf-8') as input_file:
        f = FuzzySet(line.strip() for line in input_file)

    while True:
        try:
            town = input("Enter town name: ")
        except (EOFError, KeyboardInterrupt):
            # Leave the loop quietly instead of ending with a traceback.
            print()
            break
        print(f.get(town))
#outfile = open('2007-2017_edu_test.csv', 'w')
# For every data row of `infile`, fuzzy-match the name in the last column
# against `a` and append the matched education (or 'None') to the row written
# to `outfile`. Written so that it runs unchanged on Python 2 and Python 3.
for n, orgline in enumerate(infile):
    if n == 0:
        # first line is skipped (presumably a header row)
        continue
    if n % 1000 == 0:
        print(n)  # progress indicator
    line = orgline.strip().split(',')
    fullname = line[-1]
    if fullname not in target_edu_dic:
        target_edu_dic[fullname] = 'None'

    # Reset per row: without this, a failed lookup silently reused the match
    # of the previous row (or hit an unbound name on the first data row).
    result = None
    try:
        # best candidate as a (score, matched_name) pair
        result = a.get(clean_sin_charac(fullname))[0]
    except Exception:
        # best effort: a row that cannot be looked up counts as "no match"
        pass
    if result is None:
        continue

    if result[0] > 0.5:
        print(fullname)
        print(result)
        count += 1
        target_edu_dic[fullname] = ceo_edu_dic[result[1]]
    outfile.write(orgline.strip() + ',' + target_edu_dic[fullname] + '\n')
print("#########################")
print(count)