def fuzzy_address_matcher(fuzzy_list, clean_list, thresh=0.5):
    """Match each fuzzy address string to its closest entry in clean_list.

    Arguments
    ---------
    fuzzy_list: list or pd.Series
        Address strings to resolve.
    clean_list: list or pd.Series
        Canonical address strings to match against (de-duplicated if a Series).
    thresh: float
        Currently unused; kept for backward compatibility.
        TODO: apply as a minimum-similarity cutoff on accepted matches.

    Returns
    -------
    list
        Best matching clean address per input, preferring candidates whose
        first '_'-separated token (the state) matches; None when the index
        returns no candidates at all.
    """
    if isinstance(fuzzy_list, pd.Series):
        fuzzy_list = fuzzy_list.tolist()
    if isinstance(clean_list, pd.Series):
        clean_list = clean_list.unique().tolist()
    # Tiny cutoff so FuzzySet.get() returns even weak candidates.
    index = FuzzySet(rel_sim_cutoff=0.000001)
    for c in clean_list:
        index.add(c)
    out_list = []
    for f in fuzzy_list:
        # FuzzySet.get() yields (score, match) pairs, or None when empty.
        results = index.get(f) or []
        first_word = f.split('_')[0]
        # Prefer candidates that at least agree on the state token.
        state_matches = [r for r in results if r[1].split('_')[0] == first_word]
        if state_matches:
            out_list.append(state_matches[0][1])  # best state-consistent hit
        elif results:
            out_list.append(results[0][1])  # fall back to overall best hit
        else:
            # The original fallback subscripted a possible None and crashed;
            # emit a sentinel instead so one bad row doesn't abort the batch.
            out_list.append(None)
    return out_list
def base_check(self, dict_list, word_list, lem=True):
    """
    Check probability that the word is based on a dictionary word

    Arguments
    ---------
    dict_list: list
        list containing pre-loaded language dictionary
    word_list: list
        list containing the passwords to check
    lem: Boolean
        Select whether or not to use lemmatization, disable this for
        matching countries

    Yields
    ------
    (word, score, lem_word): tuple
        cleaned word, the FuzzySet match result (a list of (score, match)
        pairs, or None when nothing matched), and the lemmatized best
        match ("" when unavailable or lem is False)
    """
    fuzz = FuzzySet(dict_list)
    lemm = WordNetLemmatizer()
    for word in word_list:
        word = self.cleaner(word)
        score = fuzz.get(word.lower())
        if lem:
            try:
                lem_word = lemm.lemmatize(score[0][1])
            except (TypeError, IndexError):
                # TypeError: fuzz.get() returned None (no match at all).
                # IndexError: it returned an empty list — previously
                # uncaught and would crash the generator.
                lem_word = ""
        else:
            lem_word = ""
        yield (word, score, lem_word)
def run_profile():
    """Profile FuzzySet lookups over the bundled city list and print stats."""
    # NOTE: the local must be named `f` — the runctx statement string below
    # resolves it from locals().
    f = FuzzySet()
    cities_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(cities_path) as source:
        for raw in source:
            f.add(raw.rstrip())
    cProfile.runctx("profiler(f)", globals(), locals(), "Profile.prof")
    stats = pstats.Stats("Profile.prof")
    stats.strip_dirs()
    stats.sort_stats("time")
    stats.print_stats()
def _load(self, parsed_entities):
    """Flatten a {group: {canonical: [variants]}} mapping into entity records.

    Each record carries the group name, the canonical form, and a FuzzySet
    indexing the canonical form together with all of its variants.
    """
    entities = []
    for group, elements in parsed_entities.items():
        for canonical, variants in elements.items():
            fuzzy = FuzzySet()
            fuzzy.add(canonical)
            for variant in variants:
                fuzzy.add(variant)
            entities.append({
                "group": group,
                "canonical": canonical,
                "fuzzy": fuzzy
            })
    return entities
def load_dictionary():
    """Build a FuzzySet of Dutch words from the OpenTaal dictionary and the
    fastText vocabulary files.

    Returns
    -------
    FuzzySet
        Fuzzy index over all loaded words.
    """
    print("loading dutch dictionary...")
    opentaal_dict_file = "data/Dutch.dic"
    fasttext_vocab_file = "data/dutch_vocabulary.txt"
    words = FuzzySet()
    # Files are now opened via `with` so handles are closed even on error
    # (the original leaked both file objects).
    with open(opentaal_dict_file) as dic:
        for counter, line in tqdm.tqdm(enumerate(dic)):
            if counter == 0:
                continue  # skip the header line
            # .dic entries look like "word/FLAGS" — keep only the word part.
            words.add(line.split("/")[0].strip())
    with open(fasttext_vocab_file) as vocab:
        for counter, line in tqdm.tqdm(enumerate(vocab)):
            if counter == 0:
                continue  # skip the header line
            words.add(line.strip())
    return words
class QueryPage(BaseModel):
    """Pydantic model for a single wiki page returned by a query.

    Normalises the title, collects page ids / category titles / redirect
    aliases from the raw API payload, and maintains a FuzzySet over the
    title plus aliases for approximate-name lookups.
    """

    title: str
    # Multiple raw pageids can map to one logical page, hence a set.
    pageid_: set[str] = Field(alias="pageid")
    categories: set[ValidCategory] = Field(alias="categories")
    aliases: set[str] = Field(alias="redirects", default_factory=set)
    # Fuzzy index over title + aliases; built in __init__, extended in update().
    _fuzzy: FuzzySet = PrivateAttr(None)

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Build the index after validation so title/aliases are final.
        self._fuzzy = FuzzySet([self.title, *self.aliases])

    @validator("title", pre=True, allow_reuse=True)
    def strip_suffixes(cls, title: str):
        # Normalise the raw title via the project helper before validation.
        return strip_suffix_from_title(title)

    @validator("pageid_", pre=True, allow_reuse=True)
    def ensure_pageid_set(cls, pageid_: Any):
        # Accept a single id (str/int), an existing set, or any iterable.
        if isinstance(pageid_, set):
            return pageid_
        elif isinstance(pageid_, (str, int)):
            return {pageid_}
        else:
            return set(pageid_)

    @validator("categories", pre=True, allow_reuse=True)
    def extract_category(cls, categories: list[dict[str, str]]):
        # Raw categories arrive as dicts; keep only their titles.
        return {category["title"] for category in categories}

    @validator("aliases", pre=True, allow_reuse=True)
    def extract_redirect(cls, redirects: list[dict[str, str]], values):
        title = values["values" == "title" and "title" or "title"] if False else values["title"]
        # NOTE(review): this is a substring test ("not in title"), not an
        # equality test — presumably to drop redirects already contained in
        # the title; confirm that substring semantics are intended.
        return [redirect["title"] for redirect in redirects
                if redirect["title"] not in title]

    @property
    def pageid(self) -> str:
        # Pipe-joined form as used by MediaWiki multi-id query parameters.
        return "|".join(self.pageid_)

    def update(self, other: QueryPage) -> None:
        """Merge *other* (which must share this page's title) in place."""
        if self.title != other.title:
            raise KeyError("Cannot merge two pages with different titles.")
        self.pageid_.update(other.pageid_)
        self.categories.update(other.categories)
        self.aliases.update(other.aliases)
        # Keep the fuzzy index in sync with the newly merged aliases.
        for alias in other.aliases:
            self._fuzzy.add(alias)
def _load(self, parsed_entities):
    """Flatten parsed entity groups into flat records.

    Each input group has a "name" and "subLists"; every sub-list contributes
    one record holding the group name, its canonical form, and a FuzzySet
    over the canonical form plus all listed variants.
    """
    entities = []
    for group in parsed_entities:
        name = group["name"]
        for sub_list in group["subLists"]:
            canonical = sub_list["canonicalForm"]
            fuzzy = FuzzySet()
            fuzzy.add(canonical)
            for variant in sub_list["list"]:
                fuzzy.add(variant)
            entities.append({
                "group": name,
                "canonical": canonical,
                "fuzzy": fuzzy
            })
    return entities
def _get_entity_groups(self, database_config: Dict[Text, Text],
                       database_queries: Dict[Text, Text]):
    """Populate self.ents with one FuzzySet per entity query.

    Arguments
    ---------
    database_config: Dict[Text, Text]
        MySQL connection parameters: host, user, password, database.
    database_queries: Dict[Text, Text]
        Mapping of entity key -> single-column SQL query.

    Raises
    ------
    SyntaxError
        If a query returns more than one column (kept for backward
        compatibility with existing callers).
    """
    db = pymysql.connect(host=database_config["host"],
                         user=database_config["user"],
                         passwd=database_config["password"],
                         db=database_config["database"])
    try:
        cur = db.cursor()
        print(f"Queries are: {database_queries.keys()}")
        for entity_key, query in database_queries.items():
            cur.execute(query)
            current_entity = FuzzySet()
            for row in cur.fetchall():
                if len(row) != 1:
                    raise SyntaxError(
                        f"{entity_key}: query returned more than one column!")
                current_entity.add(row[0])
            self.ents[entity_key] = current_entity
    finally:
        # Previously the connection leaked if any query raised.
        db.close()
def __init__(self, use_fuzzy_matching=True):
    """Load the GRID institution data, downloading and caching on first use.

    Arguments
    ---------
    use_fuzzy_matching: bool
        When True, additionally build a FuzzySet over institution names
        for approximate lookups.

    Raises
    ------
    Exception
        If the dataset cannot be downloaded.
    """
    self.country_lookup = CountryCodeLookup()
    if not isdir(self.GRID_DATA_ROOT):
        mkdir(self.GRID_DATA_ROOT)
    if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)):
        success = self.__download_dataset()  # renamed from misspelt 'sucess'
        if not success:
            raise Exception('Failed downloading grid dataset from https://www.grid.ac/')
    if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT)):
        csv_path = join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)
        data = self.__load_csv(csv_path)
        self.data_dict = self.__get_dict_from_pd(data)
        self.__save_dict(self.data_dict)
    else:
        self.data_dict = self.__load_dict()
    self.use_fuzzy_matching = use_fuzzy_matching
    if use_fuzzy_matching:
        self.fuzzy_set = FuzzySet()
        # Plain loop instead of a side-effect list comprehension.
        for name in self.data_dict:
            self.fuzzy_set.add(name)
from cfuzzyset import cFuzzySet as FuzzySet
from os import listdir
from os.path import isfile, join

# read in player-position map
# Maps lower-cased player name -> position string (commas replaced by '/').
player_to_position_map = {}
with open('./player_positions.tsv','r') as f:
    content = f.readlines()
    for player in content:
        player = player.rstrip().split('\t')
        name = player[0]
        position = player[1]
        player_to_position_map[name.lower()] = position.replace(',','/')

# use fuzzy set to match player names
player_fuzzy_set = FuzzySet(player_to_position_map.keys())

# read through game-play data
seasons = ['2006-2007', '2007-2008', '2008-2009', '2009-2010']
for season in seasons:
    dir_path = './'+season+'.regular_season'
    game_files = [ f for f in listdir(dir_path) if isfile(join(dir_path,f)) ]
    # One output CSV per season for made and for missed shots.
    season_makes_file = open(season+'.made_shots.csv','w')
    season_misses_file = open(season+'.missed_shots.csv','w')
    for game in game_files:
        with open(dir_path+"/"+game, 'r') as in_file:
            content = in_file.readlines()
        # First line is a header; rows with fewer than 32 fields are skipped.
        for play in content[1:]:
            play = play.rstrip().split(',')
            if len(play)<32:
                continue
            # NOTE(review): the play-processing loop body appears truncated
            # in this chunk; the remainder is not visible here.
def _interactive_test():
    # Interactive smoke test (Python 2 syntax: raw_input / print statement).
    # Builds a FuzzySet from the bundled gzipped city list, then echoes
    # fuzzy-match results for each town name typed at the prompt.
    # Loops forever; terminate with Ctrl-C / EOF.
    with gzip.GzipFile(os.path.join(here, '..', 'cities.gz')) as input_file:
        f = FuzzySet((line.strip() for line in input_file))
    while True:
        town = raw_input("Enter town name: ")
        print f.get(town)
class GridLookup:
    """Lookup of research institutions from the GRID dataset.

    Downloads and caches the GRID CSV on first use, builds a
    name -> {Name, Country, Code} dict (pickled for reuse), and optionally
    keeps a FuzzySet index for approximate name matching.
    """

    GRID_DATASET_ZIP_NAME = 'grid.zip'
    GRID_DATASET_URL = 'https://digitalscience.figshare.com/ndownloader/files/22091379'
    GRID_DIR = dirname(realpath(__file__))
    GRID_DATA_ROOT = join(GRID_DIR, 'data')
    GRID_DATA_CSV = 'grid.csv'
    GRID_DATA_DICT = 'grid_dict.pkl'

    def __init__(self, use_fuzzy_matching=True):
        """Load (downloading and caching as needed) the GRID data.

        Raises
        ------
        Exception
            If the dataset cannot be downloaded.
        """
        self.country_lookup = CountryCodeLookup()
        if not isdir(self.GRID_DATA_ROOT):
            mkdir(self.GRID_DATA_ROOT)
        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)):
            success = self.__download_dataset()  # renamed from misspelt 'sucess'
            if not success:
                raise Exception('Failed downloading grid dataset from https://www.grid.ac/')
        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT)):
            csv_path = join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)
            data = self.__load_csv(csv_path)
            self.data_dict = self.__get_dict_from_pd(data)
            self.__save_dict(self.data_dict)
        else:
            self.data_dict = self.__load_dict()
        self.use_fuzzy_matching = use_fuzzy_matching
        if use_fuzzy_matching:
            self.fuzzy_set = FuzzySet()
            # Plain loop instead of a side-effect list comprehension.
            for name in self.data_dict:
                self.fuzzy_set.add(name)

    def __download_dataset(self):
        """Download and unpack the GRID zip; return True on success."""
        try:
            zip_file = join(self.GRID_DATA_ROOT, self.GRID_DATASET_ZIP_NAME)
            download_file(self.GRID_DATASET_URL, zip_file)
            self.__extract_zip(zip_file)
            remove(zip_file)
            return True
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed; download stays best-effort, caller checks
            # the boolean result.
            return False

    def __extract_zip(self, zip_file):
        # Extract the downloaded archive into the data directory.
        with ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(self.GRID_DATA_ROOT)

    def __load_csv(self, path):
        return pd.read_csv(path)

    def __get_dict_from_pd(self, data):
        """Build Name -> {Name, Country, Code} records from the GRID frame."""
        data_dict = dict()
        for _, row in data.iterrows():
            code = self.country_lookup.get_country_code(row.Country)
            # TODO: Fix missing country codes (e.g. South Korea)
            data_dict[row.Name] = {
                'Name': row.Name,
                'Country': row.Country,
                'Code': code if code is not None else 'undefined'}
        return data_dict

    def __save_dict(self, grid_dict):
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'wb') as f:
            pickle.dump(grid_dict, f, pickle.HIGHEST_PROTOCOL)

    def __load_dict(self):
        # Use the class constant instead of the previously hard-coded
        # 'grid_dict.pkl' so save and load always agree on the path.
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'rb') as f:
            return pickle.load(f)

    def __fuzzy_match_institution(self, name):
        """Return the closest known institution name, or None below 0.90."""
        result = self.fuzzy_set.get(name)
        if result is None or len(result) == 0:
            return None
        score, match = result[0]
        return match if score > 0.90 else None

    def get_institution(self, name):
        """Exact lookup by name, with an optional fuzzy fallback.

        Returns the institution record dict, or None when not found.
        """
        if name is None:
            return None
        institution = self.data_dict.get(name)
        if self.use_fuzzy_matching and institution is None:
            matched_name = self.__fuzzy_match_institution(name)
            if matched_name is None:
                return None
            return self.data_dict.get(matched_name)
        return institution

    def get_all_institutions(self):
        # Returns a dict-keys view (unchanged from the original contract).
        return self.data_dict.keys()
sys.exit(2) for opt,arg in opts: if opt == "-h": print "homophonic.py -t <text> -l <method>" elif opt in ("-t","--text"): text = arg elif opt in ("-l","--method"): method = arg else: sys.exit(2) text = ap_encoding.read_file(text) if method == "translate": _start = time.time() results = get_translation(text) elif method == "phonics": results = get_phonics(text) _end = time.time() #print _end - _start print ' '.join([val for val in results]) init_sphinx() words = {} phones = FuzzySet() for word, phone in pronunciations: try: words[phone] = word except KeyError: words[phone] += word [phones.add(phone) for word, phone in pronunciations] if __name__ == "__main__": main(sys.argv[1:])
def _interactive_test():
    """Prompt repeatedly for a town name and print its fuzzy matches."""
    cities_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(cities_path) as archive:
        matcher = FuzzySet(entry.strip() for entry in archive)
    # Loops forever; terminate with Ctrl-C / EOF.
    while True:
        town = input("Enter town name: ")
        print(matcher.get(town))
def __init__(self, **kwargs):
    """Validate fields via the parent model, then build the fuzzy index."""
    super().__init__(**kwargs)
    # Index title + aliases for approximate-name lookup; built after
    # super().__init__ so both fields hold their validated, final values.
    self._fuzzy = FuzzySet([self.title, *self.aliases])
#!/user/bin/env python # coding:utf-8 from cfuzzyset import cFuzzySet as FuzzySet import re ceo_list = [] ceo_edu_dic = {} a = FuzzySet() count = 0 def check_name(name): name_tmp = name.strip().split() if len(name) <= 5: return False if len(name_tmp) == 1: return False if name_tmp[0] == name_tmp[1]: return False else: return True def clean_name(name): todel = [i.start() for i in re.finditer('"', name)] if len(todel) == 2: myString = name[0:todel[0]] + name[todel[1] + 2:-1] #print name + ' ---> ' + myString return myString