def fuzzy_address_matcher(fuzzy_list, clean_list, thresh=0.5):
    """Match each (possibly misspelled) address in *fuzzy_list* to the closest
    entry in *clean_list* via fuzzy string matching.

    Entries appear to be prefixed with a '_'-delimited token (the original
    comment says "match state at least"); candidates whose leading token
    differs from the query's are discarded when any same-token candidate
    exists.

    Args:
        fuzzy_list: list or pandas Series of query strings.
        clean_list: list or pandas Series of canonical strings (deduplicated
            when a Series is given, as before).
        thresh: unused; kept only for backward compatibility with callers.

    Returns:
        list with one best match per query, or None for a query that has no
        candidates at all (the original crashed in that case).
    """
    if isinstance(fuzzy_list, pd.Series):
        fuzzy_list = fuzzy_list.tolist()
    if isinstance(clean_list, pd.Series):
        clean_list = clean_list.unique().tolist()

    # Very low cutoff so .get() returns candidates even for weak matches.
    index = FuzzySet(rel_sim_cutoff=0.000001)
    for c in clean_list:
        index.add(c)

    out_list = []
    for f in fuzzy_list:
        # FuzzySet.get returns None when nothing matches; normalize to [].
        candidates = index.get(f) or []
        first_word = f.split('_')[0]
        # Prefer candidates sharing the leading '_' token (match state at
        # least); otherwise fall back to the unfiltered candidate list, which
        # is what the original except-branch effectively did.
        same_prefix = [c for c in candidates if c[1].split('_')[0] == first_word]
        chosen = same_prefix or candidates
        out_list.append(chosen[0][1] if chosen else None)
    return out_list
def run_profile():
    """Build a FuzzySet from the bundled city list, profile the profiler()
    workload against it with cProfile, and print the stats sorted by time."""
    # NOTE: the local must be named 'f' — the statement string passed to
    # runctx below refers to it by that name via locals().
    f = FuzzySet()
    corpus_path = os.path.join(here, '..', 'cities.gz')
    with gzip.GzipFile(corpus_path) as corpus:
        for raw_line in corpus:
            f.add(raw_line.rstrip())
    # Run the workload under the profiler and dump the raw stats to disk.
    cProfile.runctx("profiler(f)", globals(), locals(), "Profile.prof")
    stats = pstats.Stats("Profile.prof")
    stats.strip_dirs().sort_stats("time").print_stats()
def _load(self, parsed_entities):
    """Build the entity table from a {group: {canonical: [synonyms]}} mapping.

    For each canonical form, a FuzzySet is seeded with the canonical form
    itself plus all of its synonyms, so later lookups can hit any variant.

    Args:
        parsed_entities: nested dict mapping group name -> canonical form ->
            list of synonym strings.

    Returns:
        list of dicts with keys "group", "canonical", "fuzzy".
    """
    entities = []
    # Iterate .items() directly instead of .keys() + repeated subscripting.
    for group, elements in parsed_entities.items():
        for canonical, synonyms in elements.items():
            fuzzy = FuzzySet()
            for variant in [canonical] + synonyms:
                fuzzy.add(variant)
            entities.append({
                "group": group,
                "canonical": canonical,
                "fuzzy": fuzzy,
            })
    return entities
def load_dictionary():
    """Load Dutch words from the OpenTaal dictionary and the fastText
    vocabulary file into a single FuzzySet.

    Returns:
        FuzzySet containing every word from both sources.
    """
    print("loading dutch dictionary...")
    opentaal_dict_file = "data/Dutch.dic"
    fasttext_vocab_file = "data/dutch_vocabulary.txt"
    words = FuzzySet()
    # Use context managers so the file handles are closed deterministically
    # (the original leaked both handles).
    with open(opentaal_dict_file) as dict_file:
        for counter, line in tqdm.tqdm(enumerate(dict_file)):
            if counter == 0:
                continue  # first line of a .dic file is a header, not a word
            # Entries look like "word/flags"; keep only the word part.
            words.add(line.split("/")[0].strip())
    with open(fasttext_vocab_file) as vocab_file:
        for counter, line in tqdm.tqdm(enumerate(vocab_file)):
            if counter == 0:
                continue  # skip the vocabulary header line
            words.add(line.strip())
    return words
class QueryPage(BaseModel):
    """A wiki query result page (pydantic model).

    Normalizes the raw API payload: strips title suffixes, coerces page ids
    into a set, flattens category/redirect dicts into plain string sets, and
    maintains a private FuzzySet over the title and its aliases for fuzzy
    lookups.
    """

    title: str
    # Aliased fields map the raw API keys onto normalized attributes.
    pageid_: set[str] = Field(alias="pageid")
    categories: set[ValidCategory] = Field(alias="categories")
    aliases: set[str] = Field(alias="redirects", default_factory=set)
    # Not part of the pydantic schema; rebuilt in __init__ and extended in update().
    _fuzzy: FuzzySet = PrivateAttr(None)

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Seed the fuzzy index with the (validated) title and all aliases.
        self._fuzzy = FuzzySet([self.title, *self.aliases])

    @validator("title", pre=True, allow_reuse=True)
    def strip_suffixes(cls, title: str):
        """Normalize the raw title before validation."""
        return strip_suffix_from_title(title)

    @validator("pageid_", pre=True, allow_reuse=True)
    def ensure_pageid_set(cls, pageid_: Any):
        """Coerce a scalar or iterable of page ids into a set."""
        if isinstance(pageid_, set):
            return pageid_
        elif isinstance(pageid_, (str, int)):
            return {pageid_}
        else:
            return set(pageid_)

    @validator("categories", pre=True, allow_reuse=True)
    def extract_category(cls, categories: list[dict[str, str]]):
        """Pull the 'title' out of each raw category dict."""
        return {category["title"] for category in categories}

    @validator("aliases", pre=True, allow_reuse=True)
    def extract_redirect(cls, redirects: list[dict[str, str]], values):
        """Collect redirect titles, skipping any that are a substring of the
        (already-validated) page title."""
        title = values["title"]
        # NOTE(review): `not in title` is a substring test, not set membership
        # — presumably intentional to drop trivial self-redirects; confirm.
        return [redirect["title"] for redirect in redirects if redirect["title"] not in title]

    @property
    def pageid(self) -> str:
        # Pipe-joined form, as used in MediaWiki-style multi-id query params.
        return "|".join(self.pageid_)

    def update(self, other: QueryPage) -> None:
        """Merge *other* into this page in place.

        Raises:
            KeyError: if the two pages have different titles.
        """
        if self.title != other.title:
            raise KeyError("Cannot merge two pages with different titles.")
        self.pageid_.update(other.pageid_)
        self.categories.update(other.categories)
        self.aliases.update(other.aliases)
        # Keep the fuzzy index in sync with the newly merged aliases.
        for alias in other.aliases:
            self._fuzzy.add(alias)
def _load(self, parsed_entities):
    """Flatten the parsed entity groups into a list of lookup records.

    Each record carries the group name, the canonical form, and a FuzzySet
    seeded with the canonical form plus every listed variant.
    """
    entities = []
    for group in parsed_entities:
        for sub_list in group["subLists"]:
            canonical = sub_list["canonicalForm"]
            fuzzy = FuzzySet()
            fuzzy.add(canonical)
            for variant in sub_list["list"]:
                fuzzy.add(variant)
            entities.append({
                "group": group["name"],
                "canonical": canonical,
                "fuzzy": fuzzy,
            })
    return entities
def _get_entity_groups(self, database_config: Dict[Text, Text],
                       database_queries: Dict[Text, Text]):
    """Populate self.ents with one FuzzySet per configured entity query.

    Args:
        database_config: MySQL connection settings ("host", "user",
            "password", "database").
        database_queries: mapping of entity key -> SQL query that must
            return exactly one column of surface forms.

    Raises:
        SyntaxError: if a query returns more than one column (exception
            type kept for backward compatibility with existing callers).
    """
    db = pymysql.connect(host=database_config["host"],
                         user=database_config["user"],
                         passwd=database_config["password"],
                         db=database_config["database"])
    try:
        cur = db.cursor()
        print(f"Queries are: {database_queries.keys()}")
        # .items() avoids re-subscripting the dict for every key.
        for entity_key, query in database_queries.items():
            cur.execute(query)
            current_entity = FuzzySet()
            for row in cur.fetchall():
                if len(row) != 1:
                    raise SyntaxError(
                        f"{entity_key}: query returned more than one column!")
                current_entity.add(row[0])
            self.ents[entity_key] = current_entity
    finally:
        # Release the connection even when a query or validation fails
        # (the original leaked it on any exception).
        db.close()
class GridLookup:
    """Lookup of research institutions from the GRID dataset.

    On first use, downloads and extracts the GRID CSV, converts it to a
    name-keyed dict, and pickles that dict for later runs. Optionally
    builds a FuzzySet over institution names for approximate matching.
    """

    GRID_DATASET_ZIP_NAME = 'grid.zip'
    GRID_DATASET_URL = 'https://digitalscience.figshare.com/ndownloader/files/22091379'
    GRID_DIR = dirname(realpath(__file__))
    GRID_DATA_ROOT = join(GRID_DIR, 'data')
    GRID_DATA_CSV = 'grid.csv'
    GRID_DATA_DICT = 'grid_dict.pkl'

    def __init__(self, use_fuzzy_matching=True):
        """Prepare the lookup dict (downloading/caching as needed).

        Args:
            use_fuzzy_matching: when True, also build a FuzzySet index so
                get_institution() can fall back to approximate name matches.

        Raises:
            Exception: if the dataset download fails.
        """
        self.country_lookup = CountryCodeLookup()
        if not isdir(self.GRID_DATA_ROOT):
            mkdir(self.GRID_DATA_ROOT)
        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)):
            success = self.__download_dataset()  # typo 'sucess' fixed
            if not success:
                raise Exception('Failed downloading grid dataset from https://www.grid.ac/')
        if not isfile(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT)):
            csv_path = join(self.GRID_DATA_ROOT, self.GRID_DATA_CSV)
            data = self.__load_csv(csv_path)
            self.data_dict = self.__get_dict_from_pd(data)
            self.__save_dict(self.data_dict)
        else:
            self.data_dict = self.__load_dict()
        self.use_fuzzy_matching = use_fuzzy_matching
        if use_fuzzy_matching:
            self.fuzzy_set = FuzzySet()
            # Plain loop instead of a side-effect-only list comprehension.
            for name in self.data_dict:
                self.fuzzy_set.add(name)

    def __download_dataset(self):
        """Download and extract the GRID zip; return True on success."""
        try:
            zip_file = join(self.GRID_DATA_ROOT, self.GRID_DATASET_ZIP_NAME)
            download_file(self.GRID_DATASET_URL, zip_file)
            self.__extract_zip(zip_file)
            remove(zip_file)
            return True
        except Exception:
            # Deliberate best-effort: caller raises a descriptive error on
            # False. Narrowed from a bare except so KeyboardInterrupt/
            # SystemExit still propagate.
            return False

    def __extract_zip(self, zip_file):
        """Extract *zip_file* into the data directory."""
        with ZipFile(zip_file, 'r') as zip_ref:
            zip_ref.extractall(self.GRID_DATA_ROOT)

    def __load_csv(self, path):
        """Read the GRID CSV into a DataFrame."""
        return pd.read_csv(path)

    def __get_dict_from_pd(self, data):
        """Convert the GRID DataFrame to a dict keyed by institution name."""
        data_dict = dict()
        for _, row in data.iterrows():
            code = self.country_lookup.get_country_code(row.Country)
            data_dict[row.Name] = {
                'Name': row.Name,
                'Country': row.Country,
                # TODO: Fix missing country codes (e.g. South Korea)
                'Code': code if code is not None else 'undefined'}
        return data_dict

    def __save_dict(self, grid_dict):
        """Pickle the lookup dict next to the CSV for fast reloads."""
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'wb') as f:
            pickle.dump(grid_dict, f, pickle.HIGHEST_PROTOCOL)

    def __load_dict(self):
        """Load the pickled lookup dict (uses the class constant instead of
        the previously hard-coded 'grid_dict.pkl')."""
        with open(join(self.GRID_DATA_ROOT, self.GRID_DATA_DICT), 'rb') as f:
            return pickle.load(f)

    def __fuzzy_match_institution(self, name):
        """Return the best fuzzy-matched institution name, or None when the
        top candidate scores <= 0.90."""
        result = self.fuzzy_set.get(name)
        if result is None or len(result) == 0:
            return None
        score, match = result[0]
        return match if score > 0.90 else None

    def get_institution(self, name):
        """Return the institution record for *name*.

        Tries an exact lookup first; when fuzzy matching is enabled, falls
        back to the closest high-confidence fuzzy match. Returns None when
        nothing is found or *name* is None.
        """
        if name is None:
            return None
        institution = self.data_dict.get(name)
        if self.use_fuzzy_matching and institution is None:
            matched_name = self.__fuzzy_match_institution(name)
            if matched_name is None:
                return None
            return self.data_dict.get(matched_name)
        return institution

    def get_all_institutions(self):
        """Return a view of all known institution names."""
        return self.data_dict.keys()
sys.exit(2) for opt,arg in opts: if opt == "-h": print "homophonic.py -t <text> -l <method>" elif opt in ("-t","--text"): text = arg elif opt in ("-l","--method"): method = arg else: sys.exit(2) text = ap_encoding.read_file(text) if method == "translate": _start = time.time() results = get_translation(text) elif method == "phonics": results = get_phonics(text) _end = time.time() #print _end - _start print ' '.join([val for val in results]) init_sphinx() words = {} phones = FuzzySet() for word, phone in pronunciations: try: words[phone] = word except KeyError: words[phone] += word [phones.add(phone) for word, phone in pronunciations] if __name__ == "__main__": main(sys.argv[1:])
if len(item) > 1: final_name = final_name + " " + item return final_name.strip() #Import Existing Info infile = open('ceo_crawled_education.csv') for n, line in enumerate(infile): if n == 0: continue line = line.strip().split(',') name = clean_name(line[0].upper()) if check_name(name): education = line[3] #ceo_list.append(name) a.add(name) ceo_edu_dic[name] = education infile.close() target_edu_dic = {} #Read Target File and Fuzzy Wuzzy infile = open('1996-2006.csv') outfile = open('1996-2006_edu_test.csv', 'w') #infile = open('2007-2017.csv') #outfile = open('2007-2017_edu_test.csv', 'w') for n, orgline in enumerate(infile): if n == 0: continue #if n > 100: # break