def has_matching_brand_request_query(gear: Gear, q) -> bool:
    """Return True when query ``q`` fuzzily matches the gear's brand.

    A missing/empty brand or query never matches. Otherwise both strings
    are lowercased and ASCII-folded, and a fuzzywuzzy partial-ratio
    similarity strictly greater than 85 counts as a match.
    """
    brand = gear.make
    # Guard: either side being None or the empty string is an automatic miss.
    for value in (brand, q):
        if value in (None, ''):
            return False
    score = fuzz.partial_ratio(asciidammit(q.lower()), asciidammit(brand.lower()))
    return score > 85
def standardize_cuisine(x):
    """Map a raw cuisine label onto one of a few canonical buckets.

    Keyword matching is case-insensitive and applied after ASCII-folding:
      'italian' or 'pizza'          -> 'Pizza/Italian'
      'latin'                       -> 'Latin'
      'cafe', 'tea', or 'coffee'    -> 'Cafe/Coffee/Tea'
    Anything else is returned as the ASCII-folded original string.
    """
    x = utils.asciidammit(x)
    # Hoisted: the original recomputed x.lower() for every keyword test.
    lowered = x.lower()
    if 'italian' in lowered or 'pizza' in lowered:
        return 'Pizza/Italian'
    if 'latin' in lowered:
        return 'Latin'
    if any(kw in lowered for kw in ('cafe', 'tea', 'coffee')):
        return 'Cafe/Coffee/Tea'
    return x
def text_normalize(raw):
    """Normalize text by borrowing fuzzywuzzy's pipeline.

    ASCII-folds the input, runs fuzzywuzzy's full_process (strips
    whitespace and lowercases), then collapses internal whitespace runs
    to single spaces. This is ASCII-based and should be replaced with a
    Unicode-aware normalization eventually.
    """
    ascii_text = fuzzutils.asciidammit(raw)
    processed = fuzzutils.full_process(ascii_text)
    # split()/join collapses any run of whitespace into one space.
    return ' '.join(processed.split())
def get_brokers():
    """Load realty brokers from 'realty_broker.csv' into a DataFrame.

    Each CSV row is expected to be: realty_broker_id, version, then one
    or more broker names. Rows with fewer than three fields, or whose
    id/version fields are not integers, are skipped. Broker names are
    stripped of double quotes and ASCII-folded.

    Returns:
        pandas.DataFrame with columns realty_broker_id, version, name
        (empty DataFrame when no valid rows exist).
    """
    brokers = []
    # `with` fixes the original's leaked file handle.
    with open('realty_broker.csv', 'r') as fh:
        for line in fh:
            row = line.strip().split(',')
            if len(row) < 3:
                continue
            # Narrowed from a bare `except: pass`, which also hid real
            # bugs (e.g. in asciidammit); only int() failures are expected.
            try:
                broker_id = int(row[0])
                version = int(row[1])
            except ValueError:
                continue
            for broker in row[2:]:
                name = asciidammit(broker.replace('"', '').strip())
                brokers.append(
                    dict(realty_broker_id=broker_id, version=version, name=name))
    return pd.DataFrame(brokers)
def semi_process(s, force_ascii=False):
    """Lightweight variant of fuzzywuzzy's full_process.

    Unlike full_process, punctuation and case are kept: annotators marked
    verbatim spans, so case is a useful signal and non-alphanumerics keep
    consecutive spans intact. Only leading/trailing whitespace is trimmed,
    optionally after forcing the string to ASCII.

    Returns '' when ``s`` is None.
    """
    if s is None:
        return ""
    text = asciidammit(s) if force_ascii else s
    # Edge-whitespace trim only; interior characters are untouched.
    return StringProcessor.strip(text)
def get_brokers():
    """Load realty brokers from 'realty_broker.csv' into a DataFrame.

    Each CSV row is expected to be: realty_broker_id, version, then one
    or more broker names. Rows with fewer than three fields, or whose
    id/version fields are not integers, are skipped. Broker names are
    stripped of double quotes and ASCII-folded.

    Returns:
        pandas.DataFrame with columns realty_broker_id, version, name
        (empty DataFrame when no valid rows exist).
    """
    brokers = []
    # `with` fixes the original's leaked file handle.
    with open('realty_broker.csv', 'r') as fh:
        for line in fh:
            row = line.strip().split(',')
            if len(row) < 3:
                continue
            # Narrowed from a bare `except: pass`, which also hid real
            # bugs (e.g. in asciidammit); only int() failures are expected.
            try:
                broker_id = int(row[0])
                version = int(row[1])
            except ValueError:
                continue
            for broker in row[2:]:
                name = asciidammit(broker.replace('"', '').strip())
                brokers.append(
                    dict(realty_broker_id=broker_id, version=version, name=name))
    return pd.DataFrame(brokers)
def _process_and_sort(self, s, token_set=False):
    '''
    Normalize 's' (optional ASCII-fold via self.force_ascii, then
    lowercase) and produce comparison keys in one of two modes:

    1) token_set is True: strip ALL non-alphanumeric characters and
       return the string of its *characters* sorted (note: despite the
       name, this sorts characters, not tokens).
    2) token_set is False: split on runs of non-alphanumerics; if more
       than one token results, return a list of one or two strings —
       the tokens joined in original order, plus (only when different)
       the tokens joined in sorted order. With a single token, return
       it alone in a one-element list.

    Return type differs by mode: a plain string for mode 1, a list of
    one or two strings for mode 2.
    '''
    # Optional ASCII folding, controlled by the instance configuration.
    if self.force_ascii:
        s = utils.asciidammit(s)
    s = s.lower()
    if token_set:
        # Drop every non-alphanumeric character, then sort the characters.
        s = re.sub('[^0-9a-zA-Z]+', '', s)
        return ''.join(sorted(list(s)))
    # Collapse runs of non-alphanumerics to single spaces, then tokenize.
    s = re.sub('[^0-9a-zA-Z]+', ' ', s)
    list_s = s.split(' ')
    if len(list_s) > 1:
        # Emit the sorted variant only when it actually differs,
        # avoiding a duplicate comparison key.
        if ''.join(sorted(list_s)) != ''.join(list_s):
            return [''.join(list_s), ''.join(sorted(list_s))]
        else:
            return [''.join(list_s)]
    else:
        return [list_s[0]]
def test_asciionly(self):
    """asciionly must accept any string after asciidammit folding."""
    for mixed in self.mixed_strings:
        # asciionly only runs on (ascii) strings, so fold first.
        folded = utils.asciidammit(mixed)
        utils.asciionly(folded)
def test_asciidammit(self):
    """Smoke test: asciidammit must not raise on any mixed-encoding sample."""
    for sample in self.mixed_strings:
        utils.asciidammit(sample)
def _process(self, s, force_ascii=True): if force_ascii: s = utils.asciidammit(s) s = s.lower() s = re.sub('[^0-9a-zA-Z]+', '', s) return s
def token_clean(s1, force_ascii=True, full_process=True):
    """Run process_and_sort over 's1', then ASCII-fold the result."""
    processed = process_and_sort(s1, force_ascii, full_process=full_process)
    return utils.asciidammit(processed)
# Could be a smarter script # Run as `python match_existing.py [Path to tsv with names in first column]` ############################################################################### import csv, json import os, sys import requests from fuzzywuzzy import fuzz from fuzzywuzzy import utils import config __USER = config.__USER __PASS = config.__PASS entities_all = 'https://catalog.interferencearchive.org/admin/service.php/find/ca_entities?q=*' infile = sys.argv[1] if __name__ == '__main__': catalog_entities = json.loads(requests.get((entities_all), auth=(__USER, __PASS)).content)['results'] catalog_data = {i['display_label']:i['entity_id'] for i in catalog_entities} with open(infile, 'r') as f: rows = csv.reader(f, delimiter='\t') rows.next() for row in rows: for k,v in catalog_data.items(): iw = utils.asciidammit(row[0]) score = fuzz.ratio(iw, k) if score > 75: print iw, "-----", k, v
def change_name(x): try: return utils.asciidammit(x) except: print x return x