Example #1
def has_matching_brand_request_query(gear: Gear, q) -> bool:
    brand = gear.make

    if brand in (None, '') or q in (None, ''):
        return False

    similarity = fuzz.partial_ratio(asciidammit(q.lower()), asciidammit(brand.lower()))

    return similarity > 85
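A minimal usage sketch for the function above, assuming a fuzzywuzzy release that still ships the legacy asciidammit helper and a hypothetical Gear stand-in (the real model is not shown in the example):

from dataclasses import dataclass

from fuzzywuzzy import fuzz
from fuzzywuzzy.utils import asciidammit


@dataclass
class Gear:
    # Hypothetical stand-in for the real Gear model; only 'make' is needed here.
    make: str


# The query is a fuzzy substring of the brand, so partial_ratio should clear the 85 cutoff.
print(has_matching_brand_request_query(Gear(make='Specialized Bicycles'), 'specialized'))  # True
# Empty or missing values short-circuit to False.
print(has_matching_brand_request_query(Gear(make=''), 'specialized'))  # False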
Example #2
def standardize_cuisine(x):
    # Normalize to ASCII, then bucket common cuisine labels; anything else passes through.
    x = utils.asciidammit(x)
    lowered = x.lower()
    if 'italian' in lowered or 'pizza' in lowered:
        return 'Pizza/Italian'
    if 'latin' in lowered:
        return 'Latin'
    if 'cafe' in lowered or 'tea' in lowered or 'coffee' in lowered:
        return 'Cafe/Coffee/Tea'
    return x
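A quick illustration of how the bucketing behaves (assuming `from fuzzywuzzy import utils` is in scope):

print(standardize_cuisine('Italian Trattoria'))  # 'Pizza/Italian'
print(standardize_cuisine('Coffee House'))       # 'Cafe/Coffee/Tea'
print(standardize_cuisine('Thai'))               # unmatched labels pass through as 'Thai'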
Example #3
def text_normalize(raw):
    """
    Borrow normalization from fuzzywuzzy.
    This is ASCII-only; should eventually be replaced.
    """
    # Force to ASCII.
    araw = fuzzutils.asciidammit(raw)
    # full_process replaces non-alphanumerics with whitespace, lowercases, and trims.
    fuzzed = fuzzutils.full_process(araw)
    # Collapse runs of whitespace into single spaces.
    return ' '.join(fuzzed.split())
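A small sketch of what the normalization does, assuming `from fuzzywuzzy import utils as fuzzutils`; note that asciidammit drops non-ASCII characters rather than transliterating them:

# The accented character is dropped (not transliterated), punctuation becomes
# whitespace, case is folded, and runs of spaces collapse to one.
print(text_normalize('  Café  del   MAR!  '))  # roughly 'caf del mar'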
Example #4
def get_brokers():
    # Read the raw CSV and split each line into fields.
    with open('realty_broker.csv', 'r') as f:
        broker_raw = [line.strip().split(',') for line in f]
    brokers = []
    for row in broker_raw:
        if len(row) < 3:
            continue
        for broker in row[2:]:
            # Strip quotes/whitespace and force the broker name to ASCII.
            broker = asciidammit(broker.replace('"', '').strip())
            try:
                brokers.append(
                    dict(realty_broker_id=int(row[0]),
                         version=int(row[1]),
                         name=broker))
            except ValueError:
                # Skip rows whose id/version fields are not integers.
                pass
    brokers_df = pd.DataFrame(brokers)
    return brokers_df
Example #5
def semi_process(s, force_ascii=False):
    """
    Variation on fuzzywuzzy's full_process. Unlike full_process, it keeps
    non-alphanumeric characters (so consecutive spans stay intact) and keeps
    the original case (annotators marked verbatim spans, so case is a useful
    signal). It only:
    -- trims leading and trailing whitespace
    -- if force_ascii is True, force-converts the string to ASCII
    """

    if s is None:
        return ""

    if force_ascii:
        s = asciidammit(s)
    # Remove leading and trailing whitespace.
    string_out = StringProcessor.strip(s)
    return string_out
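A brief usage sketch, assuming asciidammit and StringProcessor come from classic fuzzywuzzy releases (fuzzywuzzy.utils and fuzzywuzzy.string_processing respectively):

from fuzzywuzzy.string_processing import StringProcessor
from fuzzywuzzy.utils import asciidammit

# Case and internal punctuation survive; only the edges are trimmed and,
# with force_ascii=True, non-ASCII characters are dropped.
print(semi_process('  Grand Café!  ', force_ascii=True))  # roughly 'Grand Caf!'
print(semi_process(None))                                 # ''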
Example #7
def _process_and_sort(self, s, token_set=False):
    '''
    Process and sort 's' in two modes (both start by lowercasing and, if
    force_ascii is set, ASCII-converting 's'):
    1) if token_set is True, drop non-alphanumeric characters and return the
       string of sorted characters.
    2) if token_set is False, split 's' on non-alphanumeric characters into
       'list_s'. If it contains more than one term, return a list holding the
       joined 'list_s' and, if different, the joined sorted 'list_s'.
       Otherwise, return a one-element list with the single term.
    '''
    if self.force_ascii:
        s = utils.asciidammit(s)
    s = s.lower()
    if token_set:
        # Drop everything that is not alphanumeric and sort the remaining characters.
        s = re.sub('[^0-9a-zA-Z]+', '', s)
        return ''.join(sorted(s))
    # Replace runs of non-alphanumeric chars with a single space and split into terms.
    s = re.sub('[^0-9a-zA-Z]+', ' ', s)
    list_s = s.split(' ')
    if len(list_s) > 1:
        if ''.join(sorted(list_s)) != ''.join(list_s):
            return [''.join(list_s), ''.join(sorted(list_s))]
        else:
            return [''.join(list_s)]
    else:
        return [list_s[0]]
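Since the method only reads self.force_ascii, it can be exercised directly with a tiny stand-in object; a sketch assuming re and fuzzywuzzy's utils are imported as in the snippet:

import re
from types import SimpleNamespace

from fuzzywuzzy import utils

cfg = SimpleNamespace(force_ascii=True)  # stand-in for the real owning object
print(_process_and_sort(cfg, 'Franklin Ben'))                  # ['franklinben', 'benfranklin']
print(_process_and_sort(cfg, 'Ben Franklin', token_set=True))  # all characters sorted: 'abefiklnnnr'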
Example #8
def test_asciionly(self):
    for s in self.mixed_strings:
        # asciionly only runs on strings, so normalize with asciidammit first
        s = utils.asciidammit(s)
        utils.asciionly(s)
Example #9
def test_asciidammit(self):
    for s in self.mixed_strings:
        utils.asciidammit(s)
Example #10
def _process(self, s, force_ascii=True):
    # Collapse a string to a bare lowercase alphanumeric key.
    if force_ascii:
        s = utils.asciidammit(s)
    s = s.lower()
    s = re.sub('[^0-9a-zA-Z]+', '', s)
    return s
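The method never touches self, so it can be called directly; a small sketch showing how near-duplicate strings collapse to the same key (assuming re and fuzzywuzzy's utils are imported):

# Punctuation, whitespace and case differences disappear, so both spellings
# map to the same key and can be compared or deduplicated directly.
print(_process(None, 'Acme, Inc.'))  # 'acmeinc'
print(_process(None, 'ACME INC'))    # 'acmeinc'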
Example #11
def token_clean(s1, force_ascii=True, full_process=True):
    sorted1 = process_and_sort(s1, force_ascii, full_process=full_process)
    sorted1 = utils.asciidammit(sorted1)
    return sorted1
# Could be a smarter script
# Run as `python match_existing.py [Path to tsv with names in first column]`
###############################################################################

import csv, json
import os, sys
import requests
from fuzzywuzzy import fuzz
from fuzzywuzzy import utils
import config

__USER = config.__USER
__PASS = config.__PASS
entities_all = 'https://catalog.interferencearchive.org/admin/service.php/find/ca_entities?q=*'

infile = sys.argv[1]

if __name__ == '__main__':

    catalog_entities = json.loads(requests.get(entities_all, auth=(__USER, __PASS)).content)['results']
    catalog_data = {i['display_label']: i['entity_id'] for i in catalog_entities}
    with open(infile, 'r') as f:
        rows = csv.reader(f, delimiter='\t')
        next(rows)  # skip the header row
        for row in rows:
            for k, v in catalog_data.items():
                iw = utils.asciidammit(row[0])
                score = fuzz.ratio(iw, k)
                if score > 75:
                    print(iw, "-----", k, v)
def change_name(x):
    try:
        return utils.asciidammit(x)
    except Exception:
        # Log the offending value and return it unchanged.
        print(x)
        return x
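change_name reads like a helper meant for pandas' Series.apply; a hypothetical sketch of that use (the DataFrame and column name are made up for illustration):

import pandas as pd
from fuzzywuzzy import utils

df = pd.DataFrame({'name': ['José Martí', 'Interference Archive']})  # made-up sample data
df['name_ascii'] = df['name'].apply(change_name)
print(df)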