def geocode(self, address, delay=5, read_cache=True, num_retries=1, language=None):
    """Look up an address through Google's geocoding API.

    address: the text handed to the geocode API
    delay: how long to delay between API requests
    read_cache: whether cached content may be used when it exists
    num_retries: the number of download attempts
    language: optional language appended to the request as ``&language=``

    Returns whatever ``parse_location`` produces for the first result, or an
    empty ``defaultdict(str)`` when the response contains no results.
    """
    try:
        address = address.encode('utf-8')
    except UnicodeDecodeError:
        common.logger.warning('Geocode failed to parse address and needed to cast to ascii: ' + address)
        address = common.to_ascii(address)

    if language:
        language_arg = '&language=' + language
    else:
        language_arg = ''
    geocode_url = 'http://maps.google.com/maps/api/geocode/json?address=%s&sensor=false%s' % (
        urllib.quote_plus(address), language_arg)

    geocode_html = self.D.get(
        geocode_url, delay=delay, read_cache=read_cache, num_retries=num_retries)
    geocode_data = self.load_result(geocode_url, geocode_html)

    # only the first result is of interest
    for first_result in geocode_data.get('results', []):
        return self.parse_location(first_result)
    return collections.defaultdict(str)
def geocode(self, address, delay=5, read_cache=True, num_retries=1, language=None, api_key=None):
    """Geocode address using Google's API and return dictionary of useful fields

    address: what to pass to geocode API
    delay: how long to delay between API requests
    read_cache: whether to load content from cache when exists
    num_retries: the number of times to try downloading
    language: the language to set
    api_key: sent as the ``key`` query parameter when the request is not
        answered from a legacy (key-less) cache entry

    Returns the ``parse_location`` output for the first result, or an empty
    ``defaultdict(str)`` when there are no results.
    """
    try:
        address = address.encode('utf-8')
    except UnicodeDecodeError:
        common.logger.debug('Geocode failed to parse address and needed to cast to ascii: ' + address)
        address = common.to_ascii(address)
    # drop percent-encoded %C2%9<digit> sequences from the quoted address
    address = re.sub(r'%C2%9\d', '', urllib.quote_plus(address))
    language_arg = '&language=' + language if language else ''

    # legacy data was cached under the http URL without an api key
    geocode_url = 'http://maps.google.com/maps/api/geocode/json?address=%s&sensor=false%s' % (address, language_arg)
    geocode_html = None
    if read_cache:
        # honour read_cache: a caller asking for a fresh download must not be
        # served the legacy cache entry
        try:
            geocode_html = self.D.cache[geocode_url]
        except KeyError:
            geocode_html = None

    if geocode_html:
        self.D.response_code = '200'
    else:
        geocode_url = 'https://maps.google.com/maps/api/geocode/json?address=%s&key=%s&sensor=false%s' % (
            address, api_key or '', language_arg)
        geocode_html = self.D.get(geocode_url, delay=delay, read_cache=read_cache, num_retries=num_retries)

    geocode_data = self.load_result(geocode_url, geocode_html)
    for result in geocode_data.get('results', []):
        return self.parse_location(result)
    return collections.defaultdict(str)
def sort_key(self):
    """Return the value that assigns the name its proper place in the alphabet.

    The key is built from ``guess_normal_form()``: commas and parentheses are
    removed, whitespace is normalised, the text is lower-cased, leading
    PREFIXES and a leading TUSSENVOEGSELS word are stripped, the result is
    passed through ``to_ascii`` and truncated to 40 characters.
    """
    base = self.guess_normal_form()
    base = base.replace(',', '')
    # collapse runs of whitespace to a single space and trim the ends
    # (the previous replace(' ', ' ') replaced a space with itself)
    base = ' '.join(base.split())
    base = base.lower()
    for c in '()':
        base = base.replace(c, '')

    # we also strip prefixes
    # (so "L'Eremite, B." sorts between "Eremite, A" and "Eremite, C")
    for s in PREFIXES:
        if base.startswith(s):
            base = base[len(s):]

    # drop a leading tussenvoegsel (first word only)
    ls = base.split()
    if ls and any(ls[0] == s for s in TUSSENVOEGSELS):
        base = ' '.join(ls[1:])

    # if the name starts with any of these characters, it comes last (not first)
    if base.startswith(('?', '.', '-')):
        base = chr(126) + base

    base = to_ascii(base)
    base = base.strip()
    return base[:40]
def __init__(
    self,
    ident=None,
    name=None,
    cou=None,
    birth_date=None,
    lefty: Optional[bool] = None,
    disp_names=None,
):
    """Store the given fields on the instance.

    ``name`` is stripped and passed through ``co.to_ascii`` unless it is
    None; ``cou`` gets the same treatment unless it is falsy, in which case
    it is stored as None. ``rating`` always starts as a fresh
    ``ratings.Rating()``.
    """
    self.ident = ident

    if name is None:
        self.name = None
    else:
        self.name = co.to_ascii(name.strip())

    if cou:
        self.cou = co.to_ascii(cou.strip())
    else:
        self.cou = None

    self.rating = ratings.Rating()
    self.birth_date = birth_date
    self.lefty: Optional[bool] = lefty
    self.disp_names = disp_names
def clean_content(self, html, max_size, force_html, force_ascii):
    """Clean up downloaded content.

    Returns '' when the content is longer than ``max_size`` (if given) or
    when ``force_html`` is set and ``common.is_html`` rejects it. Otherwise
    returns the content, passed through ``common.to_ascii`` when
    ``force_ascii`` is set.
    """
    if max_size is not None and len(html) > max_size:
        # too big to store
        common.logger.info('Too big: %s' % len(html))
        return ''
    if force_html and not common.is_html(html):
        # non-html content
        common.logger.info('Not html')
        return ''
    if force_ascii:
        # remove non-ascii characters
        return common.to_ascii(html)
    return html
def get_money(text):
    """Extract a monetary amount from text (using a regular expression).

    Commas are replaced with dots and the text is stripped and passed through
    ``co.to_ascii`` before ``MONEY_RE`` is matched against it. A "K"
    multiplier scales the value by 1000, "M" by 1000000.

    Returns None for empty input, when nothing matches, or when the matched
    multiplier is neither "K" nor "M".
    """
    if not text:
        return None

    normalized = co.to_ascii(text.replace(",", ".").strip())
    found = MONEY_RE.match(normalized)
    if not found:
        return None

    amount = float(found.group("value"))
    suffix = found.group("multier")
    if suffix == "K":
        return amount * 1000
    if suffix == "M":
        return amount * 1000000
    return None
def soundex_nl(s, length=4, group=1, wildcards=False):
    """Return a phonetic canonical form of s, cut to ``length`` characters.

    A stab at giving names a simplified canonical form based on Dutch
    phonetics and spelling conventions.

    arguments:
        s : a string
        length : an integer; a value of -1 transforms the whole string
        group : an integer, 1 or 2
        wildcards : if True, wildcard characters (?, *) remain in place

    There are two groups:
        - group 1: identify lots
        - group 2: identify somewhat less (stay close to actual phonetics)

    Raises Exception when ``group`` is neither 1 nor 2.
    """
    # Roman numerals are returned untouched
    if s in ROMANS_frozenset:
        return s

    s = to_ascii(s.lower())

    if not wildcards:
        # remove 'wildcard' characters
        for wildcard in '*?':
            s = s.replace(wildcard, '')

    # XXX stripping PREFIXES is disabled here; it should be in the regular
    # expression specs

    if group == 1:
        groups = GROUPS1
    elif group == 2:
        groups = GROUPS2
    else:
        raise Exception('"group" argument must be either 1 or 2')

    s = apply_regexps(s, groups)
    s = s[:-1] if s.endswith('.') else s
    s = s or u'.'
    if length > 0:
        s = s[:length]
    return unicode(s)
def from_oncourt(raw_name):
    """Build a TourName from a raw name string.

    The raw name is passed through ``to_ascii`` and split on " - " into a
    description part and a name part; when the name part comes back empty the
    whole string is used as the name and the description is empty.

    A trailing " <digit>" is removed from the name and stored as ``number``;
    a number of 1 is stored as None.
    """
    from common import to_ascii, split_ontwo

    raw = to_ascii(raw_name)
    desc_name, name = split_ontwo(raw, delim=" - ")
    if name == "":
        name = raw
        desc_name = ""

    number = None
    # the length guard keeps name[-2] from raising IndexError on a name that
    # is a single digit character
    if name and len(name) >= 2 and name[-1].isdigit() and name[-2] == " ":
        number = int(name[-1])
        if number == 1:
            number = None  # to compare with flashscore 1-less name
        name = name[:-2]
    return TourName(name=name, desc_name=desc_name, number=number)
def get_name_cou(is_left):
    """Read a participant's name and country code from ``element``.

    is_left: selects the "home" participant div when true, "away" otherwise.

    Returns a ``(name, cou)`` tuple: the text before the first "(" and the
    upper-cased text of up to three characters after it.

    Raises ``co.TennisParseError`` when the participant div is not found.

    ``element``, ``live_event`` and ``tourinfo_cache`` are not parameters;
    they come from an enclosing scope not shown here.
    """
    inclass_txt = "event__participant--{}".format(
        "home" if is_left else "away")
    xpath_txt = "child::div[contains(@class, '{}')]".format(inclass_txt)
    try:
        plr_el = co.find_first_xpath(element, xpath_txt)
        if plr_el is None:
            raise co.TennisParseError("nofound plr_el in \n{}".format(
                etree.tostring(element, encoding="utf8")))
        txt = co.to_ascii(plr_el.text).strip()
        # str.index raises ValueError when there is no "(" in the text
        bracket_idx = txt.index("(")
        name = txt[0:bracket_idx].strip()
        cou = txt[bracket_idx + 1:bracket_idx + 4].strip().upper()
        return name, cou
    except ValueError:
        # maybe it is team's result div without '(cou)':
        # NOTE(review): as visible here the handler does not return or
        # re-raise, so the function yields None on this path - confirm that
        # callers expect that.
        tourinfo_cache.edit_skip(live_event.tour_info)
def __init__(self, name):
    """Classify a surface description into one canonical surface name.

    The description is stripped, passed through ``co.to_ascii`` and
    lower-cased, then checked in this order: Carpet, Grass, Clay, Hard,
    Acrylic. The first match is stored in ``self.name``.

    Raises ``co.TennisSurfaceError`` when nothing matches.
    """
    name_low = co.to_ascii(name.strip()).lower()

    has_carpet = "carpet" in name_low
    has_hard = "hard" in name_low
    has_outdoor = "outdoor" in name_low
    is_indoor = "indoor" in name_low or name_low.startswith("i.")

    if (has_carpet and not has_outdoor) or (has_hard and is_indoor):
        surface = "Carpet"
    elif "grass" in name_low:
        surface = "Grass"
    elif "clay" in name_low:
        surface = "Clay"
    elif has_hard or (has_carpet and has_outdoor):
        surface = "Hard"
    elif "acrylic" in name_low:
        surface = "Acrylic"
    else:
        raise co.TennisSurfaceError("unexpected surface '{}'".format(name))
    self.name = surface
def _clean_content(self, html, max_size, force_html, force_ascii):
    """Clean up downloaded content.

    html: the input to clean
    max_size: the maximum size of data allowed (None disables the check)
    force_html: content must be HTML
    force_ascii: content must be ASCII

    Returns '' for content that is too big or fails the HTML check, the
    ``common.to_ascii`` result when ``force_ascii`` is set, and the input
    unchanged otherwise.
    """
    if max_size is not None and len(html) > max_size:
        # too big to store
        common.logger.info('Webpage is too big: %s' % len(html))
        return ''
    if force_html and not common.is_html(html):
        # non-html content
        common.logger.info('Webpage is not html')
        return ''
    if force_ascii:
        # remove non-ascii characters
        return common.to_ascii(html)
    return html
def get(self, url, **kwargs):
    """Download this URL and return the HTML.

    Data is cached so only have to download once.

    url is what to download
    kwargs can override these arguments passed to the constructor: delay,
    proxy, headers, data, use_cache, use_remote, retry, force_html,
    force_ascii, max_size

    Returns the cached or downloaded content. Returns "" when nothing usable
    is cached and use_remote is false, or when the download exceeds max_size
    or fails the force_html check.
    """
    # NOTE(review): "user_agent" and "opener" overrides used to be read here
    # but were never passed on to fetch(); the unused lookups were removed.
    delay = kwargs.get("delay", self.delay)
    proxy = kwargs.get("proxy", self.proxy)
    headers = kwargs.get("headers", self.headers)
    data = kwargs.get("data", self.data)
    use_cache = kwargs.get("use_cache", self.use_cache)
    use_remote = kwargs.get("use_remote", self.use_remote)
    retry = kwargs.get("retry", self.retry)
    force_html = kwargs.get("force_html", self.force_html)
    force_ascii = kwargs.get("force_ascii", self.force_ascii)
    max_size = kwargs.get("max_size", self.max_size)

    if use_cache and url in self.cache:
        html = self.cache[url]
        if retry and not html:
            # an empty cache entry is downloaded again when retry is set
            print("Redownloading")
        else:
            return html
    if not use_remote:
        return ""  # do not try downloading but return empty

    print(url)  # need to download url
    # crawl slowly for each domain to reduce risk of being blocked
    self.domain_delay(url, delay=delay, proxy=proxy)
    html = self.fetch(url, headers=headers, data=data)

    if max_size is not None and len(html) > max_size:
        html = ""  # too big to store
    elif force_html and not re.search("html|head|body", html):
        html = ""  # non-html content
    elif force_ascii:
        html = to_ascii(html)  # remove non-ascii characters

    self.cache[url] = html
    return html
def __init__(self, value: str, details=None):
    """Store the round value and its optional details.

    ``value`` is stripped and passed through ``co.to_ascii``; the result is
    asserted to be a member of ``Round.names``.
    """
    cleaned = co.to_ascii(value.strip())
    self.value = cleaned
    self._detailing = details
    assert cleaned in Round.names, "unexpected rnd value: {}".format(
        cleaned)
def soundex_nl(s, length=4, group=1):
    """Stab at giving names a simplified canonical form.

    The trick is to find a form such that as many 'similar-looking' names as
    possible get the same "soundex".

    There are different "groups":
        - group 1: identify as much as possible
        - group 2: identify somewhat less

    length gives the length of the result; length = -1 gives the whole
    string.

    Raises ValueError when ``group`` is neither 1 nor 2.
    """
    s = s.lower()
    s = to_ascii(s)

    # strip off certain prefixes
    for x in PREFIXES + ['h']:
        if s.startswith(x):
            s = s[len(x):]

    if group == 1:
        groups = GROUPS1
    elif group == 2:
        groups = GROUPS2
    else:
        raise ValueError('"group" argument must be either 1 or 2')

    # apply each (replacement, compiled regexp) pair until it no longer matches
    for k, regexp in groups:
        s = regexp.sub(k, s)
        while regexp.search(s):
            s = regexp.sub(k, s)

    if s.endswith('.'):
        s = s[:-1]
    if not s:
        s = '.'
    if length > 0:
        s = s[:length]
    s = str(s)
    return s
def geslachtsnaam_soundex(self):
    """Return the group-2, untruncated soundex of ``geslachtsnaam()``.

    The geslachtsnaam is passed through ``to_ascii`` before ``soundex_nl`` is
    applied with ``length=-1``.
    """
    ascii_geslachtsnaam = to_ascii(self.geslachtsnaam())
    return self.soundex_nl(ascii_geslachtsnaam, group=2, length=-1)
def __init__(self, name):
    """Store the ``co.to_ascii`` form of name and initialise the syllable fields.

    ``parts`` and ``syllabled`` start empty and ``_init_parts()`` is called
    last (presumably to fill them - see that method).
    """
    self.name = co.to_ascii(name)
    # parts: lower case syllables
    # syllabled: lower case syllables separated by '-'
    self.parts, self.syllabled = [], ""
    self._init_parts()
def ratio(self, n1, n2, explain=0):
    """Combine several parameters to find a similarity ratio.

    n1, n2: the two name objects to compare; both must provide
        guess_normal_form(), soundex_nl(), geslachtsnaam(), initials() and
        contains_initials()
    explain: when true, return a human-readable report of the computation
        instead of the number

    Returns the weighted average of five partial ratios (normal form, soundex
    of the normal form, soundex of the geslachtsnaam, the geslachtsnaam
    itself, and the initials), or 0.0 when the weights sum to zero.
    """
    weight_normal_form = 3.0  # distance between normal forms
    weight_normal_form_if_one_name_is_in_initials = old_div(
        weight_normal_form, 4)
    weight_normal_form_soundex = 9.0  # average distance between soundexes of normal form
    weight_normal_form_soundex_if_one_name_is_in_initials = old_div(
        weight_normal_form_soundex, 4)
    weight_geslachtsnaam1 = 7.0  # distance between soundexes of geslachtsnamen
    weight_geslachtsnaam2 = 7.0  # distance between geslachtsnaam
    weight_initials = 2  # distance between initials
    # distance between initials if one of the names is in initials
    # (for example, "A.B Classen")
    weight_initials_if_one_name_is_in_initials = weight_initials * 2

    ### NORMAL FORM
    nf1 = n1.guess_normal_form()
    nf2 = n2.guess_normal_form()

    # remove diacritics
    nf1 = to_ascii(nf1)
    nf2 = to_ascii(nf2)
    ratio_normal_form = self.average_distance(split(nf1), split(nf2))

    # create a simplified soundex set for this name
    # remove stopwords
    nf1 = remove_stopwords(nf1)
    nf2 = remove_stopwords(nf2)

    # we use the soundex_nl property of the name, so the property gets cached
    se1 = n1.soundex_nl(nf1, group=2, length=-1)
    se2 = n2.soundex_nl(nf2, group=2, length=-1)
    ratio_normal_form_soundex = self.average_distance(se1, se2)

    # the geslachtsnaam is compared in two ways:
    # first, the soundexes of the geslachtsnaam are weighed in
    g1 = n1.geslachtsnaam()  # or n1.get_volledige_naam()
    g2 = n2.geslachtsnaam()  # or n2.get_volledige_naam()
    g1 = to_ascii(g1)
    g2 = to_ascii(g2)
    g1_soundex = n1.soundex_nl(g1, group=2, length=-1)
    g2_soundex = n2.soundex_nl(g2, group=2, length=-1)
    ratio_geslachtsnaam1 = self.average_distance(g1_soundex, g2_soundex)

    # and second, the distance between the words of the geslachtsnaam itself
    ratio_geslachtsnaam2 = self.average_distance(
        re.split(r'[ \.\,\-]', g1.lower()),
        re.split(r'[ \.\,\-]', g2.lower()),
        self.levenshtein_ratio)

    # count initials only if we have more than one
    # (or perhaps make this: if we know the first name)
    if len(n1.initials()) == 1 or len(n2.initials()) == 1:
        # initials count much less if there is only one
        weight_initials = 0
        ratio_initials = .5
    elif n1.contains_initials() or n2.contains_initials():
        ratio_initials = self.levenshtein_ratio(n1.initials().lower(),
                                                n2.initials().lower())
        weight_initials = weight_initials_if_one_name_is_in_initials
    elif len(n1.initials()) > 1 and len(n2.initials()) > 1:
        ratio_initials = self.levenshtein_ratio(n1.initials().lower(),
                                                n2.initials().lower())
    else:
        ratio_initials = 0.7

    if n1.contains_initials() or n2.contains_initials():
        weight_normal_form = weight_normal_form_if_one_name_is_in_initials
        weight_normal_form_soundex = weight_normal_form_soundex_if_one_name_is_in_initials

    try:
        # teller = numerator, noemer = denominator
        teller = ratio_normal_form * weight_normal_form + ratio_normal_form_soundex * weight_normal_form_soundex + ratio_geslachtsnaam1 * weight_geslachtsnaam1 + ratio_geslachtsnaam2 * weight_geslachtsnaam2 + ratio_initials * weight_initials
        noemer = weight_normal_form + weight_normal_form_soundex + weight_initials + weight_geslachtsnaam1 + weight_geslachtsnaam2
        final_ratio = old_div(teller, noemer)
    except ZeroDivisionError:
        return 0.0

    if explain:

        def relative(amount):
            # teller is 0 when every weighted ratio is 0; report 0.0 rather
            # than raising ZeroDivisionError outside the try block above
            return old_div(amount, teller) if teller else 0.0

        s = '-' * 100 + '\n'
        s += 'Naam1: %s [%s] [%s] %s\n' % (n1, n1.initials(),
                                           n1.guess_normal_form(), se1)
        s += 'Naam2: %s [%s] [%s] %s\n' % (n2, n2.initials(),
                                           n2.guess_normal_form(), se2)
        s += 'Similarity ratio: %s\n' % final_ratio
        s += '--- REASONS' + '-' * 30 + '\n'
        # last column used to be "%s-10s", which printed a literal "-10s"
        format_s = '%-30s | %-10s | %-10s | %-10s | %-10s | %-10s\n'
        s += format_s % ('\t property', ' ratio', ' weight',
                         'relative_weight', ' r*w', 'r * relative_w')
        s += '\t' + '-' * 100 + '\n'
        format_s = '\t%-30s | %-10f | %-10f | %-10f | %-10f | %-10f\n'
        rows = [
            (' normal_form', ratio_normal_form, weight_normal_form),
            ('soundex van normal_form', ratio_normal_form_soundex,
             weight_normal_form_soundex),
            ('soundex van geslachtsnaam1', ratio_geslachtsnaam1,
             weight_geslachtsnaam1),
            ('geslachtsnaam', ratio_geslachtsnaam2, weight_geslachtsnaam2),
            ('initials', ratio_initials, weight_initials),
        ]
        for label, part_ratio, part_weight in rows:
            s += format_s % (label, part_ratio, part_weight,
                             relative(part_weight),
                             part_ratio * part_weight,
                             relative(part_ratio * part_weight))
        s += '\tTOTAL (numerator) | %s (counter = %s)\n' % (
            teller, noemer)
        return s
    return final_ratio