def geocode(self, address, delay=5, read_cache=True, num_retries=1, language=None):
        """Look up an address with Google's geocode API.

        address:
            the text to send to the geocode API
        delay:
            how long to delay between API requests
        read_cache:
            whether cached content may be used when it exists
        num_retries:
            how many times to try downloading
        language:
            optional language appended to the request as a query parameter

        Returns self.parse_location() of the first result, or an empty
        collections.defaultdict(str) when the response holds no results.
        """
        try:
            address = address.encode('utf-8')
        except UnicodeDecodeError:
            # encoding failed, so fall back to the ASCII form of the address
            common.logger.warning('Geocode failed to parse address and needed to cast to ascii: ' + address)
            address = common.to_ascii(address)

        quoted_address = urllib.quote_plus(address)
        language_param = '&language=' + language if language else ''
        geocode_url = 'http://maps.google.com/maps/api/geocode/json?address=%s&sensor=false%s' % (quoted_address, language_param)

        geocode_html = self.D.get(geocode_url, delay=delay, read_cache=read_cache, num_retries=num_retries)
        geocode_data = self.load_result(geocode_url, geocode_html)

        # only the first result (if any) is used
        no_result = object()
        first_result = next(iter(geocode_data.get('results', [])), no_result)
        if first_result is no_result:
            return collections.defaultdict(str)
        return self.parse_location(first_result)
Example #2
0
    def geocode(self, address, delay=5, read_cache=True, num_retries=1, language=None, api_key=None):
        """Geocode address using Google's API and return dictionary of useful fields

        address:
            what to pass to geocode API
        delay:
            how long to delay between API requests
        read_cache:
            whether to load content from cache when exists; this also covers
            legacy cache entries stored under the key-less URL
        num_retries:
            the number of times to try downloading
        language:
            the language to set
        api_key:
            the API key sent with the request (an empty key is sent when omitted)

        Returns self.parse_location() of the first result, or an empty
        collections.defaultdict(str) when the response holds no results.
        """
        try:
            address = address.encode('utf-8')
        except UnicodeDecodeError:
            common.logger.debug('Geocode failed to parse address and needed to cast to ascii: ' + address)
            address = common.to_ascii(address)
        # remove percent-encoded %C2%9x sequences from the quoted address
        address = re.sub(r'%C2%9\d', '', urllib.quote_plus(address))
        language_param = '&language=' + language if language else ''
        geocode_url = 'http://maps.google.com/maps/api/geocode/json?address=%s&sensor=false%s' % (address, language_param)

        # legacy data without api key; skipped when the caller asked not to read the cache
        geocode_html = None
        if read_cache:
            try:
                geocode_html = self.D.cache[geocode_url]
            except KeyError:
                geocode_html = None

        if geocode_html:
            self.D.response_code = '200'
        else:
            # no usable legacy entry: request the key-based URL instead
            geocode_url = 'https://maps.google.com/maps/api/geocode/json?address=%s&key=%s&sensor=false%s' % (address, api_key or '', language_param)
            geocode_html = self.D.get(geocode_url, delay=delay, read_cache=read_cache, num_retries=num_retries)
        geocode_data = self.load_result(geocode_url, geocode_html)
        for result in geocode_data.get('results', []):
            # only the first result is used
            return self.parse_location(result)
        return collections.defaultdict(str)
Example #3
0
    def sort_key(self):
        """Return the string that gives this name its proper place in the alphabet.

        The key is derived from guess_normal_form(): commas and parentheses
        are removed, the text is lower-cased, leading PREFIXES and a leading
        word from TUSSENVOEGSELS are dropped, and the result is passed
        through to_ascii and cut to 40 characters.
        """
        key = self.guess_normal_form().replace(',', '').replace('  ', ' ')
        key = key.strip().lower()
        for bracket in '()':
            key = key.replace(bracket, '')

        # prefixes are stripped too (so "L'Eremite, B." sorts between "Eremite, A" and "Eremite, C")
        for prefix in PREFIXES:
            if key.startswith(prefix):
                key = key[len(prefix):]

        words = key.split()
        if words and any(words[0] == tussenvoegsel for tussenvoegsel in TUSSENVOEGSELS):
            key = ' '.join(words[1:])

        # a name starting with any of these characters comes last (not first)
        if key.startswith(('?', '.', '-')):
            key = '~' + key

        return to_ascii(key).strip()[:40]
Example #4
0
 def __init__(
     self,
     ident=None,
     name=None,
     cou=None,
     birth_date=None,
     lefty: Optional[bool] = None,
     disp_names=None,
 ):
     """Initialise the instance from the given attributes.

     name is stripped and passed through co.to_ascii whenever it is not
     None; cou gets the same treatment only when it is truthy, otherwise
     it is stored as None. rating starts as a fresh ratings.Rating().
     """
     self.ident = ident

     if name is None:
         self.name = None
     else:
         self.name = co.to_ascii(name.strip())

     if cou:
         self.cou = co.to_ascii(cou.strip())
     else:
         self.cou = None

     self.rating = ratings.Rating()
     self.birth_date = birth_date
     self.lefty: Optional[bool] = lefty
     self.disp_names = disp_names
Example #5
0
 def clean_content(self, html, max_size, force_html, force_ascii):
     """Filter downloaded content before it is kept.

     Returns an empty string when html is longer than max_size (if max_size
     is given) or when force_html is set and common.is_html() rejects it.
     Otherwise returns html, passed through common.to_ascii() when
     force_ascii is set.
     """
     too_big = max_size is not None and len(html) > max_size
     if too_big:
         common.logger.info('Too big: %s' % len(html))
         return ''  # too big to store
     if force_html and not common.is_html(html):
         common.logger.info('Not html')
         return ''  # non-html content
     if force_ascii:
         return common.to_ascii(html)  # remove non-ascii characters
     return html
Example #6
0
 def clean_content(self, html, max_size, force_html, force_ascii):
     """Filter downloaded content before it is kept.

     Returns an empty string when html is longer than max_size (if max_size
     is given) or when force_html is set and common.is_html() rejects it.
     Otherwise returns html, passed through common.to_ascii() when
     force_ascii is set.
     """
     exceeds_limit = max_size is not None and len(html) > max_size
     if exceeds_limit:
         common.logger.info('Too big: %s' % len(html))
         return ''  # too big to store
     if force_html and not common.is_html(html):
         common.logger.info('Not html')
         return ''  # non-html content
     if force_ascii:
         return common.to_ascii(html)  # remove non-ascii characters
     return html
Example #7
0
def get_money(text):
    """Extract a monetary amount from text (using a regular expression).

    Returns None for empty input, when MONEY_RE does not match, or when
    the matched multiplier is neither "K" nor "M"; otherwise the matched
    value multiplied by 1000 ("K") or 1000000 ("M").
    """
    if not text:
        return None
    normalized = co.to_ascii(text.replace(",", ".").strip())
    found = MONEY_RE.match(normalized)
    if not found:
        return None
    amount = float(found.group("value"))
    factor = {"K": 1000, "M": 1000000}.get(found.group("multier"))
    if factor is None:
        return None
    return amount * factor
Example #8
0
def soundex_nl(s, length=4, group=1, wildcards=False):
    """
    Return a phonetic canonical form of s.

    This is a stab at giving names a simplified canonical form based on
    Dutch phonetics and spelling conventions.

    arguments:
        s : a string
        length : an integer; a positive value cuts the result to that many
            characters, any other value (e.g. -1) keeps the whole string
        group : an integer [1, 2]
        wildcards : if True, wildcard characters (?, *) remain in place

    There are two groups:
        - group 1: identify lots
        - group 2: identify somewhat less (stay close to actual phonetics)

    Members of ROMANS_frozenset are returned unchanged. An Exception is
    raised when group is neither 1 nor 2.
    """
    # ignore Romans
    if s in ROMANS_frozenset:
        return s

    s = to_ascii(s.lower())
    if not wildcards:
        # remove 'wildcard' characters
        for wildcard in '*?':
            s = s.replace(wildcard, '')

    # NOTE: prefixes are not stripped here.
    # XXX this should be in the regular expression specs
    if group not in (1, 2):
        raise Exception('"group" argument must be either 1 or 2')
    groups = GROUPS1 if group == 1 else GROUPS2

    s = apply_regexps(s, groups)

    if s.endswith('.'):
        s = s[:-1]
    s = s or u'.'
    if length > 0:
        s = s[:length]

    return unicode(s)
Example #9
0
    def from_oncourt(raw_name):
        """Build a TourName from a raw name string.

        The raw name is passed through to_ascii and split on " - " into a
        description part and a name part. When the name part comes back
        empty, the whole string is used as the name and the description is
        left empty. A trailing " <digit>" is removed from the name and kept
        as number, except that 1 is stored as None.
        """
        from common import to_ascii, split_ontwo

        raw = to_ascii(raw_name)
        desc_name, name = split_ontwo(raw, delim=" - ")
        if name == "":
            name = raw
            desc_name = ""
        number = None
        # Slice instead of indexing name[-2]: a one-character name such as
        # "5" must not raise IndexError.
        if name and name[-1].isdigit() and name[-2:-1] == " ":
            number = int(name[-1])
            if number == 1:
                number = None  # to compare with flashscore 1-less name
            name = name[:-2]
        return TourName(name=name, desc_name=desc_name, number=number)
Example #10
0
 def get_name_cou(is_left):
     """Read (name, cou) from the home or away participant div of element.

     is_left selects the "home" div, otherwise "away". The text before
     "(" is the name; up to three characters after it, upper-cased, are
     cou. When a ValueError occurs (e.g. the text has no "("), the
     event's tour_info is passed to tourinfo_cache.edit_skip and nothing
     is returned.
     """
     side = "home" if is_left else "away"
     inclass_txt = "event__participant--{}".format(side)
     xpath_txt = "child::div[contains(@class, '{}')]".format(inclass_txt)
     try:
         plr_el = co.find_first_xpath(element, xpath_txt)
         if plr_el is None:
             dumped = etree.tostring(element, encoding="utf8")
             raise co.TennisParseError("nofound plr_el in \n{}".format(dumped))
         txt = co.to_ascii(plr_el.text).strip()
         open_idx = txt.index("(")
         name = txt[:open_idx].strip()
         cou = txt[open_idx + 1:open_idx + 4].strip().upper()
         return name, cou
     except ValueError:
         # maybe it is team's result div without '(cou)':
         tourinfo_cache.edit_skip(live_event.tour_info)
Example #11
0
 def __init__(self, name):
     """Classify a surface description as Carpet, Grass, Clay, Hard or Acrylic.

     The description is stripped, passed through co.to_ascii and
     lower-cased before keyword matching. Raises co.TennisSurfaceError
     when no keyword matches.
     """
     lowered = co.to_ascii(name.strip()).lower()
     has_carpet = "carpet" in lowered
     has_hard = "hard" in lowered
     has_outdoor = "outdoor" in lowered
     looks_indoor = "indoor" in lowered or lowered.startswith("i.")

     if (has_carpet and not has_outdoor) or (has_hard and looks_indoor):
         surface = "Carpet"
     elif "grass" in lowered:
         surface = "Grass"
     elif "clay" in lowered:
         surface = "Clay"
     elif has_hard or (has_carpet and has_outdoor):
         surface = "Hard"
     elif "acrylic" in lowered:
         surface = "Acrylic"
     else:
         raise co.TennisSurfaceError("unexpected surface '{}'".format(name))
     self.name = surface
Example #12
0
    def _clean_content(self, html, max_size, force_html, force_ascii):
        """Clean up downloaded content

        html:
            the input to clean
        max_size:
            the maximum size of data allowed
        force_html:
            content must be HTML
        force_ascii:
            content must be ASCII
        """
        if max_size is not None and len(html) > max_size:
            common.logger.info('Webpage is too big: %s' % len(html))
            html = ''  # too big to store
        elif force_html and not common.is_html(html):
            common.logger.info('Webpage is not html')
            html = ''  # non-html content
        elif force_ascii:
            html = common.to_ascii(html)  # remove non-ascii characters
        return html
Example #13
0
    def _clean_content(self, html, max_size, force_html, force_ascii):
        """Clean up downloaded content

        html:
            the input to clean
        max_size:
            the maximum size of data allowed
        force_html:
            content must be HTML
        force_ascii:
            content must be ASCII
        """
        if max_size is not None and len(html) > max_size:
            common.logger.info('Webpage is too big: %s' % len(html))
            html = '' # too big to store
        elif force_html and not common.is_html(html):
            common.logger.info('Webpage is not html')
            html = '' # non-html content
        elif force_ascii:
            html = common.to_ascii(html) # remove non-ascii characters
        return html
Example #14
0
    def get(self, url, **kwargs):
        """Download this URL and return the HTML. Data is cached so only have to download once.

        url is what to download
        kwargs can override any of the arguments passed to constructor

        Returns the cached HTML when use_cache is set and the URL is in the
        cache (unless retry is set and the cached value is empty), an empty
        string when use_remote is false, and otherwise the fetched content.
        Fetched content is replaced by an empty string when it is longer
        than max_size or fails the force_html check, is passed through
        to_ascii when force_ascii is set, and is then stored in the cache.
        """
        # NOTE: "user_agent" and "opener" overrides are accepted in kwargs
        # but are not used by this method.
        delay = kwargs.get("delay", self.delay)
        proxy = kwargs.get("proxy", self.proxy)
        headers = kwargs.get("headers", self.headers)
        data = kwargs.get("data", self.data)
        use_cache = kwargs.get("use_cache", self.use_cache)
        use_remote = kwargs.get("use_remote", self.use_remote)
        retry = kwargs.get("retry", self.retry)
        force_html = kwargs.get("force_html", self.force_html)
        force_ascii = kwargs.get("force_ascii", self.force_ascii)
        max_size = kwargs.get("max_size", self.max_size)

        if use_cache and url in self.cache:
            html = self.cache[url]
            if retry and not html:
                # parenthesised form prints the same on Python 2 and 3
                print("Redownloading")
            else:
                return html
        if not use_remote:
            return ""  # do not try downloading but return empty

        print(url)  # need to download url
        self.domain_delay(url, delay=delay, proxy=proxy)  # crawl slowly for each domain to reduce risk of being blocked
        html = self.fetch(url, headers=headers, data=data)
        if max_size is not None and len(html) > max_size:
            html = ""  # too big to store
        elif force_html and not re.search("html|head|body", html):
            html = ""  # non-html content
        elif force_ascii:
            html = to_ascii(html)  # remove non-ascii characters
        self.cache[url] = html
        return html
Example #15
0
 def __init__(self, value: str, details=None):
     """Store the stripped, to_ascii-converted value together with details.

     Asserts that the resulting value is one of Round.names.
     """
     stripped = value.strip()
     self.value = co.to_ascii(stripped)
     self._detailing = details
     assert self.value in Round.names, (
         "unexpected rnd value: {}".format(self.value))
Example #16
0
def soundex_nl(s, length=4, group=1):
    """
    stab at giving names a simplified canonical form

    The trick is to find a form such that as many 'similar' names as
    possible get the same "soundex"...

    There are different "groups"
        - group 1: identify as much as possible
        - group 2: identify somewhat less

    length gives the length of the result. length = -1 gives the whole string

    Raises Exception when group is neither 1 nor 2.
    """
    s = to_ascii(s.lower())

    # strip off certain prefixes
    for prefix in PREFIXES + ['h']:
        if s.startswith(prefix):
            s = s[len(prefix):]

    if group == 1:
        groups = GROUPS1
    elif group == 2:
        groups = GROUPS2
    else:
        raise Exception('"group" argument must be either 1 or 2')

    for replacement, regexp in groups:
        # substitute at least once, then repeat while the pattern still matches
        s = regexp.sub(replacement, s)
        while regexp.search(s):
            s = regexp.sub(replacement, s)

    if s.endswith('.'):
        s = s[:-1]
    if not s:
        s = '.'
    if length > 0:
        s = s[:length]
    return str(s)
Example #17
0
 def geslachtsnaam_soundex(self):
     """Return soundex_nl (group=2, length=-1) of the to_ascii form of geslachtsnaam()."""
     ascii_geslachtsnaam = to_ascii(self.geslachtsnaam())
     return self.soundex_nl(ascii_geslachtsnaam, group=2, length=-1)
Example #18
0
 def __init__(self, name):
     """Store the to_ascii form of name, reset the syllable fields and call _init_parts()."""
     self.name = co.to_ascii(name)
     # parts: lower case syllables
     # syllabled: lower case syllables separated by '-'
     self.parts, self.syllabled = [], ""
     self._init_parts()
Example #19
0
    def ratio(self, n1, n2, explain=0):
        """Combine several parameters to find a similarity ratio

        The ratio is the weighted average of five partial ratios: the
        normal forms, the soundexes of the normal forms, the soundexes of
        the geslachtsnamen, the geslachtsnamen themselves, and the initials.

        n1, n2:
            the two names to compare
        explain:
            when true, return a textual report of the computation instead
            of the number

        Returns the similarity ratio, 0.0 when computing it raises
        ZeroDivisionError, or the report string when explain is set.
        """
        weight_normal_form = 3.0  # weight of the distance between the normal forms
        weight_normal_form_if_one_name_is_in_initials = old_div(
            weight_normal_form, 4)
        weight_normal_form_soundex = 9.0  # weight of the average distance between soundexes of the normal forms
        weight_normal_form_soundex_if_one_name_is_in_initials = old_div(
            weight_normal_form_soundex, 4)
        weight_geslachtsnaam1 = 7.0  # weight of the distance between soundexes of the geslachtsnamen
        weight_geslachtsnaam2 = 7.0  # weight of the distance between the geslachtsnamen themselves
        weight_initials = 2  # weight of the distance between initials
        # used when one of the names is in initials (for example, "A.B Classen")
        weight_initials_if_one_name_is_in_initials = weight_initials * 2

        ### NORMAL FORM
        # to_ascii removes diacritics
        nf1 = to_ascii(n1.guess_normal_form())
        nf2 = to_ascii(n2.guess_normal_form())
        ratio_normal_form = self.average_distance(split(nf1), split(nf2))

        # create a simplified soundex set for each name, without stopwords
        nf1 = remove_stopwords(nf1)
        nf2 = remove_stopwords(nf2)
        # we use the soundex_nl property of the name, so the property gets cached
        se1 = n1.soundex_nl(nf1, group=2, length=-1)
        se2 = n2.soundex_nl(nf2, group=2, length=-1)
        ratio_normal_form_soundex = self.average_distance(se1, se2)

        ### GESLACHTSNAAM: compared in two ways
        # 1. the soundexes of the geslachtsnaam
        g1 = to_ascii(n1.geslachtsnaam())
        g2 = to_ascii(n2.geslachtsnaam())
        g1_soundex = n1.soundex_nl(g1, group=2, length=-1)
        g2_soundex = n2.soundex_nl(g2, group=2, length=-1)
        ratio_geslachtsnaam1 = self.average_distance(g1_soundex, g2_soundex)

        # 2. the distance between the words of the geslachtsnaam itself
        ratio_geslachtsnaam2 = self.average_distance(
            re.split(r'[ \.\,\-]', g1.lower()),
            re.split(r'[ \.\,\-]', g2.lower()), self.levenshtein_ratio)

        ### INITIALS
        initials1 = n1.initials()
        initials2 = n2.initials()
        one_name_in_initials = n1.contains_initials() or n2.contains_initials()

        if len(initials1) == 1 or len(initials2) == 1:
            # initials do not count at all if there is only one
            weight_initials = 0
            ratio_initials = .5
        elif one_name_in_initials:
            ratio_initials = self.levenshtein_ratio(initials1.lower(),
                                                    initials2.lower())
            weight_initials = weight_initials_if_one_name_is_in_initials
        elif len(initials1) > 1 and len(initials2) > 1:
            ratio_initials = self.levenshtein_ratio(initials1.lower(),
                                                    initials2.lower())
        else:
            ratio_initials = 0.7

        if one_name_in_initials:
            weight_normal_form = weight_normal_form_if_one_name_is_in_initials
            weight_normal_form_soundex = weight_normal_form_soundex_if_one_name_is_in_initials

        try:
            # teller = numerator, noemer = denominator
            teller = (ratio_normal_form * weight_normal_form +
                      ratio_normal_form_soundex * weight_normal_form_soundex +
                      ratio_geslachtsnaam1 * weight_geslachtsnaam1 +
                      ratio_geslachtsnaam2 * weight_geslachtsnaam2 +
                      ratio_initials * weight_initials)
            noemer = (weight_normal_form + weight_normal_form_soundex +
                      weight_initials + weight_geslachtsnaam1 +
                      weight_geslachtsnaam2)
            final_ratio = old_div(teller, noemer)
        except ZeroDivisionError:
            return 0.0

        if not explain:
            return final_ratio

        def share(value):
            # a zero numerator would make the report itself raise
            # ZeroDivisionError; show 0.0 in that case
            return old_div(value, teller) if teller else 0.0

        row_format = '\t%-30s | %-10f | %-10f | %-10f | %-10f | %-10f\n'

        def row(label, r, w):
            # one report line: ratio, weight, weight / numerator, r*w, r*w / numerator
            return row_format % (label, r, w, share(w), r * w, share(r * w))

        header_format = '%-30s | %-10s | %-10s | %-10s | %-10s | %-10s\n'
        report = [
            '-' * 100 + '\n',
            'Naam1: %s [%s] [%s] %s\n' % (n1, initials1,
                                          n1.guess_normal_form(), se1),
            'Naam2: %s [%s] [%s] %s\n' % (n2, initials2,
                                          n2.guess_normal_form(), se2),
            'Similarity ratio: %s\n' % final_ratio,
            '--- REASONS' + '-' * 30 + '\n',
            header_format % ('\t  property', '  ratio', '  weight',
                             'relative_weight', '  r*w', 'r * relative_w'),
            '\t' + '-' * 100 + '\n',
            row(' normal_form', ratio_normal_form, weight_normal_form),
            row('soundex van normal_form', ratio_normal_form_soundex,
                weight_normal_form_soundex),
            row('soundex van geslachtsnaam1', ratio_geslachtsnaam1,
                weight_geslachtsnaam1),
            row('geslachtsnaam', ratio_geslachtsnaam2, weight_geslachtsnaam2),
            row('initials', ratio_initials, weight_initials),
            '\tTOTAL  (numerator)                                       | %s (denominator = %s)\n' % (
                teller, noemer),
        ]
        return ''.join(report)