Esempio n. 1
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<div class="searchResult[^"]*"', body):
        if 'intro' in m.group():
            continue

        sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<div id=[^<>]+>(.+?)</div>', sub)
        if m1 is None:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.city_e] = data['city']

        addr_list = [
            tmp.strip() for tmp in cm.reformat_addr(m1.group(1)).split(',')
        ]
        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]
        else:
            m1 = re.search(ur'Tel:([^<>]+)', sub)
            if m1 is not None:
                entry[cm.tel] = cm.extract_tel(m1.group(1))
        entry[cm.addr_e] = ', '.join(addr_list)

        m1 = re.search(ur"show_map\('(-?\d+\.\d+)'\s*,\s*'(-?\d+\.\d+)'", sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))

        start = sub.find(ur'Opening hours:')
        if start != -1:
            entry[cm.hours] = cm.extract_closure(sub[start:], ur'<p>',
                                                 ur'</p>')[0].strip()

        ret = None
        if entry[cm.lat] != '' and entry[cm.lng] != '':
            ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
        if ret is None:
            tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')]
            if 'Max Mara' in tmp[0]:
                del tmp[0]
            if len(tmp) > 0:
                ret = gs.geocode(', '.join(tmp))
        if ret is not None:
            city = ''
            province = ''
            country = ''
            zip_code = ''
            tmp = ret[0]['address_components']
            for v in tmp:
                if 'locality' in v['types']:
                    city = v['long_name'].strip().upper()
                elif 'administrative_area_level_1' in v['types']:
                    province = v['long_name'].strip().upper()
                elif 'country' in v['types']:
                    country = v['long_name'].strip().upper()
                elif 'postal_code' in v['types']:
                    zip_code = v['long_name'].strip()
            entry[cm.country_e] = country
            entry[cm.province_e] = province
            entry[cm.city_e] = city
            entry[cm.zip_code] = zip_code

        gs.field_sense(entry)
        cm.dump(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
            log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 2
0
def sense_cities(lower_bound='a', upper_bound='b'):
    """
    Normalize the city fields of the ``stores`` table.

    Every distinct (city_e, province_e, country_e) triple whose ``city_e``
    lies strictly between ``lower_bound`` and ``upper_bound`` and whose
    ``is_geocoded`` is below 4 or above 7 is geocoded: first from the
    clustered coordinates of its stores, then from textual addresses. The
    standardized result is cached in ``data/city_std.dat`` after every city.

    ``is_geocoded`` is updated with the outcome: 4 = resolved from
    coordinates, 5 = resolved from an address, 6 = not resolved, 7 = the
    triple was already present in the cache.

    :param lower_bound: exclusive lower bound for ``city_e``.
    :param upper_bound: exclusive upper bound for ``city_e``.
    """

    def get_unique_latlng(latlng_list, tol_lat=0.5, tol_lng=1):
        """
        From a set of lat/lng points, drop the points that lie too far away
        and return the coordinate the remaining points concentrate on.

        :param latlng_list: sequence of (lat, lng) pairs.
        :param tol_lat: tolerance for the latitude.
        :param tol_lng: tolerance for the longitude.
        :return: (lat, lng); either value is None when it cannot be found.
        """

        def get_avg(values):
            return float(sum(values)) / len(values) if len(values) > 0 else None

        def func(vals, tol):
            vals = list(vals)
            avg = None
            while True:
                avg = get_avg(vals)
                if not avg:
                    break
                dists = [abs(val - avg) for val in vals]
                # Farthest point from the mean; on ties the highest index
                # wins, which is what a stable sort followed by [-1] yields.
                far_idx = max(range(len(vals)), key=lambda i: (dists[i], i))
                if dists[far_idx] < tol:
                    break
                elif len(vals) == 2:
                    # With only two points that are far apart from each
                    # other this method cannot decide.
                    avg = None
                    break
                else:
                    del vals[far_idx]
            return avg

        lat = func((point[0] for point in latlng_list), tol_lat)
        lng = func((point[1] for point in latlng_list), tol_lng)
        return (lat, lng)

    def register_city(geocoded_info):
        """
        Pick a geocoding result consistent with the city being processed and
        record its standardized form in ``city_std``.

        Reads ``sig``, ``city_e``, ``country_e``, ``city_std`` and
        ``log_name`` from the enclosing scope.

        :param geocoded_info: iterable of geocoding results, each read
            through its ``administrative_info`` and ``formatted_address``.
        :return: True when a result was registered, False when no result
            carried both country and city information.
        """
        city_keys = ('locality', 'sublocality',
                     'administrative_area_level_3',
                     'administrative_area_level_2')

        candidate_geo = None
        candidate_info = None
        for geo_info in geocoded_info:
            admin_info = geo_info['administrative_info']
            if 'country' not in admin_info:
                common.dump(u'Country info does not exist: %s' % admin_info)
                continue

            # The first of these keys that is present is taken as the city.
            for key in city_keys:
                if key in admin_info:
                    city = admin_info[key]
                    break
            else:
                common.dump(u'City info does not exist: %s' % admin_info)
                continue

            tmp_geo = {
                'city_e': city,
                'country_e': admin_info['country'],
                'region_e': admin_info.get('administrative_area_level_1', ''),
                'formatted_address': geo_info['formatted_address'],
            }

            if not candidate_geo:
                # Keep the first usable result as a fallback in case no
                # result passes the consistency check below.
                candidate_geo = tmp_geo
                candidate_info = geo_info

            # Consistency check: the country or the city must agree.
            ret1 = gs.look_up(country_e, 1)
            ret2 = gs.look_up(admin_info['country'], 1)
            if (ret1['name_e'] if ret1 else country_e) != (
                    ret2['name_e'] if ret2 else admin_info['country']):
                common.dump(u'Countries does not match.', log_name)
                ret3 = gs.look_up(city_e, 1)
                ret4 = gs.look_up(city, 1)
                if (ret3['name_e'] if ret3 else city_e) != (
                        ret4['name_e'] if ret4 else city):
                    common.dump(u'Cities does not match.', log_name)
                    continue

            # Reaching this point means geo_info passed the checks above
            # and can be used.
            candidate_geo = tmp_geo
            candidate_info = geo_info
            break

        # candidate_geo is the geographic information to register.
        if not candidate_geo:
            return False

        # Register the standardized city information.
        std_info = candidate_geo

        # Fetch the Chinese names.
        std_info['country_c'] = ''
        std_info['region_c'] = ''
        std_info['city_c'] = ''
        geocoded_info_zh = gs.geocode(addr=candidate_geo['formatted_address'],
                                      lang='zh')
        if geocoded_info_zh:
            admin_info_zh = geocoded_info_zh[0]['administrative_info']
            if 'country' in admin_info_zh:
                std_info['country_c'] = admin_info_zh['country']
            for key in city_keys:
                if key in admin_info_zh:
                    std_info['city_c'] = admin_info_zh[key]
                    break
            if 'administrative_area_level_1' in admin_info_zh:
                std_info['region_c'] = admin_info_zh[
                    'administrative_area_level_1']

        std_sig = u'|'.join(
            (std_info['city_e'], std_info['region_e'], std_info['country_e']))
        # The raw signature points at the standardized one. When both are
        # equal the key holds the standardized record itself.
        if sig != std_sig:
            city_std[sig] = {'std_sig': std_sig}
        # Only create the standardized record once. The original tested the
        # literal 'std_sig' instead of the variable and therefore always
        # overwrote it. The stored geo_info is the result the candidate was
        # built from, not whatever the loop visited last.
        if 'std_info' not in city_std.get(std_sig, {}):
            city_std[std_sig] = {'std_info': std_info,
                                 'geo_info': candidate_info}
        common.dump(u'%s => %s' % (sig, std_sig))
        return True

    def mark_geocoded(status, fields):
        """Set ``is_geocoded`` to ``status`` for all stores of the triple."""
        args = [status]
        args.extend(field.replace("'", r"\'") for field in fields)
        db.execute(tpl_geocoded % tuple(args))

    city_std = {}
    log_name = u'sense_cities.log'

    # Standardized mapping information for the cities, stored as one line:
    # {'city|region|country':{'std_info':{'city':...,'region':...,'country':...}, 'geo_result': result}}
    cached = None
    try:
        with open('data/city_std.dat', 'r') as f:
            cached = f.readline()
    except IOError:
        pass
    if cached and cached.strip():
        city_std = json.loads(cached)
    else:
        # Missing, unreadable or empty cache: start from scratch instead of
        # failing with IndexError on an empty file.
        common.dump(u'Failed to load data/city_std.dat', log_name)

    db = common.StoresDb()
    # NOTE(review): connection settings and credentials are hard-coded here.
    db.connect_db(host='localhost', port=3306, user='******', passwd='123456', db='brand_stores')
    # NOTE(review): the statements below are assembled by string formatting.
    # Only single quotes in the city fields are escaped, and the bounds are
    # inserted as-is -- do not pass untrusted values.
    tpl_entity = "SELECT DISTINCT city_e, province_e, country_e FROM stores WHERE city_e>'%s' AND city_e<'%s' AND (is_geocoded<4 OR is_geocoded>7) ORDER BY city_e, province_e, country_e LIMIT 99999"
    # Alternative query that only selects the cities marked as failed:
    # tpl_entity = "SELECT DISTINCT city_e, province_e, country_e FROM stores WHERE city_e>'%s' AND city_e<'%s' AND is_geocoded=6 ORDER BY city_e, province_e, country_e LIMIT 99999"
    tpl_pos = "SELECT lat, lng, addr_e, idstores FROM stores WHERE city_e='%s' AND province_e='%s' AND country_e='%s' LIMIT 99999"
    tpl_geocoded = "UPDATE stores SET is_geocoded=%d WHERE city_e='%s' AND province_e='%s' AND country_e='%s'"

    common.dump(u"Processing cities from '%s' to '%s'..." % (lower_bound, upper_bound), log_name)
    for item in db.query_all(tpl_entity % (lower_bound, upper_bound)):
        try:
            sig = u'|'.join(item[i] for i in range(3))
            if sig in city_std:
                common.dump(u'Geo item %s already processed.' % sig, log_name)
                mark_geocoded(7, item)
                continue
            common.dump(u'Processing %s...' % sig, log_name)

            city_e, province_e, country_e = item
            geo_success = False
            query_result = db.query_all(
                tpl_pos % tuple(field.replace("'", r"\'") for field in item))

            # First attempt: geocode the clustered coordinates of the stores.
            # float() is what the deprecated string.atof() delegates to.
            latlng_list = [
                (float(lat), float(lng))
                for lat, lng, _addr, _idstores in query_result
                if lat and lng
            ]
            lat, lng = get_unique_latlng(latlng_list)
            if lat and lng:
                geo_result = gs.geocode(latlng='%f,%f' % (lat, lng))
                if geo_result:
                    geo_success = register_city(geo_result)

            if geo_success:
                # Resolved from the coordinates.
                mark_geocoded(4, item)
            else:
                # Second attempt: geocode textual addresses.
                # NOTE(review): the city-level query is repeated for every
                # store row, and an empty answer skips that row's own
                # address -- kept as in the original, confirm the intent.
                for _lat, _lng, addr, _idstores in query_result:
                    geo_result = gs.geocode(u'%s,%s,%s' % (city_e, province_e, country_e))
                    if not geo_result:
                        continue
                    geo_success = register_city(geo_result)
                    if geo_success:
                        break

                    geo_result = gs.geocode(addr)
                    if not geo_result:
                        continue
                    geo_success = register_city(geo_result)
                    if geo_success:
                        break
                # 5: resolved from an address, 6: could not be resolved.
                mark_geocoded(5 if geo_success else 6, item)

            # Checkpoint the cache after every city.
            with open(u'data/city_std.dat', 'w') as f:
                f.write(json.dumps(city_std).encode('utf-8'))
        except Exception:
            # One failing city must not stop the batch; log and go on.
            common.dump(traceback.format_exc(), log_name)

    common.dump(u'Done!', log_name)
Esempio n. 3
0
    def register_city(geocoded_info):
        """
        Pick a geocoding result consistent with the city being processed and
        record its standardized form in ``city_std``.

        Reads ``sig``, ``city_e``, ``country_e``, ``city_std`` and
        ``log_name`` from the enclosing scope.

        :param geocoded_info: iterable of geocoding results, each read
            through its ``administrative_info`` and ``formatted_address``.
        :return: True when a result was registered, False when no result
            carried both country and city information.
        """
        city_keys = ('locality', 'sublocality',
                     'administrative_area_level_3',
                     'administrative_area_level_2')

        candidate_geo = None
        candidate_info = None
        for geo_info in geocoded_info:
            admin_info = geo_info['administrative_info']
            if 'country' not in admin_info:
                common.dump(u'Country info does not exist: %s' % admin_info)
                continue

            # The first of these keys that is present is taken as the city.
            for key in city_keys:
                if key in admin_info:
                    city = admin_info[key]
                    break
            else:
                common.dump(u'City info does not exist: %s' % admin_info)
                continue

            tmp_geo = {
                'city_e': city,
                'country_e': admin_info['country'],
                'region_e': admin_info.get('administrative_area_level_1', ''),
                'formatted_address': geo_info['formatted_address'],
            }

            if not candidate_geo:
                # Keep the first usable result as a fallback in case no
                # result passes the consistency check below.
                candidate_geo = tmp_geo
                candidate_info = geo_info

            # Consistency check: the country or the city must agree.
            ret1 = gs.look_up(country_e, 1)
            ret2 = gs.look_up(admin_info['country'], 1)
            if (ret1['name_e'] if ret1 else country_e) != (
                    ret2['name_e'] if ret2 else admin_info['country']):
                common.dump(u'Countries does not match.', log_name)
                ret3 = gs.look_up(city_e, 1)
                ret4 = gs.look_up(city, 1)
                if (ret3['name_e'] if ret3 else city_e) != (
                        ret4['name_e'] if ret4 else city):
                    common.dump(u'Cities does not match.', log_name)
                    continue

            # Reaching this point means geo_info passed the checks above
            # and can be used.
            candidate_geo = tmp_geo
            candidate_info = geo_info
            break

        # candidate_geo is the geographic information to register.
        if not candidate_geo:
            return False

        # Register the standardized city information.
        std_info = candidate_geo

        # Fetch the Chinese names.
        std_info['country_c'] = ''
        std_info['region_c'] = ''
        std_info['city_c'] = ''
        geocoded_info_zh = gs.geocode(addr=candidate_geo['formatted_address'], lang='zh')
        if geocoded_info_zh:
            admin_info_zh = geocoded_info_zh[0]['administrative_info']
            if 'country' in admin_info_zh:
                std_info['country_c'] = admin_info_zh['country']
            for key in city_keys:
                if key in admin_info_zh:
                    std_info['city_c'] = admin_info_zh[key]
                    break
            if 'administrative_area_level_1' in admin_info_zh:
                std_info['region_c'] = admin_info_zh['administrative_area_level_1']

        std_sig = u'|'.join((std_info['city_e'], std_info['region_e'], std_info['country_e']))
        # The raw signature points at the standardized one. When both are
        # equal the key holds the standardized record itself.
        if sig != std_sig:
            city_std[sig] = {'std_sig': std_sig}
        # Only create the standardized record once. The original tested the
        # literal 'std_sig' instead of the variable and therefore always
        # overwrote it. The stored geo_info is the result the candidate was
        # built from, not whatever the loop visited last.
        if 'std_info' not in city_std.get(std_sig, {}):
            city_std[std_sig] = {'std_info': std_info, 'geo_info': candidate_info}
        common.dump(u'%s => %s' % (sig, std_sig))
        return True
Esempio n. 4
0
            entry[cm.city_e], entry[cm.city_c] = city_e, city_c
            entry[cm.name_e]=name
            entry[cm.addr_e]=name

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e])
            if ret[0] is not None and entry[cm.country_e] == '':
                entry[cm.country_e] = ret[0]
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            if entry[cm.country_e] == '' or entry[cm.city_e] == '':
                ret = gs.geocode(', '.join((entry[cm.name_e], entry[cm.city_c], entry[cm.country_c])))
                if not ret:
                    ret = gs.geocode(', '.join((entry[cm.city_c], entry[cm.country_c])))
                if ret:
                    city = ''
                    province = ''
                    country = ''
                    zip_code = ''
                    tmp = ret[0]['address_components']
                    for v in tmp:
                        if 'locality' in v['types']:
                            city = v['long_name'].strip().upper()
                        elif 'administrative_area_level_1' in v['types']:
                            province = v['long_name'].strip().upper()
                        elif 'country' in v['types']:
                            country = v['long_name'].strip().upper()
Esempio n. 5
0
        type_list = []
        for item in pq(body)('#map-panel ul li'):
            if item.text:
                val = cm.html2plain(item.text).strip()
                if val != '':
                    type_list.append(val)
        entry[cm.store_type] = ', '.join(type_list)

        tmp = pq(body)('#map-panel iframe[src!=""]')
        if len(tmp) > 0:
            # map_url = tmp[0].attrib['src']
            m = re.search(ur'daddr=([^&]+)', tmp[0].attrib['src'])
            if m:
                map_url = 'http://maps.googleapis.com/maps/api/geocode/json?address=%s&sensor=false' % m.group(1)
                ret = gs.geocode(url=map_url)
                if ret:
                    city = ''
                    province = ''
                    country = ''
                    zip_code = ''
                    tmp = ret[0]['address_components']
                    for v in tmp:
                        if 'locality' in v['types']:
                            city = v['long_name'].strip().upper()
                        elif 'administrative_area_level_1' in v['types']:
                            province = v['long_name'].strip().upper()
                        elif 'country' in v['types']:
                            country = v['long_name'].strip().upper()
                        elif 'postal_code' in v['types']:
                            zip_code = v['long_name'].strip()
Esempio n. 6
0
                    entry[cm.lng] = string.atof(m2.group(2))

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        if entry[cm.city_e] == '' or entry[cm.country_e] == '':
            ret = None
            if entry[cm.lat] != '' and entry[cm.lng] != '':
                ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
            if ret is None:
                ret = gs.geocode(', '.join((entry[cm.addr_e], data['zone'])))
            if ret is not None:
                city = ''
                province = ''
                country = ''
                zip_code = ''
                tmp = ret[0]['address_components']
                for v in tmp:
                    if 'locality' in v['types']:
                        city = v['long_name'].strip().upper()
                    elif 'administrative_area_level_1' in v['types']:
                        province = v['long_name'].strip().upper()
                    elif 'country' in v['types']:
                        country = v['long_name'].strip().upper()
Esempio n. 7
0
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        if (entry[cm.country_e] == '' or entry[cm.city_e] == ''):
            ret = None
            location_valid = True
            if entry[cm.lat] != '' and entry[cm.lng] != '':
                ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
            if ret is None:
                location_valid = False
                ret = gs.geocode('%s, %s, %s' % (entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e]))

            if ret is not None:
                city = ''
                province = ''
                country = ''
                zip_code = ''
                tmp = ret[0]['address_components']
                for v in tmp:
                    if 'locality' in v['types']:
                        city = v['long_name'].strip().upper()
                    elif 'administrative_area_level_1' in v['types']:
                        province = v['long_name'].strip().upper()
Esempio n. 8
0
def sense_cities(lower_bound='a', upper_bound='b'):
    """
    Normalize the city fields of the ``stores`` table.

    Every distinct (city_e, province_e, country_e) triple whose ``city_e``
    lies strictly between ``lower_bound`` and ``upper_bound`` and whose
    ``is_geocoded`` is below 4 or above 7 is geocoded: first from the
    clustered coordinates of its stores, then from textual addresses. The
    standardized result is cached in ``data/city_std.dat`` after every city.

    ``is_geocoded`` is updated with the outcome: 4 = resolved from
    coordinates, 5 = resolved from an address, 6 = not resolved, 7 = the
    triple was already present in the cache.

    :param lower_bound: exclusive lower bound for ``city_e``.
    :param upper_bound: exclusive upper bound for ``city_e``.
    """
    def get_unique_latlng(latlng_list, tol_lat=0.5, tol_lng=1):
        """
        From a set of lat/lng points, drop the points that lie too far away
        and return the coordinate the remaining points concentrate on.

        :param latlng_list: sequence of (lat, lng) pairs.
        :param tol_lat: tolerance for the latitude.
        :param tol_lng: tolerance for the longitude.
        :return: (lat, lng); either value is None when it cannot be found.
        """
        def get_avg(values):
            return float(sum(values)) / len(values) if len(values) > 0 else None

        def func(vals, tol):
            vals = list(vals)
            avg = None
            while True:
                avg = get_avg(vals)
                if not avg:
                    break
                dists = [abs(val - avg) for val in vals]
                # Farthest point from the mean; on ties the highest index
                # wins, which is what a stable sort followed by [-1] yields.
                far_idx = max(range(len(vals)), key=lambda i: (dists[i], i))
                if dists[far_idx] < tol:
                    break
                elif len(vals) == 2:
                    # With only two points that are far apart from each
                    # other this method cannot decide.
                    avg = None
                    break
                else:
                    del vals[far_idx]
            return avg

        lat = func((point[0] for point in latlng_list), tol_lat)
        lng = func((point[1] for point in latlng_list), tol_lng)
        return (lat, lng)

    def register_city(geocoded_info):
        """
        Pick a geocoding result consistent with the city being processed and
        record its standardized form in ``city_std``.

        Reads ``sig``, ``city_e``, ``country_e``, ``city_std`` and
        ``log_name`` from the enclosing scope.

        :param geocoded_info: iterable of geocoding results, each read
            through its ``administrative_info`` and ``formatted_address``.
        :return: True when a result was registered, False when no result
            carried both country and city information.
        """
        city_keys = ('locality', 'sublocality',
                     'administrative_area_level_3',
                     'administrative_area_level_2')

        candidate_geo = None
        candidate_info = None
        for geo_info in geocoded_info:
            admin_info = geo_info['administrative_info']
            if 'country' not in admin_info:
                common.dump(u'Country info does not exist: %s' % admin_info)
                continue

            # The first of these keys that is present is taken as the city.
            for key in city_keys:
                if key in admin_info:
                    city = admin_info[key]
                    break
            else:
                common.dump(u'City info does not exist: %s' % admin_info)
                continue

            tmp_geo = {
                'city_e': city,
                'country_e': admin_info['country'],
                'region_e': admin_info.get('administrative_area_level_1', ''),
                'formatted_address': geo_info['formatted_address'],
            }

            if not candidate_geo:
                # Keep the first usable result as a fallback in case no
                # result passes the consistency check below.
                candidate_geo = tmp_geo
                candidate_info = geo_info

            # Consistency check: the country or the city must agree.
            ret1 = gs.look_up(country_e, 1)
            ret2 = gs.look_up(admin_info['country'], 1)
            if (ret1['name_e'] if ret1 else country_e) != (
                    ret2['name_e'] if ret2 else admin_info['country']):
                common.dump(u'Countries does not match.', log_name)
                ret3 = gs.look_up(city_e, 1)
                ret4 = gs.look_up(city, 1)
                if (ret3['name_e'] if ret3 else city_e) != (
                        ret4['name_e'] if ret4 else city):
                    common.dump(u'Cities does not match.', log_name)
                    continue

            # Reaching this point means geo_info passed the checks above
            # and can be used.
            candidate_geo = tmp_geo
            candidate_info = geo_info
            break

        # candidate_geo is the geographic information to register.
        if not candidate_geo:
            return False

        # Register the standardized city information.
        std_info = candidate_geo

        # Fetch the Chinese names.
        std_info['country_c'] = ''
        std_info['region_c'] = ''
        std_info['city_c'] = ''
        geocoded_info_zh = gs.geocode(addr=candidate_geo['formatted_address'],
                                      lang='zh')
        if geocoded_info_zh:
            admin_info_zh = geocoded_info_zh[0]['administrative_info']
            if 'country' in admin_info_zh:
                std_info['country_c'] = admin_info_zh['country']
            for key in city_keys:
                if key in admin_info_zh:
                    std_info['city_c'] = admin_info_zh[key]
                    break
            if 'administrative_area_level_1' in admin_info_zh:
                std_info['region_c'] = admin_info_zh[
                    'administrative_area_level_1']

        std_sig = u'|'.join(
            (std_info['city_e'], std_info['region_e'], std_info['country_e']))
        # The raw signature points at the standardized one. When both are
        # equal the key holds the standardized record itself.
        if sig != std_sig:
            city_std[sig] = {'std_sig': std_sig}
        # Only create the standardized record once. The original tested the
        # literal 'std_sig' instead of the variable and therefore always
        # overwrote it. The stored geo_info is the result the candidate was
        # built from, not whatever the loop visited last.
        if 'std_info' not in city_std.get(std_sig, {}):
            city_std[std_sig] = {'std_info': std_info,
                                 'geo_info': candidate_info}
        common.dump(u'%s => %s' % (sig, std_sig))
        return True

    def mark_geocoded(status, fields):
        """Set ``is_geocoded`` to ``status`` for all stores of the triple."""
        args = [status]
        args.extend(field.replace("'", r"\'") for field in fields)
        db.execute(tpl_geocoded % tuple(args))

    city_std = {}
    log_name = u'sense_cities.log'

    # Standardized mapping information for the cities, stored as one line:
    # {'city|region|country':{'std_info':{'city':...,'region':...,'country':...}, 'geo_result': result}}
    cached = None
    try:
        with open('data/city_std.dat', 'r') as f:
            cached = f.readline()
    except IOError:
        pass
    if cached and cached.strip():
        city_std = json.loads(cached)
    else:
        # Missing, unreadable or empty cache: start from scratch instead of
        # failing with IndexError on an empty file.
        common.dump(u'Failed to load data/city_std.dat', log_name)

    db = common.StoresDb()
    # NOTE(review): connection settings and credentials are hard-coded here.
    db.connect_db(host='localhost',
                  port=3306,
                  user='******',
                  passwd='123456',
                  db='brand_stores')
    # NOTE(review): the statements below are assembled by string formatting.
    # Only single quotes in the city fields are escaped, and the bounds are
    # inserted as-is -- do not pass untrusted values.
    tpl_entity = "SELECT DISTINCT city_e, province_e, country_e FROM stores WHERE city_e>'%s' AND city_e<'%s' AND (is_geocoded<4 OR is_geocoded>7) ORDER BY city_e, province_e, country_e LIMIT 99999"
    # Alternative query that only selects the cities marked as failed:
    # tpl_entity = "SELECT DISTINCT city_e, province_e, country_e FROM stores WHERE city_e>'%s' AND city_e<'%s' AND is_geocoded=6 ORDER BY city_e, province_e, country_e LIMIT 99999"
    tpl_pos = "SELECT lat, lng, addr_e, idstores FROM stores WHERE city_e='%s' AND province_e='%s' AND country_e='%s' LIMIT 99999"
    tpl_geocoded = "UPDATE stores SET is_geocoded=%d WHERE city_e='%s' AND province_e='%s' AND country_e='%s'"

    common.dump(
        u"Processing cities from '%s' to '%s'..." % (lower_bound, upper_bound),
        log_name)
    for item in db.query_all(tpl_entity % (lower_bound, upper_bound)):
        try:
            sig = u'|'.join(item[i] for i in range(3))
            if sig in city_std:
                common.dump(u'Geo item %s already processed.' % sig, log_name)
                mark_geocoded(7, item)
                continue
            common.dump(u'Processing %s...' % sig, log_name)

            city_e, province_e, country_e = item
            geo_success = False
            query_result = db.query_all(
                tpl_pos % tuple(field.replace("'", r"\'") for field in item))

            # First attempt: geocode the clustered coordinates of the stores.
            # float() is what the deprecated string.atof() delegates to.
            latlng_list = [
                (float(lat), float(lng))
                for lat, lng, _addr, _idstores in query_result
                if lat and lng
            ]
            lat, lng = get_unique_latlng(latlng_list)
            if lat and lng:
                geo_result = gs.geocode(latlng='%f,%f' % (lat, lng))
                if geo_result:
                    geo_success = register_city(geo_result)

            if geo_success:
                # Resolved from the coordinates.
                mark_geocoded(4, item)
            else:
                # Second attempt: geocode textual addresses.
                # NOTE(review): the city-level query is repeated for every
                # store row, and an empty answer skips that row's own
                # address -- kept as in the original, confirm the intent.
                for _lat, _lng, addr, _idstores in query_result:
                    geo_result = gs.geocode(u'%s,%s,%s' %
                                            (city_e, province_e, country_e))
                    if not geo_result:
                        continue
                    geo_success = register_city(geo_result)
                    if geo_success:
                        break

                    geo_result = gs.geocode(addr)
                    if not geo_result:
                        continue
                    geo_success = register_city(geo_result)
                    if geo_success:
                        break
                # 5: resolved from an address, 6: could not be resolved.
                mark_geocoded(5 if geo_success else 6, item)

            # Checkpoint the cache after every city.
            with open(u'data/city_std.dat', 'w') as f:
                f.write(json.dumps(city_std).encode('utf-8'))
        except Exception:
            # One failing city must not stop the batch; log and go on.
            common.dump(traceback.format_exc(), log_name)

    common.dump(u'Done!', log_name)
Esempio n. 9
0
    def register_city(geocoded_info):
        """
        Pick a geocoding result consistent with the city being processed and
        record its standardized form in ``city_std``.

        Reads ``sig``, ``city_e``, ``country_e``, ``city_std`` and
        ``log_name`` from the enclosing scope.

        :param geocoded_info: iterable of geocoding results, each read
            through its ``administrative_info`` and ``formatted_address``.
        :return: True when a result was registered, False when no result
            carried both country and city information.
        """
        city_keys = ('locality', 'sublocality',
                     'administrative_area_level_3',
                     'administrative_area_level_2')

        candidate_geo = None
        candidate_info = None
        for geo_info in geocoded_info:
            admin_info = geo_info['administrative_info']
            if 'country' not in admin_info:
                common.dump(u'Country info does not exist: %s' % admin_info)
                continue

            # The first of these keys that is present is taken as the city.
            for key in city_keys:
                if key in admin_info:
                    city = admin_info[key]
                    break
            else:
                common.dump(u'City info does not exist: %s' % admin_info)
                continue

            tmp_geo = {
                'city_e': city,
                'country_e': admin_info['country'],
                'region_e': admin_info.get('administrative_area_level_1', ''),
                'formatted_address': geo_info['formatted_address'],
            }

            if not candidate_geo:
                # Keep the first usable result as a fallback in case no
                # result passes the consistency check below.
                candidate_geo = tmp_geo
                candidate_info = geo_info

            # Consistency check: the country or the city must agree.
            ret1 = gs.look_up(country_e, 1)
            ret2 = gs.look_up(admin_info['country'], 1)
            if (ret1['name_e'] if ret1 else country_e) != (
                    ret2['name_e'] if ret2 else admin_info['country']):
                common.dump(u'Countries does not match.', log_name)
                ret3 = gs.look_up(city_e, 1)
                ret4 = gs.look_up(city, 1)
                if (ret3['name_e'] if ret3 else city_e) != (
                        ret4['name_e'] if ret4 else city):
                    common.dump(u'Cities does not match.', log_name)
                    continue

            # Reaching this point means geo_info passed the checks above
            # and can be used.
            candidate_geo = tmp_geo
            candidate_info = geo_info
            break

        # candidate_geo is the geographic information to register.
        if not candidate_geo:
            return False

        # Register the standardized city information.
        std_info = candidate_geo

        # Fetch the Chinese names.
        std_info['country_c'] = ''
        std_info['region_c'] = ''
        std_info['city_c'] = ''
        geocoded_info_zh = gs.geocode(addr=candidate_geo['formatted_address'],
                                      lang='zh')
        if geocoded_info_zh:
            admin_info_zh = geocoded_info_zh[0]['administrative_info']
            if 'country' in admin_info_zh:
                std_info['country_c'] = admin_info_zh['country']
            for key in city_keys:
                if key in admin_info_zh:
                    std_info['city_c'] = admin_info_zh[key]
                    break
            if 'administrative_area_level_1' in admin_info_zh:
                std_info['region_c'] = admin_info_zh[
                    'administrative_area_level_1']

        std_sig = u'|'.join(
            (std_info['city_e'], std_info['region_e'], std_info['country_e']))
        # The raw signature points at the standardized one. When both are
        # equal the key holds the standardized record itself.
        if sig != std_sig:
            city_std[sig] = {'std_sig': std_sig}
        # Only create the standardized record once. The original tested the
        # literal 'std_sig' instead of the variable and therefore always
        # overwrote it. The stored geo_info is the result the candidate was
        # built from, not whatever the loop visited last.
        if 'std_info' not in city_std.get(std_sig, {}):
            city_std[std_sig] = {'std_info': std_info,
                                 'geo_info': candidate_info}
        common.dump(u'%s => %s' % (sig, std_sig))
        return True
Esempio n. 10
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<div\s+class\s*=\s*"storeItem"', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

        sub = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<div class="bubbleInfo">(.+?)</div>', sub)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'lat="(-?\d+\.\d+)"', sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
        m1 = re.search(ur'lng="(-?\d+\.\d+)"', sub)
        if m1 is not None:
            entry[cm.lng] = string.atof(m1.group(1))
        m1 = re.search(ur'<span>\s*Tel:\s*([^<>]+)</span>', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).strip()
        m1 = re.search(ur'http://maps\.google\.com/maps\?q=([^&"]+)', sub)
        if m1 is None:
            continue
        ret = gs.geocode(latlng=m1.group(1))
        if ret is None:
            tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')]
            if 'MAX' in tmp[0]:
                del tmp[0]
            if cm.extract_tel(tmp[-1])!='':
                del tmp[-1]
            if len(tmp) > 0:
                ret = gs.geocode(', '.join(tmp))
        if ret is not None:
            city = ''
            province = ''
            country = ''
            zip_code = ''
            tmp = ret[0]['address_components']
            for v in tmp:
                if 'locality' in v['types']:
                    city = v['long_name'].strip().upper()
                elif 'administrative_area_level_1' in v['types']:
                    province = v['long_name'].strip().upper()
                elif 'country' in v['types']:
                    country = v['long_name'].strip().upper()
                elif 'postal_code' in v['types']:
                    zip_code = v['long_name'].strip()
            entry[cm.country_e] = country
            entry[cm.province_e] = province
            entry[cm.city_e] = city
            entry[cm.zip_code] = zip_code
            gs.field_sense(entry)
            cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)
        else:
            cm.dump('Error in fetching stores: latlng=%s, addr=%s' % (m1.group(1), entry[cm.addr_e]), log_name)
            continue
Esempio n. 11
0
        entry = fetch_contact_info(data, entry, s['id'])

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        if entry[cm.country_e] == '' or entry[cm.city_e] == '':
            ret = None
            if entry[cm.lat] != '' and entry[cm.lng] != '':
                ret = gs.geocode(latlng='%f,%f' %
                                 (entry[cm.lat], entry[cm.lng]))
            if ret is None:
                ret = gs.geocode(', '.join((entry[cm.addr_e], s['city'])))

            if ret is not None:
                city = ''
                province = ''
                country = ''
                zip_code = ''
                tmp = ret[0]['address_components']
                for v in tmp:
                    if 'locality' in v['types']:
                        city = v['long_name'].strip().upper()
                    elif 'administrative_area_level_1' in v['types']:
                        province = v['long_name'].strip().upper()
                    elif 'country' in v['types']:
Esempio n. 12
0
        entry = fetch_contact_info(data, entry, s["id"])

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == "":
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == "":
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == "":
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        if entry[cm.country_e] == "" or entry[cm.city_e] == "":
            ret = None
            if entry[cm.lat] != "" and entry[cm.lng] != "":
                ret = gs.geocode(latlng="%f,%f" % (entry[cm.lat], entry[cm.lng]))
            if ret is None:
                ret = gs.geocode(", ".join((entry[cm.addr_e], s["city"])))

            if ret is not None:
                city = ""
                province = ""
                country = ""
                zip_code = ""
                tmp = ret[0]["address_components"]
                for v in tmp:
                    if "locality" in v["types"]:
                        city = v["long_name"].strip().upper()
                    elif "administrative_area_level_1" in v["types"]:
                        province = v["long_name"].strip().upper()
                    elif "country" in v["types"]:
Esempio n. 13
0
    sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
    m = re.search(ur'<div class="box-adress-store">(.+?)</div>', sub, re.S)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    entry[cm.addr_e] = cm.reformat_addr(m.group(1))
    m = re.search(ur'<h4>(.+?)</h4>', sub)
    if m is not None and 't:' in m.group(1).lower():
        entry[cm.tel] = cm.extract_tel(m.group(1))
    m = re.search(ur'<div class="box-open-store">(.+?)</div>', body, re.S)
    if m is not None:
        entry[cm.hours] = cm.reformat_addr(m.group(1))

    ret = None
    if entry[cm.lat] != '' and entry[cm.lng] != '':
        ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
    if ret is None:
        ret = gs.geocode(', '.join((entry[cm.addr_e], data['zone'])))
    if ret is not None:
        city = ''
        province = ''
        country = ''
        zip_code = ''
        tmp = ret[0]['address_components']
        for v in tmp:
            if 'locality' in v['types']:
                city = v['long_name'].strip().upper()
            elif 'administrative_area_level_1' in v['types']:
                province = v['long_name'].strip().upper()
            elif 'country' in v['types']:
                country = v['long_name'].strip().upper()
Esempio n. 14
0
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == "":
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == "":
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == "":
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        if entry[cm.country_e] == "" or entry[cm.city_e] == "":
            ret = None
            location_valid = True
            if entry[cm.lat] != "" and entry[cm.lng] != "":
                ret = gs.geocode(latlng="%f,%f" % (entry[cm.lat], entry[cm.lng]))
            if ret is None:
                location_valid = False
                ret = gs.geocode("%s, %s, %s" % (entry[cm.addr_e], entry[cm.city_e], entry[cm.country_e]))

            if ret is not None:
                city = ""
                province = ""
                country = ""
                zip_code = ""
                tmp = ret[0]["address_components"]
                for v in tmp:
                    if "locality" in v["types"]:
                        city = v["long_name"].strip().upper()
                    elif "administrative_area_level_1" in v["types"]:
                        province = v["long_name"].strip().upper()
Esempio n. 15
0
            entry[cm.lng] = string.atof(m1.group(2))

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        if entry[cm.country_e] == '' or entry[cm.city_e] == '':
            ret = None
            if entry[cm.lat] != '' and entry[cm.lng] != '':
                ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
            if ret is None:
                ret = gs.geocode(entry[cm.addr_e])

            if ret is not None:
                city = ''
                province = ''
                country = ''
                zip_code = ''
                tmp = ret[0]['address_components']
                for v in tmp:
                    if 'locality' in v['types']:
                        city = v['long_name'].strip().upper()
                    elif 'administrative_area_level_1' in v['types']:
                        province = v['long_name'].strip().upper()
                    elif 'country' in v['types']:
Esempio n. 16
0
            entry[cm.city_e], entry[cm.city_c] = city_e, city_c
            entry[cm.name_e] = name
            entry[cm.addr_e] = name

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e])
            if ret[0] is not None and entry[cm.country_e] == '':
                entry[cm.country_e] = ret[0]
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            if entry[cm.country_e] == '' or entry[cm.city_e] == '':
                ret = gs.geocode(', '.join(
                    (entry[cm.name_e], entry[cm.city_c], entry[cm.country_c])))
                if not ret:
                    ret = gs.geocode(', '.join(
                        (entry[cm.city_c], entry[cm.country_c])))
                if ret:
                    city = ''
                    province = ''
                    country = ''
                    zip_code = ''
                    tmp = ret[0]['address_components']
                    for v in tmp:
                        if 'locality' in v['types']:
                            city = v['long_name'].strip().upper()
                        elif 'administrative_area_level_1' in v['types']:
                            province = v['long_name'].strip().upper()
                        elif 'country' in v['types']:
Esempio n. 17
0
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        if entry[cm.country_e] == '' or entry[cm.city_e] == '':
            ret = None
            location_valid = True
            if entry[cm.lat] != '' and entry[cm.lng] != '':
                ret = gs.geocode(latlng='%f,%f' %
                                 (entry[cm.lat], entry[cm.lng]))
            if ret is None:
                location_valid = False
                ret = gs.geocode(', '.join(
                    (entry[cm.addr_e], entry[cm.country_e])))

            if ret is not None:
                city = ''
                province = ''
                country = ''
                zip_code = ''
                tmp = ret[0]['address_components']
                for v in tmp:
                    if 'locality' in v['types']:
                        city = v['long_name'].strip().upper()
                    elif 'administrative_area_level_1' in v['types']:
Esempio n. 18
0
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        if entry[cm.country_e] == '' or entry[cm.city_e] == '':
            ret = None
            location_valid = True
            if entry[cm.lat] != '' and entry[cm.lng] != '':
                ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
            if ret is None:
                location_valid = False
                ret = gs.geocode(', '.join((entry[cm.addr_e], entry[cm.country_e])))

            if ret is not None:
                city = ''
                province = ''
                country = ''
                zip_code = ''
                tmp = ret[0]['address_components']
                for v in tmp:
                    if 'locality' in v['types']:
                        city = v['long_name'].strip().upper()
                    elif 'administrative_area_level_1' in v['types']:
                        province = v['long_name'].strip().upper()
Esempio n. 19
0
            entry[cm.lng] = string.atof(m1.group(2))

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        if entry[cm.country_e] == '' or entry[cm.city_e] == '':
            ret = None
            if entry[cm.lat] != '' and entry[cm.lng] != '':
                ret = gs.geocode(latlng='%f,%f' %
                                 (entry[cm.lat], entry[cm.lng]))
            if ret is None:
                ret = gs.geocode(entry[cm.addr_e])

            if ret is not None:
                city = ''
                province = ''
                country = ''
                zip_code = ''
                tmp = ret[0]['address_components']
                for v in tmp:
                    if 'locality' in v['types']:
                        city = v['long_name'].strip().upper()
                    elif 'administrative_area_level_1' in v['types']:
                        province = v['long_name'].strip().upper()
                    elif 'country' in v['types']:
Esempio n. 20
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    m = re.search(ur'var\s+geoShops\s*=', body)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    tmp = cm.extract_closure(body[m.end():], ur'\[', ur'\]')[0]
    raw = json.loads(re.sub(ur'(?<!")(city|address|lat|lng)(?!")', ur'"\1"', tmp))

    store_list = []
    for s in raw:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.city_e] = s['city'].strip().upper()
        if s['lat'] is not None and s['lat'] != '':
            entry[cm.lat] = string.atof(s['lat'])
        if s['lng'] is not None and s['lng'] != '':
            entry[cm.lng] = string.atof(s['lng'])

        addr = cm.reformat_addr(s['address'])
        pat = re.compile(ur'ph[\.:](.*)$', re.I)
        m = re.search(pat, addr)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        entry[cm.addr_e] = re.sub(pat, '', addr).strip()

        addr1 = re.sub(ur'[\u2e80-\u9fff]+', '', '%s, %s' % (addr, s['city'])).strip()
        ret = gs.geocode(addr1, '%f,%f' % (entry[cm.lat], entry[cm.lng]))
        if ret is None:
            ret = gs.geocode(addr1)
        if ret is None:
            ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))

        if ret is not None:
            city = ''
            province = ''
            country = ''
            zip_code = ''
            tmp = ret[0]['address_components']
            for v in tmp:
                if 'locality' in v['types']:
                    city = v['long_name'].strip().upper()
                elif 'administrative_area_level_1' in v['types']:
                    province = v['long_name'].strip().upper()
                elif 'country' in v['types']:
                    country = v['long_name'].strip().upper()
                elif 'postal_code' in v['types']:
                    zip_code = v['long_name'].strip()
            entry[cm.country_e] = country
            entry[cm.province_e] = province
            entry[cm.city_e] = city
            entry[cm.zip_code] = zip_code
        else:
            ret = gs.addr_sense(addr1)
            if ret[0] is not None:
                entry[cm.country_e] = ret[0]
            if ret[1] is not None:
                entry[cm.province_e] = ret[1]
            if ret[2] is not None:
                entry[cm.city_e] = ret[2]

        gs.field_sense(entry)

        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return store_list
Esempio n. 21
0
            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e])
            if ret[0] is not None and entry[cm.country_e] == '':
                entry[cm.country_e] = ret[0]
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            if entry[cm.country_e] == '' or entry[cm.city_e] == '':
                ret = None
                location_valid = True
                if entry[cm.lat] != '' and entry[cm.lng] != '':
                    ret = gs.geocode(latlng='%f,%f' %
                                     (entry[cm.lat], entry[cm.lng]))
                if ret is None:
                    location_valid = False
                    ret = gs.geocode('%s, %s, %s' %
                                     (entry[cm.addr_e], entry[cm.city_e],
                                      entry[cm.country_e]))

                if ret is not None:
                    city = ''
                    province = ''
                    country = ''
                    zip_code = ''
                    tmp = ret[0]['address_components']
                    for v in tmp:
                        if 'locality' in v['types']:
                            city = v['long_name'].strip().upper()
Esempio n. 22
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<div class="searchResult[^"]*"', body):
        if 'intro' in m.group():
            continue

        sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<div id=[^<>]+>(.+?)</div>', sub)
        if m1 is None:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.city_e] = data['city']

        addr_list = [tmp.strip() for tmp in cm.reformat_addr(m1.group(1)).split(',')]
        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]
        else:
            m1 = re.search(ur'Tel:([^<>]+)', sub)
            if m1 is not None:
                entry[cm.tel] = cm.extract_tel(m1.group(1))
        entry[cm.addr_e] = ', '.join(addr_list)

        m1 = re.search(ur"show_map\('(-?\d+\.\d+)'\s*,\s*'(-?\d+\.\d+)'", sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))

        start = sub.find(ur'Opening hours:')
        if start != -1:
            entry[cm.hours] = cm.extract_closure(sub[start:], ur'<p>', ur'</p>')[0].strip()

        ret = None
        if entry[cm.lat]!='' and entry[cm.lng]!='':
            ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
        if ret is None:
            tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')]
            if 'Max Mara' in tmp[0]:
                del tmp[0]
            if len(tmp) > 0:
                ret = gs.geocode(', '.join(tmp))
        if ret is not None:
            city = ''
            province = ''
            country = ''
            zip_code = ''
            tmp = ret[0]['address_components']
            for v in tmp:
                if 'locality' in v['types']:
                    city = v['long_name'].strip().upper()
                elif 'administrative_area_level_1' in v['types']:
                    province = v['long_name'].strip().upper()
                elif 'country' in v['types']:
                    country = v['long_name'].strip().upper()
                elif 'postal_code' in v['types']:
                    zip_code = v['long_name'].strip()
            entry[cm.country_e] = country
            entry[cm.province_e] = province
            entry[cm.city_e] = city
            entry[cm.zip_code] = zip_code

        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)