Example #1
0
def fetch_store_detail(s, data, isOfficial=False):
    """Build a store entry from one raw store record.

    :param s: raw store record; read for 'name', 'city', 'address', 'email',
        'phone', 'fax', 'lat' and 'lng'.
    :param data: crawl context; supplies the brand fields, 'country' and a
        fallback 'city'.
    :param isOfficial: when true the store class is 'Official Retailer',
        otherwise 'Retailer'.
    :return: the populated entry.
    """
    record = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

    record[cm.name_e] = cm.html2plain(s['name']).strip()
    record[cm.country_e] = data['country']

    # Prefer the record's own city; fall back to the context city when blank.
    city_text = cm.html2plain(s['city']).strip().upper()
    city_source = city_text if city_text else data['city']
    record[cm.city_e] = cm.extract_city(city_source)[0]

    record[cm.addr_e] = cm.html2plain(s['address']).strip()
    record[cm.email] = s['email'].strip()
    record[cm.tel] = s['phone'].strip()
    record[cm.fax] = s['fax'].strip()
    if isOfficial:
        record[cm.store_class] = 'Official Retailer'
    else:
        record[cm.store_class] = 'Retailer'

    # Coordinates are best-effort: a missing or malformed value is logged and
    # the corresponding field is left untouched.
    try:
        if s['lat'] != '':
            record[cm.lat] = string.atof(s['lat'])
        else:
            record[cm.lat] = ''
    except (ValueError, KeyError, TypeError) as err:
        cm.dump('Error in fetching lat: %s' % str(err), log_name)
    try:
        if s['lng'] != '':
            record[cm.lng] = string.atof(s['lng'])
        else:
            record[cm.lng] = ''
    except (ValueError, KeyError, TypeError) as err:
        cm.dump('Error in fetching lng: %s' % str(err), log_name)

    # Fill province/city from the address only where they are still empty,
    # running field_sense before and after.
    gs.field_sense(record)
    sensed = gs.addr_sense(record[cm.addr_e], record[cm.country_e])
    if sensed[1] is not None and record[cm.province_e] == '':
        record[cm.province_e] = sensed[1]
    if sensed[2] is not None and record[cm.city_e] == '':
        record[cm.city_e] = sensed[2]
    gs.field_sense(record)

    return record
Example #2
0
def fetch_store_details(data):
    url = data['host'] + data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    start = body.find(ur'<h3>available in store</h3>')
    if start != -1:
        type_sub = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0]
        entry[cm.store_type] = ', '.join(
            cm.html2plain(tmp).strip()
            for tmp in re.findall(ur'<li[^<>]*>(.+?)</li>', type_sub, re.S))

    start = body.find(ur"<div class='gmap_info_box'")
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    raw = json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['table']
    entry[cm.name_e] = cm.html2plain(raw['name'])
    entry[cm.city_e] = data['city'].strip().upper()
    entry[cm.country_e] = data['country'].strip().upper()
    # entry[cm.store_type] = data['store_type']
    entry[cm.addr_e] = cm.reformat_addr(raw['address'])
    m = re.search(re.compile(ur'phone:(.*?)fax:(.*?)', re.I | re.S),
                  raw['phone'])
    if m is not None:
        entry[cm.tel] = m.group(1).strip()
        entry[cm.fax] = m.group(2).strip()
    else:
        m = re.search(re.compile(ur'phone:(.*?)', re.I | re.S), raw['phone'])
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(re.compile(ur'fax:(.*?)', re.I | re.S), raw['phone'])
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
    entry[cm.hours] = raw['hours']
    if raw['lat'] is not None and raw['lat'] != '':
        entry[cm.lat] = string.atof(raw['lat'])
    if raw['lng'] is not None and raw['lng'] != '':
        entry[cm.lat] = string.atof(raw['lng'])
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None:
        entry[cm.province_e] = ret[1]
        gs.field_sense(entry)

    cm.dump(
        '(%s / %d) Found store: %s, %s (%s, %s)' %
        (data['brandname_e'], data['brand_id'], entry[cm.name_e],
         entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
        log_name)
    db.insert_record(entry, 'stores')
    return [entry]
Example #3
0
    def func(s):
        """Convert one raw store record into an entry and insert it into the database.

        Relies on `data`, `db` and `logger` from the enclosing scope. Returns
        None in every case: early when the store id is already in
        data['store_list'] or when no country name can be determined,
        otherwise after the record has been inserted and its id added to
        data['store_list'].

        :param s: raw store record (a mapping); read for 'id', 'latitude',
            'longitude', 'email', 'fax', 'ischanel', 'postypename',
            'openinghours', 'phone', 'translations', 'website', 'zipcode'
            and, optionally, 'country_id'.
        """
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.native_id] = int(s['id'])
        # Skip stores that have already been processed.
        if entry[cm.native_id] in data['store_list']:
            return

        entry[cm.lat] = float(s['latitude'])
        entry[cm.lng] = float(s['longitude'])
        entry[cm.email] = s['email']
        entry[cm.fax] = s['fax']
        entry[cm.store_class] = ' | '.join((str.format('ISCHANEL:{0}', 'YES' if s['ischanel'] != 0 else 'NO'),
                                            s['postypename']))
        try:
            # One "day:opening" item per element; a missing 'opening' becomes ''.
            entry[cm.hours] = ' | '.join(
                map(lambda val: ':'.join((val['day'], val['opening'] if 'opening' in val else '')), s['openinghours']))
        except TypeError as e:
            # Best effort: hours are left unset when the value cannot be
            # iterated/joined (presumably None — TODO confirm).
            pass
        entry[cm.tel] = s['phone']

        # Only the first translation is used for the textual fields.
        trans = s['translations'][0]
        entry[cm.addr_e] = cm.html2plain(
            ', '.join(filter(lambda val: val, (trans[key] for key in ('address1', 'address2')))))
        entry[cm.city_e] = cm.html2plain(trans['cityname'].strip().upper())
        entry[cm.name_e] = cm.html2plain(trans['name'])
        entry[cm.province_e] = cm.html2plain(trans['statename']).strip().upper()
        entry[cm.store_type] = ', '.join(temp['name'] for temp in trans['products'])
        entry[cm.url] = s['website']
        entry[cm.zip_code] = s['zipcode']

        # Resolve the country: first from the id -> name cache in
        # data['country_map'], then via gs.geocode2 on the coordinates, and
        # finally by asking the operator on the console.
        country_id = s['country_id'] if 'country_id' in s else None
        if country_id and country_id in data['country_map']:
            entry[cm.country_e] = data['country_map'][country_id]
        else:
            ret = gs.geocode2(latlng=str.format('{0},{1}', entry[cm.lat], entry[cm.lng]), logger=logger)
            country_e = None
            if len(ret) > 0:
                for item in ret[0]['address_components']:
                    if 'country' in item['types']:
                        country_e = item['long_name'].strip().upper()
                        break
            if not country_e:
                # NOTE(review): blocks on stdin, so this path needs an interactive session.
                country_e = raw_input(unicode.format(u'INPUT THE COUNTRY NAME FOR {0} AT {1}, {2}',
                                                     entry[cm.city_e], entry[cm.lat], entry[cm.lng])).decode('utf-8')
            if not country_e:
                # The country name could not be determined; discard this record.
                return
            entry[cm.country_e] = country_e
            if country_id:
                # Remember the answer so later stores with the same country id skip the lookup.
                data['country_map'][country_id] = country_e

        logger.info(('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                 entry[cm.name_e], entry[cm.addr_e],
                                                                 entry[cm.country_e],
                                                                 entry[cm.continent_e])))
        cm.insert_record(db, entry, data['table'])

        data['store_list'].add(entry[cm.native_id])
Example #4
0
    def f(m):
        store_name = m[0].strip()
        addr_str = m[1].strip()

        spl = addr_str.split('<br/>')
        store_type = cm.html2plain(spl[0].strip())

        store_addr = spl[1].strip()
        hour_idx = 2
        store_tel = ''
        for i in xrange(2, len(spl)):
            # If this is not a phone number:
            tel = cm.extract_tel(spl[i])
            if tel == '':
                store_addr += ', ' + spl[i]
                hour_idx = i + 1
            else:
                store_tel = spl[i].strip()
                hour_idx = i + 1
                break

        if hour_idx < len(spl):
            store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip()
        else:
            store_hour = ''

        # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]]))
        store_addr = cm.reformat_addr(store_addr)

        store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        cm.update_entry(store_entry,
                        {cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(),
                         cm.country_e: opt[cm.country_e].strip().upper(),
                         cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour,
                         cm.tel: store_tel})
        if opt.has_key(cm.province_e):
            store_entry[cm.province_e] = opt[cm.province_e]
        else:
            store_entry[cm.province_e] = ''
        store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0]

        gs.field_sense(store_entry)
        ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e])
        if ret[1] is not None and store_entry[cm.province_e] == '':
            store_entry[cm.province_e] = ret[1]
        if ret[2] is not None and store_entry[cm.city_e] == '':
            store_entry[cm.city_e] = ret[2]
        gs.field_sense(store_entry)

        print '%s Found store: %s, %s (%s, %s)' % (
            brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e],
            store_entry[cm.continent_e])
        db.insert_record(store_entry, 'stores')

        return store_entry
Example #5
0
def fetch_store_details(data):
    url = data['host'] + data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    start = body.find(ur'<h3>available in store</h3>')
    if start != -1:
        type_sub = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0]
        entry[cm.store_type] = ', '.join(
            cm.html2plain(tmp).strip() for tmp in re.findall(ur'<li[^<>]*>(.+?)</li>', type_sub, re.S))

    start = body.find(ur"<div class='gmap_info_box'")
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    raw = json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['table']
    entry[cm.name_e] = cm.html2plain(raw['name'])
    entry[cm.city_e] = data['city'].strip().upper()
    entry[cm.country_e] = data['country'].strip().upper()
    # entry[cm.store_type] = data['store_type']
    entry[cm.addr_e] = cm.reformat_addr(raw['address'])
    m = re.search(re.compile(ur'phone:(.*?)fax:(.*?)', re.I | re.S), raw['phone'])
    if m is not None:
        entry[cm.tel] = m.group(1).strip()
        entry[cm.fax] = m.group(2).strip()
    else:
        m = re.search(re.compile(ur'phone:(.*?)', re.I | re.S), raw['phone'])
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(re.compile(ur'fax:(.*?)', re.I | re.S), raw['phone'])
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
    entry[cm.hours] = raw['hours']
    if raw['lat'] is not None and raw['lat'] != '':
        entry[cm.lat] = string.atof(raw['lat'])
    if raw['lng'] is not None and raw['lng'] != '':
        entry[cm.lat] = string.atof(raw['lng'])
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None:
        entry[cm.province_e] = ret[1]
        gs.field_sense(entry)

    cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e]), log_name)
    db.insert_record(entry, 'stores')
    return [entry]
Example #6
0
def fetch_hk(data):
    loc_list = ('Hong Kong', 'Kowloon', 'Macau', 'New Territories')
    url = 'http://levi.com.hk/hk/storelocator'
    store_list = []
    for loc in loc_list:
        param = {'loc': loc}
        try:
            body = cm.get_data(url, param)
        except Exception, e:
            cm.dump('Error in fetching stores: %s' % param, log_name)
            continue

        start = body.find(ur'<div id="addWrapper">')
        if start == -1:
            cm.dump('Error in fetching stores: %s' % param, log_name)
            continue
        sub = cm.extract_closure(body[start:], ur'<ul>', ur'</ul>')[0]
        for s in re.findall(ur'<li>(.+?)</li>', sub, re.S):
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                        data['brandname_c'])
            entry[cm.country_e] = 'MACAU' if loc == 'Macau' else 'HONG KONG'
            entry[cm.city_e] = entry[cm.country_e]

            m = re.search(ur'<div id="addStore">([^<>]+)', s)
            entry[cm.addr_e] = cm.html2plain(m.group(1)) if m else ''

            m = re.search(ur'<div id="addAddress">([^<>]+)', s)
            tmp = cm.html2plain(m.group(1))
            pat = re.compile(ur'business hours?\s*[:\.]?\s*', re.I)
            if re.search(pat, tmp):
                entry[cm.hours] = re.sub(pat, '', tmp).strip()

            m = re.search(ur'<div id="addPhone">([^<>]+)', s)
            tmp = cm.html2plain(m.group(1))
            pat = re.compile(ur'(tel|phone|telephone)?\s*[:\.]?\s*', re.I)
            if re.search(pat, tmp):
                entry[cm.tel] = re.sub(pat, '', tmp).strip()

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            cm.dump(
                '(%s / %d) Found store: %s, %s (%s, %s)' %
                (data['brandname_e'], data['brand_id'], entry[cm.name_e],
                 entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
                log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)
Example #7
0
def fetch_stores(data):
    """Page through the JSON store feed for one country and insert every store.

    Requests pages 1..TotalPages, where the page count is read from each
    response. The region id -> name table for the country is cached in the
    module-level `region_map` on first use.

    :param data: crawl context; read for 'data_url', 'country_code',
        'brand_id', 'brandname_e' and 'brandname_c'.
    :return: list of inserted entries, or an empty tuple as soon as any page
        fails to download.
    """
    url = data['data_url']
    country_code = data['country_code']
    param = {'output': 'json', 'country': country_code, 'brand': 'dkny'}
    page = 0
    tot_page = -1  # unknown until the first response has been parsed
    store_list = []
    while True:
        page += 1
        if tot_page != -1 and page > tot_page:
            break
        param['p'] = page
        try:
            body = cm.get_data(url, param)
        except Exception:
            cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
            return ()

        raw = json.loads(body)
        tot_page = raw['Stores']['TotalPages']
        if country_code not in region_map:
            # Build the region (state) lookup table for this country.
            region_map[country_code] = dict((item['RegionId'], item['Name']) for item in raw['Regions'])
        regions = region_map[country_code]

        for s in raw['Stores']['Items']:
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.country_e] = country_code.upper()
            entry[cm.city_e] = cm.extract_city(s['City'])[0]
            entry[cm.name_e] = cm.html2plain(s['Name']).strip()
            entry[cm.addr_e] = cm.reformat_addr(s['Address'])
            entry[cm.tel] = s['Phone'].strip() if s['Phone'] else ''
            entry[cm.fax] = s['Fax'].strip() if s['Fax'] else ''
            entry[cm.email] = s['Email'].strip() if s['Email'] else ''
            entry[cm.lat] = s['Latitude'] if s['Latitude'] else ''
            entry[cm.lng] = s['Longitude'] if s['Longitude'] else ''
            region_id = s['RegionId']
            if region_id in regions:
                entry[cm.province_e] = cm.html2plain(regions[region_id]).strip().upper()

            # Fill province/city from the address only where still empty.
            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)
    # The collected entries were previously built but never handed back.
    return store_list
Example #8
0
def fetch_hk(data):
    loc_list = ('Hong Kong', 'Kowloon', 'Macau', 'New Territories')
    url = 'http://levi.com.hk/hk/storelocator'
    store_list = []
    for loc in loc_list:
        param = {'loc': loc}
        try:
            body = cm.get_data(url, param)
        except Exception, e:
            cm.dump('Error in fetching stores: %s' % param, log_name)
            continue

        start = body.find(ur'<div id="addWrapper">')
        if start == -1:
            cm.dump('Error in fetching stores: %s' % param, log_name)
            continue
        sub = cm.extract_closure(body[start:], ur'<ul>', ur'</ul>')[0]
        for s in re.findall(ur'<li>(.+?)</li>', sub, re.S):
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.country_e] = 'MACAU' if loc == 'Macau' else 'HONG KONG'
            entry[cm.city_e] = entry[cm.country_e]

            m = re.search(ur'<div id="addStore">([^<>]+)', s)
            entry[cm.addr_e] = cm.html2plain(m.group(1)) if m else ''

            m = re.search(ur'<div id="addAddress">([^<>]+)', s)
            tmp = cm.html2plain(m.group(1))
            pat = re.compile(ur'business hours?\s*[:\.]?\s*', re.I)
            if re.search(pat, tmp):
                entry[cm.hours] = re.sub(pat, '', tmp).strip()

            m = re.search(ur'<div id="addPhone">([^<>]+)', s)
            tmp = cm.html2plain(m.group(1))
            pat = re.compile(ur'(tel|phone|telephone)?\s*[:\.]?\s*', re.I)
            if re.search(pat, tmp):
                entry[cm.tel] = re.sub(pat, '', tmp).strip()

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)
Example #9
0
    def func(item):
        """Parse one store node, fingerprint it and insert it unless already seen.

        Relies on `data`, `url`, `db` and `logger` from the enclosing scope.

        :param item: callable selector for the store node; queried for 'h6',
            'p' and 'a.track_map[href]' (presumably a PyQuery object — TODO
            confirm).
        :return: the inserted entry, or None when its fingerprint is already
            in data['store_list'].
        """
        store = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        store[cm.name_e] = cm.html2plain(item('h6')[0].text).strip()

        # The first paragraph holds the address; when its last comma-separated
        # part is recognised as a phone number it is moved to the tel field.
        paragraph = unicode(pq(item('p')[0]))
        parts = [piece.strip() for piece in cm.reformat_addr(paragraph).split(',')]
        phone = cm.extract_tel(parts[-1])
        if phone != '':
            store[cm.tel] = phone
            parts.pop()
        store[cm.addr_e] = ', '.join(parts)

        # Fingerprint: md5 over the page url plus the map link, or plus the
        # address when the node has no map link.
        digest = hashlib.md5()
        digest.update(url)
        map_links = item('a.track_map[href]')
        if len(map_links) == 0:
            digest.update(store[cm.addr_e])
        else:
            map_href = map_links[0].attrib['href']
            digest.update(map_href)
            query_match = re.search(r'q=([^;]+?)&', cm.html2plain(map_href))
            if query_match:
                store['geo_query_param'] = query_match.group(1).replace('+', ' ')

        store[cm.native_id] = digest.hexdigest()
        if store[cm.native_id] in data['store_list']:
            return

        store[cm.country_e] = data['country']
        # Fill country/province/city from the address only where still empty.
        gs.field_sense(store)
        sensed = gs.addr_sense(store[cm.addr_e])
        if sensed[0] is not None and store[cm.country_e] == '':
            store[cm.country_e] = sensed[0]
        if sensed[1] is not None and store[cm.province_e] == '':
            store[cm.province_e] = sensed[1]
        if sensed[2] is not None and store[cm.city_e] == '':
            store[cm.city_e] = sensed[2]
        gs.field_sense(store)

        summary = '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], store[cm.name_e],
            store[cm.addr_e], store[cm.country_e], store[cm.continent_e])
        logger.info(summary)
        cm.insert_record(db, store, data['table'])
        return store
Example #10
0
def fetch_store_details(data):
    """Fetch the JSON details of the store at data's coordinates and insert it.

    :param data: crawl context; read for 'store_url', 'country_code',
        'city_e', 'lat', 'lng', 'brand_id', 'brandname_e', 'brandname_c'
        and 'country_e'.
    :return: a one-element list holding the entry, or [] when the request
        fails.
    """
    # http://maps.oasis-stores.com/index-v2.php?coutnryISO=GB&brand=oasis&lat=51.42014&lng=-0.20954
    url = data['store_url']
    # Looked up but not used further below.
    code = data['country_code']
    city = data['city_e']

    try:
        query = {
            'latitude': data['lat'],
            'longitude': data['lng'],
            'brand': 'oasis'
        }
        html = cm.get_data(url, query)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        })
        return []

    raw = json.loads(html)
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.name_e] = raw['name']

    # Join the non-empty address lines address1..address3.
    address_lines = (cm.html2plain(raw['address%d' % idx]).strip()
                     for idx in xrange(1, 4))
    entry[cm.addr_e] = ', '.join(
        [line for line in address_lines if line != ''])

    # Later keys override earlier ones, so 'county' wins when several are set.
    for region_key in ('countryRegion', 'state', 'county'):
        region = raw[region_key]
        if region is not None and region.strip() != '':
            entry[cm.province_e] = region.strip().upper()

    entry[cm.zip_code] = raw['postcode']
    entry[cm.country_e] = data['country_e']
    entry[cm.city_e] = cm.extract_city(data['city_e'])[0]
    entry[cm.lat] = string.atof(data['lat'])
    entry[cm.lng] = string.atof(data['lng'])
    entry[cm.tel] = raw['phone']
    entry[cm.email] = raw['email']

    weekdays = ('mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun')
    entry[cm.hours] = ', '.join(
        [raw[day + '_open_times'] for day in weekdays])

    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        data['brandname_e'], data['brand_id'], entry[cm.name_e],
        entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')

    return [entry]
Example #11
0
def fetch_countries(data):
    """Return one crawl context per country linked from the page at data['url'].

    An "Object moved" notice is followed by rewriting data['url'] in place
    and calling this function again.

    :param data: crawl context; read for 'url', 'host' and 'brand_id'.
    :return: list of copies of `data` with 'country_e', 'province_e' and
        'url' set; [] on download failure or when no country list is found.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    # Handle the redirect notice.
    redirect = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if redirect is not None:
        data['url'] = data['host'] + redirect.group(1)
        return fetch_countries(data)

    heading = re.search('<span class="country">Choose a country</span>', html)
    if heading is None:
        return []
    listing, _start, end = cm.extract_closure(html[heading.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for href, label in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', listing):
        context = data.copy()
        name = cm.html2plain(label).strip().upper()
        # Use the name returned by the lookup when there is one.
        known = gs.look_up(name, 1)
        if known is not None:
            name = known['name_e']
        context.update({'country_e': name, 'province_e': '', 'url': data['host'] + href})
        country_list.append(context)
    return country_list
Example #12
0
 def func(item):
     """Return a copy of the enclosing `data` describing the country behind `item`.

     The copy gains 'node_id' and 'country_id' (both the element's id
     attribute), 'country' (the upper-cased, stripped element text) and
     'url' (the copy's 'host' plus the element's href).
     """
     result = data.copy()
     node_id = item.attrib['id']
     result.update({
         'node_id': node_id,
         'country_id': node_id,
         'country': cm.html2plain(item.text).strip().upper(),
         'url': result['host'] + item.attrib['href'],
     })
     return result
Example #13
0
def fetch_stores(data):
    """Collect the detail-page link and name of every store listed on a page.

    Each store is introduced by an '<h2 property="dc:title"' heading; the
    text from that heading up to the next '</header>' is searched for the
    link.

    :param data: crawl context; read for cm.url, 'host' and 'brand_id'.
    :return: list of copies of `data` with cm.url and cm.name_e set; [] when
        the page cannot be downloaded.
    """
    url = data[cm.url]
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<h2 property="dc:title"', html):
        end = html.find('</header>', m.start())
        if end == -1:
            continue
        sub = html[m.start():end]
        m1 = re.search(ur'<a href="(.+?)">(.+?)</a></h2>', sub)
        if m1 is None:
            print 'Error: no more details for %s' % url
            continue
        d = data.copy()
        d[cm.url] = data['host'] + m1.group(1)
        d[cm.name_e] = cm.html2plain(m1.group(2)).strip()
        store_list.append(d)
    # The collected contexts were previously built but never handed back.
    return store_list
Example #14
0
def fetch_states(data):
    global national_added

    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching states: %s' % url, log_name)
        return []

    national_added = False

    m = re.search(ur'Choose a (state|region|province)', body)
    if m is None:
        d = data.copy()
        d['state'] = ''
        return [d]

    body = cm.extract_closure(body[m.start():], ur'<ul>', ur'</ul>')[0]
    results = []
    for m in re.findall(ur'<a href="([^"]+)">([^<>]+)</a>', body):
        d = data.copy()
        d['url'] = data['host'] + m[0]
        d['state'] = cm.html2plain(m[1]).strip().upper()
        results.append(d)
Example #15
0
def fetch_countries(data):
    """Return one crawl context per country linked from the page at data['url'].

    An "Object moved" notice is followed by rewriting data['url'] in place
    and calling this function again.

    :param data: crawl context; read for 'url', 'host' and 'brand_id'.
    :return: list of copies of `data` with 'country_e', 'province_e' and
        'url' set; [] on download failure or when no country list is found.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    # Handle the redirect notice.
    moved = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if moved is not None:
        data['url'] = data['host'] + moved.group(1)
        return fetch_countries(data)

    chooser = re.search('<span class="country">Choose a country</span>', html)
    if chooser is None:
        return []
    listing, _start, end = cm.extract_closure(html[chooser.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for link, text in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', listing):
        entry = data.copy()
        country_name = cm.html2plain(text).strip().upper()
        # Use the name returned by the lookup when there is one.
        match = gs.look_up(country_name, 1)
        if match is not None:
            country_name = match['name_e']
        entry.update({'country_e': country_name, 'province_e': '', 'url': data['host'] + link})
        country_list.append(entry)
    return country_list
Example #16
0
def fetch_stores(data):
    """Fetch the JSON store list for one country and insert every store.

    :param data: crawl context; read for 'url', 'country_code', 'brand_id',
        'brandname_e', 'brandname_c' and cm.country_e.
    :return: list of inserted entries; [] when the request fails.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {'rsp': 'json', 'country': data['country_code']})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    raw = json.loads(body)
    days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    store_list = []
    for s in raw['stores']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = cm.html2plain(s['name']).strip()

        addr_list = [cm.reformat_addr(s[key]) for key in ('address1', 'address2') if s[key].strip() != '']
        entry[cm.addr_e] = ' '.join(addr_list)

        # r=s['region'].strip().upper()
        # m = re.search(ur'\b([A-Z]{2})\b', r)
        # if data[cm.country_e]=='UNITED STATES' and m is not None:
        #     # United States
        #     ret = gs.look_up(m.group(1), 2)
        #     if ret is not None:
        #         r = ret['name_e']
        # entry[cm.province_e] = r

        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        entry[cm.zip_code] = s['zip'].strip()
        entry[cm.country_e] = data[cm.country_e]
        entry[cm.lat] = string.atof(s['lat'])
        entry[cm.lng] = string.atof(s['lng'])
        entry[cm.tel] = s['phone'].strip()
        entry[cm.fax] = s['fax'].strip()
        entry[cm.email] = s['emailaddress'].strip()
        entry[cm.url] = s['website'].strip()

        # 'openingHours' carries items of the form i:<digit>;s:<n>:"<text>";
        # the digit is used as an index into `days`.
        if 'openingHours' in s and s['openingHours'] is not None:
            opening = []
            for m in re.finditer(ur'i:(\d);s:\d+:\\?"([^\\"]+?)\\?"', s['openingHours']):
                opening.append('%s: %s' % (days[string.atoi(m.group(1))], m.group(2).strip()))
            entry[cm.hours] = ', '.join(opening)

        # Fill province/city from the address only where still empty.
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'],
                                                              entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e],
                                                              entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
    # The collected entries were previously built but never handed back.
    return store_list
Example #17
0
def fetch_states(data):
    global national_added

    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching states: %s' % url, log_name)
        return []

    national_added = False

    m = re.search(ur'Choose a (state|region|province)', body)
    if m is None:
        d = data.copy()
        d['state'] = ''
        return [d]

    body = cm.extract_closure(body[m.start():], ur'<ul>', ur'</ul>')[0]
    results = []
    for m in re.findall(ur'<a href="([^"]+)">([^<>]+)</a>', body):
        d = data.copy()
        d['url'] = data['host'] + m[0]
        d['state'] = cm.html2plain(m[1]).strip().upper()
        results.append(d)
Example #18
0
def fetch_store_list(url):
    """
    Fetch the list of stores embedded in the page's "var items" script data.

    :rtype : list of stores. Format: [{'name':**, 'lat':**, 'lng':**, 'type':**, 'url':**}]
        A store may lack some keys when the matching part could not be parsed.
        An empty list is returned when the download fails or the page has no
        "var items" section.
    :param url: address of the page to download.
    """
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'data': url
            },
            'brand_id': brand_id
        }
        cm.dump(dump_data)
        return []

    # Start parsing.
    # The data sits between "var items" and the next "var " declaration.
    start = html.find('var items')
    if start == -1:
        # Return a list here as on every other path (this used to be a dict).
        return []
    start += len('var items')
    end = html.find('var ', start)
    if end == -1:
        # No later declaration: take the rest of the page rather than letting
        # the -1 slice silently drop the final character.
        end = len(html)
    html = html[start:end]

    pattern = ur'\[(.+?)\]'
    store_list = []
    for m in re.findall(pattern, html, re.S):
        store_entry = {}
        # '.' does not cross newlines here, so this yields at most one quoted
        # span per line; the 1st, 3rd and 5th are used below.
        m_list = re.findall(ur"'(.*)'", m)
        try:
            store_entry['name'] = cm.html2plain(m_list[0].strip())
            store_entry['type'] = m_list[2].strip()
            store_entry['url'] = m_list[4].strip()
        except IndexError:
            print 'Index error: %s' % m
        # Drop the quoted content, then look for the coordinates.
        m_list = re.findall(ur'(-?\d+\.\d+)', re.subn(ur"'(.*)'", '', m)[0])
        try:
            lat = string.atof(m_list[0])
            lng = string.atof(m_list[1])
            store_entry['lat'] = lat
            store_entry['lng'] = lng
        except (IndexError, ValueError):
            print 'Index error in getting coordinates: %s' % m

        # test
        # if 'hong-kong' in store_entry['url'] or 'taichung' in store_entry['url']:
        if store_entry:
            store_list.append(store_entry)
    return store_list
Example #19
0
    def process_text(self, val):
        val = cm.html2plain(val.strip())
        # <br/>换成换行符
        val = re.sub(ur'<\s*br\s*/?\s*>', u'\n', val)

        # 去掉多余的标签
        val = re.sub(ur'<[^<>]*?>', u'', val)

        return val
Example #20
0
def fetch_stores(data):
    """
    Fetch the stores of one country, save each to the database and return them.

    :param data: dict providing "host", "country_url" (a format string filled
                 with "country_id"), "brand_id", "brandname_e" and "brandname_c".
    :return: list of the entries built from the "rawPos" array of the JSON
             response; an empty list when the download fails.
    """
    url = data["host"] + data["country_url"] % data["country_id"]
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump("Error in fetching countries: %s" % url, log_name)
        return []

    store_list = []
    for shop in json.loads(body)["rawPos"]:
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])

        # address1 .. address4, blanks left out
        addr_lines = [cm.html2plain(shop["address%d" % idx]).strip() for idx in xrange(1, 5)]
        entry[cm.addr_e] = ", ".join([line for line in addr_lines if line != ""])

        entry[cm.city_e] = cm.extract_city(shop["city"]["name"])[0]
        entry[cm.country_e] = shop["country"]["countryCode"]
        entry[cm.email] = shop["email"]
        entry[cm.fax] = shop["fax"]

        # Coordinates are converted only when the source value is non-empty.
        for field, source in ((cm.lat, "latitude"), (cm.lng, "longitude")):
            if shop[source] != "":
                entry[field] = string.atof(shop[source])

        entry[cm.hours] = cm.reformat_addr(shop["openingSchedule"])

        phones = [shop[key].strip() for key in ("phone1", "phone2")]
        entry[cm.tel] = ", ".join([number for number in phones if number != ""])

        entry[cm.zip_code] = shop["postalCode"]
        entry[cm.name_e] = shop["shopName"]

        # Fill province/city from the address lookup only where still empty.
        gs.field_sense(entry)
        sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if entry[cm.province_e] == "" and sensed[1] is not None:
            entry[cm.province_e] = sensed[1]
        if entry[cm.city_e] == "" and sensed[2] is not None:
            entry[cm.city_e] = sensed[2]
        gs.field_sense(entry)

        summary = (
            data["brandname_e"],
            data["brand_id"],
            entry[cm.name_e],
            entry[cm.addr_e],
            entry[cm.country_e],
            entry[cm.continent_e],
        )
        cm.dump("(%s / %d) Found store: %s, %s (%s, %s)" % summary, log_name)
        db.insert_record(entry, "stores")
        store_list.append(entry)

    return store_list
Example #21
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    m = re.search(ur'var\s+retailers\s*=\s*', body)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    end = body.find(u']', m.end())
    if end == -1:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    pat = re.compile(ur'[\{,]([a-zA-Z_\d]+):')

    store_list = []
    for s in json.loads(
            re.sub(re.compile(ur'([\{,])([a-zA-Z_\d]+):'), ur'\1"\2":',
                   body[m.end():end + 1])):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        name_list = []
        for tmp in ['name', 'name_line_2']:
            if tmp in s and s[tmp] is not None and cm.html2plain(
                    s[tmp]).strip() != '':
                name_list.append(cm.html2plain(s[tmp]).strip())
        entry[cm.name_e] = ', '.join(name_list)
        addr_list = []
        for tmp in ['address', 'address_line_2']:
            if tmp in s and s[tmp] is not None and cm.html2plain(
                    s[tmp]).strip() != '':
                addr_list.append(cm.html2plain(s[tmp]).strip())
        entry[cm.addr_e] = ', '.join(addr_list)
        entry[cm.country_e] = s['country'].strip().upper()
        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        region = cm.html2plain(s['region'])
        if re.search(
                ur'\d+',
                region) is None and '&' not in region and ';' not in region:
            entry[cm.province_e] = region.strip().upper()
Example #22
0
def fetch_stores(data):
    # country=Greece&city=ATHENS&adutl=+01&kids=+02&undercolor=+06&togetmap=mapdata
    url = data['data_url']
    param = {'country': data['country'], 'city': data['city'], 'adutl': ' 01', 'kids': ' 02',
             'undercolor': ' 06', 'togetmap': 'mapdata'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), 'benetton_log.txt', False)
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.findall(ur'<marker (.+?)>', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m1 = re.search(ur'name=\\"(.+?)\\"', m)
        if m1 is not None:
            entry[cm.name_e] = cm.html2plain(m1.group(1).strip().replace(u'\\', ''))
        m1 = re.search(ur'address=\\"(.+?)\\"', m)
        if m1 is not None:
            addr = cm.reformat_addr(cm.html2plain(m1.group(1)).replace(u'\\', ''))
            tel = cm.extract_tel(addr)
            if tel != '':
                entry[cm.tel] = tel
                addr = addr.replace(tel, '')
            entry[cm.addr_e] = cm.reformat_addr(addr)

        m1 = re.search(ur'lat=\\"(.+?)\\"', m)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))

        m1 = re.search(ur'lng=\\"(.+?)\\"', m)
        if m1 is not None:
            entry[cm.lng] = string.atof(m1.group(1))

        entry[cm.country_e] = data['country'].strip().upper()
        entry[cm.city_e] = cm.extract_city(data['city'])[0]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), 'benetton_log.txt', False)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Example #23
0
def fetch_cities(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, log_name)
        return []

    m = re.search(ur'Choose a city', body)
    if m is None:
        cm.dump('Error in fetching cities: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[m.start():], ur'<ul>', ur'</ul>')[0]
    results = []
    for m in re.findall(ur'<a href="([^"]+)">([^<>]+)</a>', body):
        d = data.copy()
        d['url'] = data['host'] + cm.html2plain(m[0])
        d['city'] = cm.html2plain(m[1]).strip().upper()
        results.append(d)
Example #24
0
def fetch_cities(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, log_name)
        return []

    m = re.search(ur'Choose a city', body)
    if m is None:
        cm.dump('Error in fetching cities: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[m.start():], ur'<ul>', ur'</ul>')[0]
    results = []
    for m in re.findall(ur'<a href="([^"]+)">([^<>]+)</a>', body):
        d = data.copy()
        d['url'] = data['host'] + cm.html2plain(m[0])
        d['city'] = cm.html2plain(m[1]).strip().upper()
        results.append(d)
Example #25
0
def proc_store(sub, data):
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.country_e] = data['country']
    m1 = re.search(ur'<strong class="name" itemprop="name">([^<>]+)</strong>',
                   sub)
    if m1 is not None:
        entry[cm.store_class] = m1.group(1).strip()

    m1 = re.search(ur'<span itemprop="address"', sub)
    if m1 is not None:
        addr_sub = cm.extract_closure(sub[m1.start():], ur'<span\b',
                                      ur'</span>')[0]
        m2 = re.search(ur'<span itemprop="postal-code">([^<>]+)</span>',
                       addr_sub, re.S)
        if m2 is not None:
            entry[cm.zip_code] = m2.group(1).strip()
        m2 = re.search(ur'<span itemprop="locality">([^<>]+)</span>', addr_sub,
                       re.S)
        if m2 is not None:
            entry[cm.city_e] = cm.html2plain(m2.group(1)).strip().upper()
        entry[cm.addr_e] = cm.reformat_addr(addr_sub)

    m2 = re.search(ur'<span itemprop="tel">([^<>]+)</span>', sub, re.S)
    if m2 is not None:
        entry[cm.tel] = m2.group(1).strip()

    m2 = re.search(ur'Fax\b(.+?)</p>', sub)
    if m2 is not None:
        entry[cm.fax] = cm.extract_tel(m2.group(1))

    m2 = re.search(
        ur'<a href="([^"]+)"[^<>]+itemprop="url"\s*>\s*Find on a map\s*</a>',
        sub)
    if m2 is not None:
        geo_url = data['host'] + urllib.quote(m2.group(1).encode('utf-8'))
        param = {
            'brepairs': True,
            'restrictedtemplate': 2,
            'bretailers': True,
            'bshops': True,
            'brepairs': True
        }
        try:
            geo_body = cm.get_data(geo_url, param)
            m3 = re.search(
                ur'maps\.google\.com/maps\?daddr\s*=\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)',
                geo_body)
            if m3 is not None:
                entry[cm.lat] = string.atof(m3.group(1))
                entry[cm.lng] = string.atof(m3.group(2))
        except Exception, e:
            cm.dump('Error in fetching geo info: %s, %s' % (geo_url, param),
                    log_name)
Example #26
0
def fetch_stores(data):
    """
    Fetch the stores of one city, save each to the database and return them.

    :param data: dict providing 'url', 'city_id', 'country_code', 'brand_id',
                 'brandname_e' and 'brandname_c'.
    :return: list of the entries built from root/DATA/stores of the JSON
             response; an empty list when the request fails.
    """
    url = data['url']
    param = {
        'action': 'getStoresByCity',
        'idCity': data['city_id'],
        'filter': 'clothing;lacoste%20l!ve'
    }
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []

    raw = json.loads(body)['root']['DATA']['stores']
    store_list = []
    for s in [tmp['store'] for tmp in raw]:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.name_e] = s['name'].strip()
        entry[cm.country_e] = data['country_code']
        entry[cm.addr_e] = cm.html2plain(s['address']).strip()
        entry[cm.store_type] = s['category'].strip()
        entry[cm.city_e] = cm.extract_city(s['city'])[0]

        # The remaining fields may be null in the response and are then left
        # at whatever init_store_entry provided.
        if s['email'] is not None:
            entry[cm.email] = s['email'].strip()
        if s['fax'] is not None:
            entry[cm.fax] = s['fax'].strip()
        if s['infoHours'] is not None:
            entry[cm.hours] = s['infoHours'].strip()
        if s['latitude'] is not None and s['latitude'].strip() != '':
            entry[cm.lat] = string.atof(s['latitude'])
        if s['longitude'] is not None and s['longitude'].strip() != '':
            # Longitude belongs in cm.lng; it used to overwrite cm.lat.
            entry[cm.lng] = string.atof(s['longitude'])
        if s['phone'] is not None:
            entry[cm.tel] = s['phone'].strip()
        if s['postCode'] is not None:
            entry[cm.zip_code] = s['postCode'].strip()

        # Fill province/city from the address lookup only where still empty.
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
            log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
    return store_list
Example #27
0
def fetch_store_list(data, logger=None):
    """
    Walk the country / city / store link hierarchy of the store-locator page at
    data['url'] and collect one dict per store link.

    :param data: dict providing 'url'; every result is a copy of it extended
                 with 'country_id', 'country', 'city_id', 'city', 'url',
                 'store_id' and 'store'.
    :param logger: optional logger; the root logger is used when omitted. It is
                   not referenced again within this block.
    :return: an empty list when the download fails or the container is not
             found. ``results`` is filled but no explicit return follows in
             this block.
    """
    if not logger:
        logger = logging.getLogger()

    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store lists: %s' % url, log_name)
        return []

    # NOTE(review): body becomes the result of a pq(...) selection here, yet
    # the code below treats it like a string (find() compared with -1,
    # slicing) - confirm this behaves as intended.
    body = pq(body)('div.store-country #shop-list')

    start = body.find(ur"<div class='store-country'>")
    if start == -1:
        cm.dump('Error in fetching store lists: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    # Split the container into the "Stores" part and the "outlets" part.
    start_stores = body.find(
        ur'<h3><a href="/store-locator/index">Stores</a></h3>')
    start_outlets = body.find(ur"<h3 class='outlets'>")
    store_sub = body[start_stores:start_outlets]
    # outlet_sub is computed but not used within this block.
    outlet_sub = body[start_outlets:]

    results = []
    # Level 1: country links; the <ul> closure following each holds its cities.
    for m1 in re.finditer(
            ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="country">([^<>]+)</a>',
            store_sub):
        country_id = string.atoi(m1.group(1))
        country = m1.group(2).strip()
        sub1 = cm.extract_closure(store_sub[m1.end():], ur'<ul>', ur'</ul>')[0]
        # Level 2: city links; the <ul> closure following each holds its stores.
        for m2 in re.finditer(
                ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="city">([^<>]+)</a>',
                sub1):
            city_id = string.atoi(m2.group(1))
            city = m2.group(2).strip()
            sub2 = cm.extract_closure(sub1[m2.end():], ur'<ul>', ur'</ul>')[0]
            # Level 3: the store links themselves.
            for m3 in re.finditer(
                    ur'<a href="([^"]+)"[^<>]*data-id="([^"]+)"[^<>]*data-type="store">([^<>]+)</a>',
                    sub2):
                d = data.copy()
                d['country_id'] = country_id
                d['country'] = country
                d['city_id'] = city_id
                d['city'] = city
                d['url'] = m3.group(1).strip()
                d['store_id'] = string.atoi(m3.group(2))
                d['store'] = cm.html2plain(m3.group(3).strip())
                # d['store_type'] = 'store'
                results.append(d)
Example #28
0
def fetch_stores(data):
    """
    Post a city query and turn every ``<ul>`` of the response into a store
    entry that is saved to the database.

    :param data: dict providing 'post_shops', 'city_e', 'brand_id',
                 'brandname_e', 'brandname_c' and the cm.country_e / cm.city_e keys.
    :return: an empty list when the request fails. No explicit return follows
             the parsing loop in this block.
    """
    url = data['post_shops']
    param = {'city': data['city_e'], 'paulandjoe_women': 0, 'paulandjoe_man': 0,
             'paulandjoe_sister': 0, 'paulandjoe_little': 0, 'paulandjoe_beauty': 0}
    try:
        html = cm.post_data(url, param)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    store_list = []
    try:
        for store in (pq(node) for node in pq(html)('ul')):
            try:
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                entry[cm.name_e] = cm.html2plain(store('li.first')[0].text).strip()
                entry[cm.country_e] = data[cm.country_e]
                entry[cm.city_e] = data[cm.city_e]

                # Every <li> except the first contributes one address line.
                lines = (cm.reformat_addr(unicode(pq(item))) for item in store('li[class!="first"]'))
                addr_list = [line for line in lines if line != '']

                # When the last line holds a phone number it is moved to the
                # tel field and dropped from the address.
                tel = cm.extract_tel(addr_list[-1])
                if tel != '':
                    entry[cm.tel] = tel
                    del addr_list[-1]
                entry[cm.addr_e] = ', '.join(addr_list)

                # Fill country/province/city from the address lookup only
                # where still empty.
                gs.field_sense(entry)
                sensed = gs.addr_sense(entry[cm.addr_e])
                if entry[cm.country_e] == '' and sensed[0] is not None:
                    entry[cm.country_e] = sensed[0]
                if entry[cm.province_e] == '' and sensed[1] is not None:
                    entry[cm.province_e] = sensed[1]
                if entry[cm.city_e] == '' and sensed[2] is not None:
                    entry[cm.city_e] = sensed[2]
                gs.field_sense(entry)

                summary = (data['brandname_e'], data['brand_id'], entry[cm.name_e],
                           entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
                print '(%s/%d) Found store: %s, %s (%s, %s)' % summary
                store_list.append(entry)
                db.insert_record(entry, 'stores')
            except (IndexError, TypeError):
                # A malformed store is logged and skipped; the loop goes on.
                cm.dump(u'Error in parsing %s, %s' % (url, param), log_name)
                print traceback.format_exc()
    except Exception:
        print traceback.format_exc()
Example #29
0
def fetch_store_list(url):
    """
    Download the page at ``url`` and extract the store records embedded in it.

    The records are searched for between the text ``var items`` and the next
    ``var `` that follows it; every ``[...]`` group found there yields one dict.

    :param url: page address handed to ``cm.get_data``.
    :rtype : list of stores, in the form [{'name':**, 'lat':**, 'lng':**, 'type':**, 'url':**}].
             A record may lack some keys when part of it cannot be parsed. An
             empty list is returned when the download fails or ``var items``
             is not present.
    """
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'data': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    # Start parsing: the data section sits between "var items" and the next "var ".
    marker = 'var items'
    start = html.find(marker)
    if start == -1:
        # Same empty-list result as the download-failure path (used to be a dict).
        return []
    start += len(marker)
    end = html.find('var ', start)
    if end == -1:
        # No declaration follows: keep everything up to the end of the page
        # rather than letting the -1 index silently drop the last character.
        end = len(html)
    html = html[start:end]

    store_list = []
    for m in re.findall(ur'\[(.+?)\]', html, re.S):
        store_entry = {}
        # The 1st, 3rd and 5th quoted matches are used as name, type and url.
        m_list = re.findall(ur"'(.*)'", m)
        try:
            store_entry['name'] = cm.html2plain(m_list[0].strip())
            store_entry['type'] = m_list[2].strip()
            store_entry['url'] = m_list[4].strip()
        except IndexError:
            print 'Index error: %s' % m

        # Remove the quoted content before looking for the coordinates.
        m_list = re.findall(ur'(-?\d+\.\d+)', re.subn(ur"'(.*)'", '', m)[0])
        try:
            lat = string.atof(m_list[0])
            lng = string.atof(m_list[1])
            store_entry['lat'] = lat
            store_entry['lng'] = lng
        except (IndexError, ValueError):
            print 'Index error in getting coordinates: %s' % m

        # Records from which nothing could be extracted are skipped.
        if store_entry:
            store_list.append(store_entry)
    return store_list
Example #30
0
def fetch_stores(data):
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    code = data['country_code']
    if gs.look_up(code, 1) is None:
        entry[cm.country_e] = cm.html2plain(data['country']).strip().upper()
    else:
        entry[cm.country_e] = code
    entry[cm.name_e] = data['store_name']
    entry[cm.city_e] = cm.extract_city(data['city'])[0]
    entry[cm.lat] = data['lat'] if data['lat'] is not None else ''
    entry[cm.lng] = data['lng'] if data['lng'] is not None else ''

    m = re.search(ur'data-boutique\s*=\s*"%s"' % data['store_id'], data['content'])
    sub = data['content'][m.end():]

    m1 = re.search(ur'<li class="isDistributeur[^<>]+>(.+?)</li>', sub)
    if m1 is not None:
        entry[cm.store_class] = cm.reformat_addr(m1.group(1))

    m1 = re.search(ur'<li class="place-title[^<>]+>(.+?)</li>', sub, re.S)
    if m1 is not None:
        entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

    m1 = re.search(ur'<li class="contacts[^<>]+>(.+?)</li>', sub, re.S)
    if m1 is not None:
        m2 = re.search(ur'<a class="popupLaunch" href="([^"]+)"', m1.group(1))
        if m2:
            entry = fetch_details(data, m2.group(1), entry)

        m2 = re.search(ur'<p>(.+?)</p>', m1.group(1), re.S)
        if m2:
            ct_list = tuple(tmp.strip() for tmp in cm.reformat_addr(m2.group(1)).split(','))
            entry[cm.tel] = cm.extract_tel(ct_list[0])
            if len(ct_list) > 1:
                entry[cm.email] = ct_list[1].strip()

    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e]), log_name)
    db.insert_record(entry, 'stores')

    return tuple(entry)
Example #31
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    m = re.search(ur'var\s+retailers\s*=\s*', body)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    end = body.find(u']', m.end())
    if end == -1:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    pat = re.compile(ur'[\{,]([a-zA-Z_\d]+):')

    store_list = []
    for s in json.loads(re.sub(re.compile(ur'([\{,])([a-zA-Z_\d]+):'), ur'\1"\2":', body[m.end():end + 1])):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        name_list = []
        for tmp in ['name', 'name_line_2']:
            if tmp in s and s[tmp] is not None and cm.html2plain(s[tmp]).strip() != '':
                name_list.append(cm.html2plain(s[tmp]).strip())
        entry[cm.name_e] = ', '.join(name_list)
        addr_list = []
        for tmp in ['address', 'address_line_2']:
            if tmp in s and s[tmp] is not None and cm.html2plain(s[tmp]).strip() != '':
                addr_list.append(cm.html2plain(s[tmp]).strip())
        entry[cm.addr_e] = ', '.join(addr_list)
        entry[cm.country_e] = s['country'].strip().upper()
        entry[cm.city_e] = cm.extract_city(s['city'])[0]
        region = cm.html2plain(s['region'])
        if re.search(ur'\d+', region) is None and '&' not in region and ';' not in region:
            entry[cm.province_e] = region.strip().upper()
Example #32
0
def reformat(text):
    """
    格式化字符串,将多余的空格、换行、制表符等合并
    """
    if text is None:
        return None
    text = cm.html2plain(text.strip())
    # <br/>换成换行符
    text = re.sub(ur'<\s*br\s*/?>', u'\r\n', text)
    # 去掉多余的标签
    text = re.sub(ur'<[^<>]*?>', u'', text)
    # # 换行转换
    text = re.sub('[\r\n]+', '\r', text)
    # text = re.subn(ur'(?:[\r\n])+', ', ', text)[0]
    return text
Example #33
0
def fetch_store_details(data):
    """
    Fetch the JSON details of the store at the given coordinates, build its
    entry and save it to the database.

    :param data: dict providing 'store_url', 'country_code', 'city_e',
                 'country_e', 'lat', 'lng', 'brand_id', 'brandname_e' and
                 'brandname_c'.
    :return: a one-element list holding the entry; an empty list when the
             request fails.
    """
    # Example: http://maps.oasis-stores.com/index-v2.php?coutnryISO=GB&brand=oasis&lat=51.42014&lng=-0.20954
    url = data['store_url']
    # Looked up but not used further; kept so that a missing key still fails here.
    code = data['country_code']
    city = data['city_e']

    try:
        html = cm.get_data(url, {'latitude': data['lat'], 'longitude': data['lng'], 'brand': 'oasis'})
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    raw = json.loads(html)
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.name_e] = raw['name']

    # address1 .. address3, blanks left out
    addr_lines = (cm.html2plain(raw['address%d' % idx]).strip() for idx in xrange(1, 4))
    entry[cm.addr_e] = ', '.join([line for line in addr_lines if line != ''])

    # Later non-blank fields override earlier ones: countryRegion < state < county.
    for key in ('countryRegion', 'state', 'county'):
        state = raw[key]
        if state is not None and state.strip() != '':
            entry[cm.province_e] = state.strip().upper()

    entry[cm.zip_code] = raw['postcode']
    entry[cm.country_e] = data['country_e']
    entry[cm.city_e] = cm.extract_city(data['city_e'])[0]
    entry[cm.lat] = string.atof(data['lat'])
    entry[cm.lng] = string.atof(data['lng'])
    entry[cm.tel] = raw['phone']
    entry[cm.email] = raw['email']

    weekdays = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
    entry[cm.hours] = ', '.join([raw[day + '_open_times'] for day in weekdays])

    gs.field_sense(entry)
    summary = (data['brandname_e'], data['brand_id'], entry[cm.name_e],
               entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
    print '(%s / %d) Found store: %s, %s (%s, %s)' % summary
    db.insert_record(entry, 'stores')

    return [entry]
Example #34
0
def fetch_stores(data):
    """
    Fetch the stores of one country, save each to the database and return them.

    :param data: dict providing 'host', 'country_url' (a format string filled
                 with 'country_id'), 'brand_id', 'brandname_e' and 'brandname_c'.
    :return: list of the entries built from the 'rawPos' array of the JSON
             response; an empty list when the download fails.
    """
    url = data['host'] + data['country_url'] % data['country_id']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []

    store_list = []
    for shop in json.loads(body)['rawPos']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

        # address1 .. address4, blanks left out
        addr_lines = [cm.html2plain(shop['address%d' % idx]).strip() for idx in xrange(1, 5)]
        entry[cm.addr_e] = ', '.join([line for line in addr_lines if line != ''])

        entry[cm.city_e] = cm.extract_city(shop['city']['name'])[0]
        entry[cm.country_e] = shop['country']['countryCode']
        entry[cm.email] = shop['email']
        entry[cm.fax] = shop['fax']

        # Coordinates are converted only when the source value is non-empty.
        for field, source in ((cm.lat, 'latitude'), (cm.lng, 'longitude')):
            if shop[source] != '':
                entry[field] = string.atof(shop[source])

        entry[cm.hours] = cm.reformat_addr(shop['openingSchedule'])

        phones = [shop[key].strip() for key in ('phone1', 'phone2')]
        entry[cm.tel] = ', '.join([number for number in phones if number != ''])

        entry[cm.zip_code] = shop['postalCode']
        entry[cm.name_e] = shop['shopName']

        # Fill province/city from the address lookup only where still empty.
        gs.field_sense(entry)
        sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if entry[cm.province_e] == '' and sensed[1] is not None:
            entry[cm.province_e] = sensed[1]
        if entry[cm.city_e] == '' and sensed[2] is not None:
            entry[cm.city_e] = sensed[2]
        gs.field_sense(entry)

        summary = (data['brandname_e'], data['brand_id'], entry[cm.name_e],
                   entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % summary, log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return store_list
Example #35
0
    def parse_product_list(self, response):
        """
        Yield one product-detail request per item link of a listing page.

        Each request carries its own deep copy of the 'userdata' found in
        ``response.meta`` and is handled by ``parse_product_details``
        (``onerr`` on failure).

        :param response: listing-page response whose meta holds 'userdata'.
        """
        metadata = response.meta['userdata']
        item_links = Selector(response).xpath(
            '//div[@id="elementsContainer"]/div[contains(@id,"item")]/a[@class="itemImage" and @href]'
        )
        for node in item_links:
            # Whitespace inside the href is removed before it is resolved
            # against the page URL.
            href = re.sub(r'\s', '', cm.html2plain(node._root.attrib['href']))
            target = self.process_href(href, response.url)
            userdata = copy.deepcopy(metadata)
            yield Request(url=target,
                          meta={'userdata': userdata},
                          dont_filter=True,
                          callback=self.parse_product_details,
                          errback=self.onerr)
Example #36
0
def fetch_stores(data):
    """
    Fetch the stores of one city, save each to the database and return them.

    :param data: dict providing 'url', 'city_id', 'country_code', 'brand_id',
                 'brandname_e' and 'brandname_c'.
    :return: list of the entries built from root/DATA/stores of the JSON
             response; an empty list when the request fails.
    """
    url = data['url']
    param = {'action': 'getStoresByCity', 'idCity': data['city_id'],
             'filter': 'clothing;lacoste%20l!ve'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []

    raw = json.loads(body)['root']['DATA']['stores']
    store_list = []
    for s in [tmp['store'] for tmp in raw]:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = s['name'].strip()
        entry[cm.country_e] = data['country_code']
        entry[cm.addr_e] = cm.html2plain(s['address']).strip()
        entry[cm.store_type] = s['category'].strip()
        entry[cm.city_e] = cm.extract_city(s['city'])[0]

        # The remaining fields may be null in the response and are then left
        # at whatever init_store_entry provided.
        if s['email'] is not None:
            entry[cm.email] = s['email'].strip()
        if s['fax'] is not None:
            entry[cm.fax] = s['fax'].strip()
        if s['infoHours'] is not None:
            entry[cm.hours] = s['infoHours'].strip()
        if s['latitude'] is not None and s['latitude'].strip() != '':
            entry[cm.lat] = string.atof(s['latitude'])
        if s['longitude'] is not None and s['longitude'].strip() != '':
            # Longitude belongs in cm.lng; it used to overwrite cm.lat.
            entry[cm.lng] = string.atof(s['longitude'])
        if s['phone'] is not None:
            entry[cm.tel] = s['phone'].strip()
        if s['postCode'] is not None:
            entry[cm.zip_code] = s['postCode'].strip()

        # Fill province/city from the address lookup only where still empty.
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
    return store_list
Example #37
0
def fetch_states(data):
    print '(%s/%d) Found country: %s' % (data['brandname_e'], data['brand_id'],
                                         data['country_e'])
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # 处理重定向
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)

    m = re.search('<span class="state">Choose a state/provence</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    state_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        province_e = cm.html2plain(m[1]).strip().upper()
        if data['country_e'] == 'CHINA':
            # 去掉省中间的空格
            province_e = province_e.replace(' ', '')
        ret = gs.look_up(province_e, 2)
        if ret is not None:
            province_e = ret['name_e']
        d = data.copy()
        d['province_e'] = province_e
        d['url'] = data['host'] + m[0]
        state_list.append(d)

    return state_list
Example #38
0
def fetch_regions(data):
    url = data['location_url']
    try:
        body = cm.get_data(url, {'lang': data['lang'], 'country': data['country_id']})
    except Exception:
        cm.dump('Error in fetching regions: %s, %s' % (url, data['country']), 'tudor_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    results = []
    for item in pq(body.encode('utf-8'))('region[id!=""]'):
        d = data.copy()
        d['region_id'] = string.atoi(item.attrib['id'])
        tmp = cm.html2plain(item.attrib['name']).strip().upper()
        d['region_name'] = re.sub(ur'市$', '', re.sub(ur'省$', '', tmp).strip()).strip()
        results.append(d)
    return results
Example #39
0
def fetch_countries(data):
    """
    Fetch the country list.

    :param data: dict providing 'country_url'.
    :return: one copy of ``data`` per JSON item with a non-blank title,
             extended with 'country' and 'key'; an empty list when the request
             fails.
    """
    url = data['country_url']
    param = {'myid': '400-all', 'idioma': 'in'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []

    results = []
    for item in json.loads(body):
        # Items without a title are ignored.
        if item['title'].strip() != '':
            d = data.copy()
            d['country'] = cm.html2plain(item['title']).strip().upper()
            d['key'] = item['key']
            results.append(d)
    return results
Example #40
0
def fetch_regions(data):
    url = data['location_url']
    try:
        body = cm.get_data(url, {'lang': data['lang'], 'country': data['country_id']})
    except Exception:
        cm.dump('Error in fetching regions: %s, %s' % (url, data['country']), 'tudor_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    results = []
    for item in pq(body.encode('utf-8'))('region[id!=""]'):
        d = data.copy()
        d['region_id'] = string.atoi(item.attrib['id'])
        tmp = cm.html2plain(item.attrib['name']).strip().upper()
        d['region_name'] = re.sub(ur'市$', '', re.sub(ur'省$', '', tmp).strip()).strip()
        results.append(d)
    return results
Example #41
0
def fetch_countries(data):
    """Build one task dict for every country entry that has a title.

    Args:
        data: task dict; only 'country_url' is read.

    Returns:
        A list of ``data`` copies, each with 'country' and 'key' filled in
        from the JSON response.  The list is empty when the request raises.
    """
    source = data['country_url']
    arguments = {'myid': '400-all', 'idioma': 'in'}
    try:
        raw_json = cm.get_data(source, arguments)
    except Exception:
        cm.dump('Error in fetching countries: %s, %s' % (source, arguments), log_name)
        return []

    tasks = []
    for entry in json.loads(raw_json):
        # Entries with an empty (or whitespace-only) title are ignored.
        if not entry['title'].strip():
            continue
        task = data.copy()
        task['country'] = cm.html2plain(entry['title']).strip().upper()
        task['key'] = entry['key']
        tasks.append(task)
    return tasks
Example #42
0
def fetch_store_details(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    start = body.find(ur'<div class="col first" itemprop="address"')
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])

    addr = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
    m = re.search(ur'<span itemprop="postalCode">([^<>]+)</span>', addr, re.S)
    if m is not None:
        entry[cm.zip_code] = m.group(1).strip()
    entry[cm.addr_e] = cm.reformat_addr(addr)

    start = body.find(ur'<div class="col" itemprop="contactPoints"')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
        m = re.search(ur'<span itemprop="telephone">([^<>]+)</span>', sub,
                      re.S)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'<span itemprop="faxNumber">([^<>]+)</span>', sub,
                      re.S)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()

    start = body.find(ur'<h2>opening hours</h2>')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<table\b', ur'</table>')[0]
        tmp = []
        for m in re.findall(ur'<td>(.+?)</td>', sub):
            tmp.append(cm.html2plain(m).strip())
        entry[cm.hours] = ' '.join(tmp)
Example #43
0
def fetch_cities(data):
    url = data['post_city']
    try:
        html = cm.post_data(url, {'country': data['country_id']})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    city_list = []
    for item in pq(html)('#cities option[value!="0"]'):
        d = data.copy()
        city_e = cm.html2plain(item.text).strip().upper()
        ret = gs.look_up(city_e, 3)
        if ret is not None:
            city_e = ret['name_e']
        d['city_e'] = city_e
        city_list.append(d)

    return city_list
Example #44
0
def fetch_store_list(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store lists: %s' % url, log_name)
        return []

    start = body.find(ur"<div class='store-country'>")
    if start == -1:
        cm.dump('Error in fetching store lists: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    start_stores = body.find(ur'<h3><a href="/store-locator/index">Stores</a></h3>')
    start_outlets = body.find(ur"<h3 class='outlets'>")
    store_sub = body[start_stores:start_outlets]
    outlet_sub = body[start_outlets:]

    results = []
    for m1 in re.finditer(ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="country">([^<>]+)</a>', store_sub):
        country_id = string.atoi(m1.group(1))
        country = m1.group(2).strip()
        sub1 = cm.extract_closure(store_sub[m1.end():], ur'<ul>', ur'</ul>')[0]
        for m2 in re.finditer(ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="city">([^<>]+)</a>', sub1):
            city_id = string.atoi(m2.group(1))
            city = m2.group(2).strip()
            sub2 = cm.extract_closure(sub1[m2.end():], ur'<ul>', ur'</ul>')[0]
            for m3 in re.finditer(ur'<a href="([^"]+)"[^<>]*data-id="([^"]+)"[^<>]*data-type="store">([^<>]+)</a>',
                                  sub2):
                d = data.copy()
                d['country_id'] = country_id
                d['country'] = country
                d['city_id'] = city_id
                d['city'] = city
                d['url'] = m3.group(1).strip()
                d['store_id'] = string.atoi(m3.group(2))
                d['store'] = cm.html2plain(m3.group(3).strip())
                # d['store_type'] = 'store'
                results.append(d)
Example #45
0
def fetch_cities(data):
    url = data['data_url']
    param = {
        'countries': data['country_code'],
        'form_build_id': data['form_build_id'],
        'form_id': 'cartierfo_generic_store_locator_search_form',
        '_triggering_element_name': 'countries'
    }
    try:
        body, cookie = cm.post_data_cookie(url, param, cookie=data['cookie'])
    except Exception:
        cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name)
        return []

    if cookie is not None:
        data['cookie'] = cookie

    raw = json.loads(body)
    body = None
    for item in raw:
        if 'data' in item and item['data'] != '':
            body = item['data']
            break
    if body is None:
        cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name)
        return []

    # body = body.decode('unicode_escape')
    start = body.find(ur'<select id="edit-cities"')
    if start == -1:
        cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<select\b', ur'</select>')[0]

    results = []
    for m in re.findall(ur'<option.+?value="([^"]+?)".*?>.+?</option>', body):
        d = data.copy()
        d['city'] = cm.html2plain(m)
        results.append(d)
        print 'Country: %s, City: %s' % (data['country'], d['city'])
Example #46
0
    def parse(self, response):
        """Walk the level-0 main-menu items and request each category page.

        For every matching ``<li>`` the last word of its class attribute
        becomes the tag name and the texts of the link's descendants become
        the tag title.  A deep copy of the incoming 'userdata' metadata is
        tagged with them (plus a gender, when cm.guess_gender() finds one)
        and passed on with the request.

        Yields:
            One Request per menu item that has a non-blank href, with
            ``self.parse_category_0`` as callback.
        """
        base_meta = response.meta['userdata']
        menu_items = Selector(response).xpath(
            '//div[contains(@class,"main-menu")]//li[contains(@class,"level0")]')

        for item in menu_items:
            last_class = re.search(r'\b(\w+)\s*$', item._root.attrib['class'])
            if not last_class:
                continue

            tag_name = unicodify(last_class.group(1)).lower()
            links = item.xpath('./a[@href]')
            if not links:
                continue

            anchor = links[0]._root
            href = anchor.attrib['href']
            tag_text = u', '.join(
                cm.html2plain(unicodify(child.text))
                for child in anchor.iterdescendants()
                if child.text and child.text.strip())

            # Each request gets its own copy so siblings do not share tags.
            meta = copy.deepcopy(base_meta)
            meta['tags_mapping']['category-0'] = [{'name': tag_name, 'title': tag_text}]
            gender = cm.guess_gender(tag_name)
            if gender:
                meta['gender'] = [gender]

            if href and href.strip():
                yield Request(url=href,
                              meta={'userdata': meta},
                              callback=self.parse_category_0)
Example #47
0
def fetch_states(data):
    print '(%s/%d) Found country: %s' % (data['brandname_e'], data['brand_id'], data['country_e'])
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # 处理重定向
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)

    m = re.search('<span class="state">Choose a state/provence</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    state_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        province_e = cm.html2plain(m[1]).strip().upper()
        if data['country_e'] == 'CHINA':
            # 去掉省中间的空格
            province_e = province_e.replace(' ', '')
        ret = gs.look_up(province_e, 2)
        if ret is not None:
            province_e=ret['name_e']
        d = data.copy()
        d['province_e'] = province_e
        d['url'] = data['host'] + m[0]
        state_list.append(d)

    return state_list
Example #48
0
def fetch_countries(data):
    url = data['home_url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    country_list = []
    for item in pq(html)('#country option[value!="reset"]'):
        d = data.copy()
        d['country_id'] = string.atoi(item.attrib['value'])

        country_e = cm.html2plain(item.text).strip().upper()
        ret = gs.look_up(country_e, 1)
        if ret is not None:
            country_e = ret['name_e']
        d['country_e'] = country_e
        country_list.append(d)
    return country_list
Example #49
0
def fetch_store_details(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    start = body.find(ur'<div class="col first" itemprop="address"')
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

    addr = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
    m = re.search(ur'<span itemprop="postalCode">([^<>]+)</span>', addr, re.S)
    if m is not None:
        entry[cm.zip_code] = m.group(1).strip()
    entry[cm.addr_e] = cm.reformat_addr(addr)

    start = body.find(ur'<div class="col" itemprop="contactPoints"')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
        m = re.search(ur'<span itemprop="telephone">([^<>]+)</span>', sub, re.S)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'<span itemprop="faxNumber">([^<>]+)</span>', sub, re.S)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()

    start = body.find(ur'<h2>opening hours</h2>')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<table\b', ur'</table>')[0]
        tmp = []
        for m in re.findall(ur'<td>(.+?)</td>', sub):
            tmp.append(cm.html2plain(m).strip())
        entry[cm.hours] = ' '.join(tmp)
Example #50
0
def proc_store(sub, data):
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.country_e] = data['country']
    m1 = re.search(ur'<strong class="name" itemprop="name">([^<>]+)</strong>', sub)
    if m1 is not None:
        entry[cm.store_class] = m1.group(1).strip()

    m1 = re.search(ur'<span itemprop="address"', sub)
    if m1 is not None:
        addr_sub = cm.extract_closure(sub[m1.start():], ur'<span\b', ur'</span>')[0]
        m2 = re.search(ur'<span itemprop="postal-code">([^<>]+)</span>', addr_sub, re.S)
        if m2 is not None:
            entry[cm.zip_code] = m2.group(1).strip()
        m2 = re.search(ur'<span itemprop="locality">([^<>]+)</span>', addr_sub, re.S)
        if m2 is not None:
            entry[cm.city_e] = cm.html2plain(m2.group(1)).strip().upper()
        entry[cm.addr_e] = cm.reformat_addr(addr_sub)

    m2 = re.search(ur'<span itemprop="tel">([^<>]+)</span>', sub, re.S)
    if m2 is not None:
        entry[cm.tel] = m2.group(1).strip()

    m2 = re.search(ur'Fax\b(.+?)</p>', sub)
    if m2 is not None:
        entry[cm.fax] = cm.extract_tel(m2.group(1))

    m2 = re.search(ur'<a href="([^"]+)"[^<>]+itemprop="url"\s*>\s*Find on a map\s*</a>', sub)
    if m2 is not None:
        geo_url = data['host'] + urllib.quote(m2.group(1).encode('utf-8'))
        param = {'brepairs': True, 'restrictedtemplate': 2, 'bretailers': True, 'bshops': True, 'brepairs': True}
        try:
            geo_body = cm.get_data(geo_url, param)
            m3 = re.search(ur'maps\.google\.com/maps\?daddr\s*=\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', geo_body)
            if m3 is not None:
                entry[cm.lat] = string.atof(m3.group(1))
                entry[cm.lng] = string.atof(m3.group(2))
        except Exception, e:
            cm.dump('Error in fetching geo info: %s, %s' % (geo_url, param), log_name)
Example #51
0
def fetch_cities(data):
    url = data['data_url']
    param = {'countries': data['country_code'],
             'form_build_id': data['form_build_id'],
             'form_id': 'cartierfo_generic_store_locator_search_form',
             '_triggering_element_name': 'countries'}
    try:
        body, cookie = cm.post_data_cookie(url, param, cookie=data['cookie'])
    except Exception:
        cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name)
        return []

    if cookie is not None:
        data['cookie'] = cookie

    raw = json.loads(body)
    body = None
    for item in raw:
        if 'data' in item and item['data'] != '':
            body = item['data']
            break
    if body is None:
        cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name)
        return []

    # body = body.decode('unicode_escape')
    start = body.find(ur'<select id="edit-cities"')
    if start == -1:
        cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<select\b', ur'</select>')[0]

    results = []
    for m in re.findall(ur'<option.+?value="([^"]+?)".*?>.+?</option>', body):
        d = data.copy()
        d['city'] = cm.html2plain(m)
        results.append(d)
        print 'Country: %s, City: %s' % (data['country'], d['city'])
Example #52
0
def fetch_countries(data):
    """Download the country list (en_GB) and return one task dict per country.

    Args:
        data: task dict; 'url' is read here.

    Returns:
        A list of copies of ``data`` with 'country_code', 'country' (the
        upper-cased en_GB label) and 'country_id' added.  ``None`` items in
        the JSON list are skipped.  An empty list is returned when the
        request raises.
    """
    url = data['url']
    query = {'lang': 'en_GB'}
    try:
        payload = cm.get_data(url, query)
    except Exception:
        cm.dump('Error in fetching countries: %s, %s' % (url, query), log_name)
        return []

    countries = []
    for record in json.loads(payload):
        if record is not None:
            task = data.copy()
            task['country_code'] = record['isocode']
            label = record['Translation']['en_GB']['label']
            task['country'] = cm.html2plain(label).strip().upper()
            task['country_id'] = record['id']
            countries.append(task)
    return countries


# Number of fetch_stores() calls so far; logged as the running city number.
tot_processed = 0


def fetch_stores(data):
    """Log the running city counter before fetching a city's stores.

    NOTE(review): only the beginning of this function is visible here; the
    code that fills ``store_list`` is not part of this excerpt.
    """
    global tot_processed
    tot_processed += 1
    cm.dump('Processint city #%d' % tot_processed, log_name)

    store_list = []
Example #53
0
        body = cm.get_data(url, param)
    except Exception, e:
        cm.dump("Error in fetching stores: %s, %s" % (url, param), log_name)
        return ()

    tree = et.fromstring(body.encode("utf-8"))
    store_list = []
    for store in tree.iter("poi"):
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])
        val = store.getiterator("uid")[0].text
        if val in store_map:
            continue
        store_map[val] = entry

        val = store.getiterator("name")[0].text
        entry[cm.name_e] = cm.html2plain(val).strip() if val else ""

        addr_list = []
        for idx in xrange(1, 3):
            val = store.getiterator("address%d" % idx)[0].text
            if val:
                val = cm.reformat_addr(val)
                if val != "":
                    addr_list.append(val)
        entry[cm.addr_e] = ", ".join(addr_list)

        val = store.getiterator("city")[0].text
        entry[cm.city_e] = cm.extract_city(val)[0] if val else ""
        val = store.getiterator("province")[0].text
        entry[cm.province_e] = cm.html2plain(val).strip().upper() if val else ""
        if entry[cm.province_e] == "":
Example #54
0
    try:
        body = cm.get_data(url, {'display_country': 'CN'})
    except Exception, e:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return ()

    m = re.search(ur'<select class="country-selector[^<>]+>(.+?)</select>', body, re.S)
    if m is None:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return ()
    sub = m.group(1)
    results = []
    for m in re.findall(ur'<option value="([A-Z]{2})"[^<>]+>([^<>]+)</option>', sub):
        d = data.copy()
        d['country_code'] = m[0]
        d['country'] = cm.html2plain(m[1]).strip().upper()
        # if m[0] == 'US':
        results.append(d)
    return tuple(results)


def fetch_store_detail(data):
    url = data['data_url']
    param = {'format': 'JSON', 'location_id': data['store_id'], 'type': 'location'}
    try:
        s = json.loads(cm.get_data(url, param))['locations'][0]
    except Exception, e:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return ()

    store_id = data['store_id']