Example #1
0
def fetch_stores(data):
    """Extract the stores embedded as map markers in the page at ``data['url']``.

    Each ``markerContent`` / ``createMarker(... LatLng(lat, lng))`` pair found
    in the page yields one entry, which is written to the ``stores`` table.

    :param data: dict with the keys ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: list of the entries that were inserted; ``[]`` when the page
        could not be fetched.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.findall(ur'var markerContent\s*?=\s*?"(.+?)".+?'
                        ur'createMarker\(.+?new google.maps.LatLng\((-?\d+\.\d+),(-?\d+\.\d+)\)', html, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        # Groups 2 and 3 are the two numeric arguments of LatLng(...).
        cm.update_entry(entry, {cm.lat: float(m[1]), cm.lng: float(m[2])})

        sub = m[0].strip()
        # The store name is the bold part of the marker text; a marker
        # without one is skipped.
        m1 = re.search(ur'<b>(.+?)</b>', sub)
        if m1 is None:
            continue
        entry[cm.name_c] = m1.group(1)
        sub = sub.replace(m1.group(0), '')
        # Telephone label followed by the number, up to the next tag.
        m1 = re.search(ur'聯系電話(?::|:)(.+?)<', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1)
            # Put back the '<' consumed by the match so the next tag stays intact.
            sub = sub.replace(m1.group(0), '<')
        # What is left once images are removed is stored as the address.
        sub = re.sub(ur'<img\b.*?/>', '', sub)
        entry[cm.addr_c] = cm.reformat_addr(sub)

        print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_c], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # The function used to fall off the end here and return None even though
    # the error path returns a list; hand the collected entries back.
    return store_list
Example #2
0
def fetch_cn(data):
    """Read the nested ``arrData`` array from the page and store every shop.

    The array is walked as province -> city -> shop, where each shop is a
    ``name;address`` string.  Every usable shop is inserted into ``stores``.

    :param data: dict with ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: list of inserted entries; ``[]`` when the page cannot be fetched
        or contains no ``arrData`` array.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    array_at = html.find('arrData = [')
    if array_at == -1:
        return []
    payload, _, _ = cm.extract_closure(html[array_at:], ur'\[', ur'\]')
    provinces = json.loads(payload)

    store_list = []
    for province_row in provinces:
        # Province level
        province = province_row[0].strip()
        for city_row in province_row[1]:
            # City level
            city = city_row[0].strip()
            for shop in city_row[1]:
                # Shop level
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                fields = shop.split(';')
                if len(fields) < 2:
                    # Need at least a name and an address.
                    continue
                entry['name_c'] = fields[0].strip()
                entry['addr_e'] = fields[1].strip()
                geo = {cm.city_c: city,
                       cm.province_c: province,
                       cm.country_c: u'中国',
                       cm.country_e: u'CHINA',
                       cm.continent_c: u'亚洲',
                       cm.continent_e: u'ASIA'}
                cm.update_entry(entry, geo)

                print '(%s/%d) Found store: %s, %s (%s, %s, %s)' % (
                    data['brandname_e'], data['brand_id'],
                    entry[cm.name_c], entry[cm.addr_e],
                    entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e])
                store_list.append(entry)
                db.insert_record(entry, 'stores')

    return store_list
Example #3
0
def get_store_details(data):
    """Build and store one entry from a store detail page.

    :param data: dict with ``url``, ``name``, ``country`` and ``city``.
    :return: the inserted entry; ``[]`` when the page cannot be fetched or the
        breadcrumb block is missing.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id})
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    entry[cm.name_e] = data['name']
    entry[cm.url] = data['url']

    crumbs_at = html.find(ur'<div class="storelocator-breadcrumbs">')
    if crumbs_at == -1:
        return []
    crumbs, _, crumbs_end = cm.extract_closure(html[crumbs_at:], ur'<ul>', ur'</ul>')
    if crumbs_end == 0:
        return []

    # The address is taken from the last <li>...</li> of the breadcrumb list.
    crumb_items = re.findall(ur'<li>(.+?)</li>', crumbs, re.S)
    if crumb_items:
        entry[cm.addr_e] = cm.reformat_addr(crumb_items[-1])

    # Latitude / longitude
    coords = re.search(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html)
    if coords is not None:
        cm.update_entry(entry, {cm.lat: string.atof(coords.group(1)), cm.lng: string.atof(coords.group(2))})

    contact = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S)
    if contact is not None:
        contact_html = contact.group(1)
        phone_pat = re.compile(ur'<p class="phone">(.+?)</p>')
        phone = phone_pat.search(contact_html)
        if phone:
            entry[cm.tel] = cm.extract_tel(phone.group(1))
            # Drop the phone paragraph so only the hours text remains.
            contact_html = phone_pat.sub('', contact_html)
        hours_parts = [part.strip() for part in cm.reformat_addr(contact_html).split(',')]
        if 'opening hours' in hours_parts[0].lower():
            # Leading caption, not an actual time range.
            hours_parts = hours_parts[1:]
        entry[cm.hours] = ', '.join(hours_parts)

    # Geo
    cm.update_entry(entry, {cm.country_e: data['country'], cm.city_e: data['city']})
    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    # Only fill in province / city when they are still blank.
    if sensed[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = sensed[1]
    if sensed[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = sensed[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id,
        entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return entry
Example #4
0
def fetch(level=1, data=None, user='******', passwd=''):
    """Replace this brand's rows in ``stores`` with the list served as JSON.

    The existing rows for ``brand_id`` are deleted, the JSON document at the
    module-level ``url`` is fetched, and one entry per item of
    ``js['data']['list']`` is inserted.

    :param level: not used by this function.
    :param data: not used by this function.
    :param user: user name passed to ``db.connect_db``.
    :param passwd: password passed to ``db.connect_db``.
    :return: list of inserted entries; ``[]`` when the document cannot be
        fetched.
    """
    db = cm.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    # try/finally: the connection used to stay open on the early ``return []``
    # and whenever parsing raised.
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

        try:
            html = cm.get_data(url)
        except Exception:
            print 'Error occured: %s' % url
            dump_data = {
                'level': 1,
                'time': cm.format_time(),
                'data': {
                    'url': url
                },
                'brand_id': brand_id
            }
            cm.dump(dump_data)
            return []

        js = json.loads(html)
        store_list = []
        for s in js['data']['list']:
            entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
            cm.update_entry(
                entry, {
                    cm.lat: float(s['geo']['lat']),
                    cm.lng: float(s['geo']['lng'])
                })
            contact = s['contact']
            entry[cm.name_e] = contact['title']
            entry[cm.addr_e] = cm.reformat_addr(contact['address'])
            entry[cm.tel] = contact['phone']
            entry[cm.fax] = contact['fax']
            entry[cm.hours] = cm.reformat_addr(contact['hours'])
            entry[cm.store_type] = contact['selling']
            # ``s['link']`` is appended to the module-level ``host``.
            entry[cm.url] = host + s['link']

            gs.update_city_map(s['city'], s['country'], s['continent'])
            cm.update_entry(
                entry, {
                    cm.continent_e: s['continent'],
                    cm.country_e: s['country'],
                    cm.city_e: s['city']
                })
            gs.field_sense(entry)

            print '(%s / %d) Found store: %s, %s (%s, %s)' % (
                brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e],
                entry[cm.country_e], entry[cm.continent_e])
            db.insert_record(entry, 'stores')
            store_list.append(entry)
    finally:
        db.disconnect_db()

    gs.commit_maps(1)
    gs.commit_maps(3)
    return store_list
Example #5
0
    def f(m):
        store_name = m[0].strip()
        addr_str = m[1].strip()

        spl = addr_str.split('<br/>')
        store_type = cm.html2plain(spl[0].strip())

        store_addr = spl[1].strip()
        hour_idx = 2
        store_tel = ''
        for i in xrange(2, len(spl)):
            # If this is not a phone number:
            tel = cm.extract_tel(spl[i])
            if tel == '':
                store_addr += ', ' + spl[i]
                hour_idx = i + 1
            else:
                store_tel = spl[i].strip()
                hour_idx = i + 1
                break

        if hour_idx < len(spl):
            store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip()
        else:
            store_hour = ''

        # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]]))
        store_addr = cm.reformat_addr(store_addr)

        store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        cm.update_entry(store_entry,
                        {cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(),
                         cm.country_e: opt[cm.country_e].strip().upper(),
                         cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour,
                         cm.tel: store_tel})
        if opt.has_key(cm.province_e):
            store_entry[cm.province_e] = opt[cm.province_e]
        else:
            store_entry[cm.province_e] = ''
        store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0]

        gs.field_sense(store_entry)
        ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e])
        if ret[1] is not None and store_entry[cm.province_e] == '':
            store_entry[cm.province_e] = ret[1]
        if ret[2] is not None and store_entry[cm.city_e] == '':
            store_entry[cm.city_e] = ret[2]
        gs.field_sense(store_entry)

        print '%s Found store: %s, %s (%s, %s)' % (
            brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e],
            store_entry[cm.continent_e])
        db.insert_record(store_entry, 'stores')

        return store_entry
Example #6
0
def fetch_stores(data):
    """Collect the details of every store in the page's ``store-list``.

    :param data: dict with ``url`` plus the ``cm.continent_e``,
        ``cm.country_e`` and ``cm.city_e`` values applied to each store.
    :return: list of inserted entries; ``[]`` when the page cannot be fetched
        or has no ``<ul class="store-list">``.
    """
    # Bind the URL locally: the error handler below used to read a ``url``
    # name that this function never defined.
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    entries = []
    start = html.find(u'<ul class="store-list">')
    if start == -1:
        return entries
    start += len(u'<ul class="store-list">')
    end = html.find(u'</ul>', start)
    if end == -1:
        # No closing tag: keep the rest of the page rather than slicing with
        # -1, which would cut off the final character.
        end = len(html)
    html = html[start:end]

    for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S):
        store = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        # The <li> class attribute is stored as the store type.
        store[cm.store_type] = m1[0]
        sub_html = m1[1]
        m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html)
        if m2:
            store[cm.name_e] = cm.reformat_addr(m2[0])
        m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S)
        if m2:
            store[cm.addr_e] = cm.reformat_addr(m2[0])

        cm.update_entry(store, {cm.continent_e: data[cm.continent_e].strip().upper(),
                                cm.country_e: data[cm.country_e].strip().upper(),
                                cm.city_e: data[cm.city_e].strip().upper()})

        gs.field_sense(store)
        ret = gs.addr_sense(store[cm.addr_e])
        # Only fill in geo fields that are still blank.
        if ret[0] is not None and store[cm.country_e] == '':
            store[cm.country_e] = ret[0]
        if ret[1] is not None and store[cm.province_e] == '':
            store[cm.province_e] = ret[1]
        if ret[2] is not None and store[cm.city_e] == '':
            store[cm.city_e] = ret[2]
        gs.field_sense(store)
        store[cm.city_e] = cm.extract_city(store[cm.city_e])[0]

        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, store[cm.name_e], store[cm.addr_e], store[cm.country_e],
            store[cm.continent_e])
        db.insert_record(store, 'stores')
        entries.append(store)

    # The collected list was never returned, so callers received None.
    return entries
Example #7
0
def fetch_details(data):
    """Parse one store detail page into a single stored entry.

    :param data: dict carrying the page address under ``cm.url`` together with
        ``brand_id``, ``brandname_e``, ``brandname_c``, ``cm.name_e``,
        ``cm.country_e`` and ``cm.continent_e``.
    :return: ``[entry]`` on success; ``[]`` when the page cannot be fetched or
        has no ``field-address`` block.
    """
    url = data[cm.url]
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.name_e] = data[cm.name_e]

    addr_at = html.find(ur'<div class="field-address">')
    if addr_at == -1:
        return []
    addr_html, _, addr_end = cm.extract_closure(html[addr_at:], ur'<div\b', ur'</div>')
    if addr_end == 0:
        return []

    locality = re.search(ur'<div  class="locality">(.+?)</div>', addr_html)
    if locality is not None:
        entry[cm.city_e] = cm.extract_city(locality.group(1))[0]
    postal = re.search(ur'<div  class="postal-code">(.+?)</div>', addr_html)
    if postal is not None:
        entry[cm.zip_code] = postal.group(1).strip()
    entry[cm.country_e] = data[cm.country_e]
    # Every tag in the address block becomes a line break before reformatting.
    tag_pat = re.compile(ur'<[^<>]+?>', re.S)
    entry[cm.addr_e] = cm.reformat_addr(tag_pat.sub(u'\r\n', addr_html))

    phone = re.search(ur'<div class="field-telephone"><a href=".+?" class="tel">(.+?)</a></div>', html)
    if phone is not None:
        entry[cm.tel] = phone.group(1).strip()

    hours = re.search(ur'<div class="field-opening-hours">\s*<p>(.+?)</p>\s*</div>', html, re.S)
    if hours is not None:
        entry[cm.hours] = cm.reformat_addr(hours.group(1))

    coords = re.search(ur'"coordinates":\[(-?\d+\.\d{4,})\s*,\s*(-?\d+\.\d{4,})\]', html)
    if coords is not None:
        # First number is stored as latitude, second as longitude.
        cm.update_entry(entry, {cm.lat: string.atof(coords.group(1)),
                                cm.lng: string.atof(coords.group(2))})

    entry[cm.continent_e] = data[cm.continent_e]
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        data['brandname_e'], data['brand_id'],
        entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
Example #8
0
def get_store_details(data):
    """POST for one store's JSON record and store the resulting entry.

    :param data: dict with ``url``, ``country_id``, ``city_id``, ``store_id``,
        ``country`` and ``city``.
    :return: the inserted entry; ``[]`` when the request fails.
    """
    url = data['url']
    try:
        form = {'country': data['country_id'], 'city': data['city_id'], 'recordid': data['store_id']}
        html = cm.post_data(url, form)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id})
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    info = json.loads(html)['elements']

    # Strip backslashes and turn paragraph tags into comma separators.
    raw_addr = info['address']
    for old, new in (('\\', ''), ('<p>', ','), ('</p>', ',')):
        raw_addr = raw_addr.replace(old, new)
    addr = cm.reformat_addr(raw_addr)
    # The first comma-separated part is used as the store name.
    addr_parts = addr.split(',')
    if addr_parts:
        entry[cm.name_e] = cm.reformat_addr(addr_parts[0])
    entry[cm.addr_e] = addr

    # Coordinates are the first "number,number" pair in the map link.
    coords = re.search(ur'(-?\d+\.\d+),(-?\d+\.\d+)', info['gmap'])
    if coords is not None:
        cm.update_entry(entry, {cm.lat: string.atof(coords.group(1)),
                                cm.lng: string.atof(coords.group(2))})

    entry[cm.url] = info['shareurl'].replace('\\', '')
    entry[cm.hours] = info['openingtimes']
    entry[cm.comments] = info['other']

    # Geo
    cm.update_entry(entry, {cm.country_e: data['country'], cm.city_e: data['city']})
    entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    # Only fill in province / city when they are still blank.
    if sensed[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = sensed[1]
    if sensed[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = sensed[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id,
        entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])

    db.insert_record(entry, 'stores')
    return entry
Example #9
0
def fetch_stores(data):
    """Extract the stores embedded as map markers in the page at ``data['url']``.

    Each ``markerContent`` / ``createMarker(... LatLng(lat, lng))`` pair found
    in the page yields one entry, which is written to the ``stores`` table.

    :param data: dict with the keys ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: list of the entries that were inserted; ``[]`` when the page
        could not be fetched.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.findall(
            ur'var markerContent\s*?=\s*?"(.+?)".+?'
            ur'createMarker\(.+?new google.maps.LatLng\((-?\d+\.\d+),(-?\d+\.\d+)\)',
            html, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        # Groups 2 and 3 are the two numeric arguments of LatLng(...).
        cm.update_entry(entry, {cm.lat: float(m[1]), cm.lng: float(m[2])})

        sub = m[0].strip()
        # The store name is the bold part of the marker text; a marker
        # without one is skipped.
        m1 = re.search(ur'<b>(.+?)</b>', sub)
        if m1 is None:
            continue
        entry[cm.name_c] = m1.group(1)
        sub = sub.replace(m1.group(0), '')
        # Telephone label followed by the number, up to the next tag.
        m1 = re.search(ur'聯系電話(?::|:)(.+?)<', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1)
            # Put back the '<' consumed by the match so the next tag stays intact.
            sub = sub.replace(m1.group(0), '<')
        # What is left once images are removed is stored as the address.
        sub = re.sub(ur'<img\b.*?/>', '', sub)
        entry[cm.addr_c] = cm.reformat_addr(sub)

        print '(%s/%d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_c],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # The function used to fall off the end here and return None even though
    # the error path returns a list; hand the collected entries back.
    return store_list
Example #10
0
def fetch(level=1, data=None, user='******', passwd=''):
    """Replace this brand's rows in ``stores`` with the list served as JSON.

    The existing rows for ``brand_id`` are deleted, the JSON document at the
    module-level ``url`` is fetched, and one entry per item of
    ``js['data']['list']`` is inserted.

    :param level: not used by this function.
    :param data: not used by this function.
    :param user: user name passed to ``db.connect_db``.
    :param passwd: password passed to ``db.connect_db``.
    :return: list of inserted entries; ``[]`` when the document cannot be
        fetched.
    """
    db = cm.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    # try/finally: the connection used to stay open on the early ``return []``
    # and whenever parsing raised.
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

        try:
            html = cm.get_data(url)
        except Exception:
            print 'Error occured: %s' % url
            dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
            cm.dump(dump_data)
            return []

        js = json.loads(html)
        store_list = []
        for s in js['data']['list']:
            entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
            cm.update_entry(entry, {cm.lat: float(s['geo']['lat']),
                                    cm.lng: float(s['geo']['lng'])})
            contact = s['contact']
            entry[cm.name_e] = contact['title']
            entry[cm.addr_e] = cm.reformat_addr(contact['address'])
            entry[cm.tel] = contact['phone']
            entry[cm.fax] = contact['fax']
            entry[cm.hours] = cm.reformat_addr(contact['hours'])
            entry[cm.store_type] = contact['selling']
            # ``s['link']`` is appended to the module-level ``host``.
            entry[cm.url] = host + s['link']

            gs.update_city_map(s['city'], s['country'], s['continent'])
            cm.update_entry(entry, {cm.continent_e: s['continent'], cm.country_e: s['country'],
                                    cm.city_e: s['city']})
            gs.field_sense(entry)

            print '(%s / %d) Found store: %s, %s (%s, %s)' % (
                brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                entry[cm.continent_e])
            db.insert_record(entry, 'stores')
            store_list.append(entry)
    finally:
        db.disconnect_db()

    gs.commit_maps(1)
    gs.commit_maps(3)
    return store_list
Example #11
0
def fetch_cn(data):
    """Read the nested ``arrData`` array from the page and store every shop.

    The array is walked as province -> city -> shop, where each shop is a
    ``name;address`` string.  Every usable shop is inserted into ``stores``.

    :param data: dict with ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: list of inserted entries; ``[]`` when the page cannot be fetched
        or contains no ``arrData`` array.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        failure = {
            'level': 0,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id'],
        }
        cm.dump(failure)
        return []

    array_at = html.find('arrData = [')
    if array_at == -1:
        return []
    payload, _, _ = cm.extract_closure(html[array_at:], ur'\[', ur'\]')
    provinces = json.loads(payload)

    store_list = []
    for province_row in provinces:
        # Province level
        province = province_row[0].strip()
        for city_row in province_row[1]:
            # City level
            city = city_row[0].strip()
            for shop in city_row[1]:
                # Shop level
                entry = cm.init_store_entry(
                    data['brand_id'], data['brandname_e'], data['brandname_c'])
                fields = shop.split(';')
                if len(fields) < 2:
                    # Need at least a name and an address.
                    continue
                entry['name_c'] = fields[0].strip()
                entry['addr_e'] = fields[1].strip()
                cm.update_entry(entry, {
                    cm.city_c: city,
                    cm.province_c: province,
                    cm.country_c: u'中国',
                    cm.country_e: u'CHINA',
                    cm.continent_c: u'亚洲',
                    cm.continent_e: u'ASIA',
                })

                print '(%s/%d) Found store: %s, %s (%s, %s, %s)' % (
                    data['brandname_e'], data['brand_id'],
                    entry[cm.name_c], entry[cm.addr_e],
                    entry[cm.city_e], entry[cm.country_e], entry[cm.continent_e])
                store_list.append(entry)
                db.insert_record(entry, 'stores')

    return store_list
Example #12
0
def get_stores(url, data):
    """
    从json对象中获得商店信息
    """
    opener = urllib2.build_opener()
    opener.addheaders = [
        ("User-Agent",
         "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)"
         "Chrome/27.0.1453.94 Safari/537.36"), ('Accept', '*/*'),
        ('X-Requested-With', 'XMLHttpRequest'), ('Connection', 'keep-alive')
    ]
    response = opener.open(url)
    html = response.read().encode('utf-8')
    jsonobj = json.loads(html)
    stores = jsonobj[u'Stores'][u'Items']
    region_list = jsonobj['Regions']
    region_id = jsonobj['Region']
    region = ''
    if len(region_list) > 0 and region_id != 0:
        for val in region_list:
            if val['RegionId'] == region_id:
                region = val['Name']
                break

    country = jsonobj['CurrentCountry']['Name']
    store_list = []
    for s in stores:
        # print('Found store: %s, %s. Tel: %s, lat=%s, lng=%s' % (
        #     s['Name'], s['Address'], s['Phone'], s['Latitude'], s['Longitude']))

        store_type = ['']
        # Some stores may have varioius store types
        if len(s['StoreTypes']) > 0:
            store_type = list(val['Name'] for val in s['StoreTypes'])
        if s['Url'] is not None:
            url = s['Url']
        else:
            url = ''
        if s['ZipCode'] is not None and not s['ZipCode'].__eq__(''):
            zip = s['ZipCode']
        else:
            zip = ''
        local_addr = s['Address']
        if local_addr[-1] == '.':
            local_addr = local_addr[:-1]
        if not zip.__eq__(''):
            addr = u'%s, %s, %s' % (local_addr, s['City'], zip)
        else:
            addr = u'%s, %s' % (local_addr, s['City'])
        if region.__eq__(''):
            addr = u'%s, %s' % (addr, country)
        else:
            addr = u'%s, %s, %s' % (addr, region, country)

        for t in store_type:
            entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
            cm.update_entry(
                entry, {
                    'addr_e': addr,
                    'country_e': country,
                    'city_e': s['City'],
                    'comments': s['Comments'],
                    'province_e': region.strip().upper(),
                    'zip': zip,
                    'email': s['Email'],
                    'fax': s['Fax'],
                    'lat': s['Latitude'],
                    'lng': s['Longitude'],
                    'name_e': s['Name'],
                    'tel': s['Phone'],
                    'store_type': t,
                    'url': url
                })
            gs.field_sense(entry)

            print '%s Found store: %s, %s (%s, %s)' % (
                brandname_e, entry[cm.name_e], entry[cm.addr_e],
                entry[cm.country_e], entry[cm.continent_e])
            db.insert_record(entry, 'stores')
            store_list.append(entry)

    return store_list
Example #13
0
def get_stores(data):
    url = data['url']
    print 'Trying to get stores for %s' % data['name']
    try:
        html = common.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 1,
            'time': common.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        }
        common.dump(dump_data)
        return []

    start = 0
    store_list = []
    while True:
        start = html.find('<li class="info-store clearfix">', start)
        if start == -1:
            break
        end = html.find('<li class="info-store clearfix">', start + 1)
        sub_html = html[start:end]
        start = end

        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        for m in re.findall(r'<h1><a href="(.*?)">(.*?)</a>', sub_html):
            entry[common.url] = host + m[0]
            entry[common.name_e] = common.html2plain(m[1].strip())
            break

        for m in re.findall(
                r'<span style="display:none" class="ll">\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)\s*</span>',
                sub_html):
            common.update_entry(entry, {
                common.lat: string.atof(m[0]),
                common.lng: string.atof(m[1])
            })
            break

        for m in re.findall(r'<span class="map-address">(.*?)</span>',
                            sub_html):
            entry[common.addr_e] = common.reformat_addr(m)
            break

        for m in re.findall(r'<span class="type">phone:</span>(.*?)<br />',
                            sub_html):
            entry[common.tel] = m.strip()
            break

        for m in re.findall(r'<a class="email" href="mailto:(.*?@.*?)">',
                            sub_html):
            entry[common.email] = m.strip()
            break

        opening_s = sub_html.find('<ul class="opening-hours')
        if opening_s != -1:
            opening_e = sub_html.find('</ul>', opening_s)
            o_str = sub_html[opening_s:opening_e]
            entry[common.hours] = ', '.join(
                [m for m in re.findall(r'<li>(.+?)</li>', o_str)])

        brand_s = sub_html.find('<ul class="brands clearfix">')
        if brand_s != -1:
            brand_e = sub_html.find('</ul>', brand_s)
            b_str = sub_html[brand_s:brand_e]
            entry[common.store_type] = ', '.join([
                common.html2plain(m)
                for m in re.findall(r'<li><a href=".*?">(.+?)</a></li>', b_str)
            ])

        # Geo
        if 'state' in data:
            entry[common.province_e] = data['state']
        country_e = data['name'].strip().upper()
        entry[common.country_e] = country_e
        gs.field_sense(entry)

        print '%s Found store: %s, %s (%s, %s)' % (
            brandname_e, entry[common.name_e], entry[common.addr_e],
            entry[common.country_e], entry[common.continent_e])
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return store_list
Example #14
0
def fetch(level=1, data=None, user='******', passwd=''):
    """Rebuild this brand's rows in ``stores`` from the contact page.

    The page is cut into blocks at every ``contactboldtitle`` paragraph; each
    block becomes one entry (name, address, country/city/zip, tel, email).

    :param level: not used by this function.
    :param data: optional dict whose ``url`` key is the page to fetch; when
        ``None`` the module-level ``url`` is used.
    :param user: user name passed to ``db.connect_db``.
    :param passwd: password passed to ``db.connect_db``.
    :return: list of inserted entries; ``[]`` when the page cannot be fetched.
    """
    try:
        if data is None:
            data = {'url': url}
        html = common.get_data(data['url'])
    except Exception:
        print 'Error occured in getting data: %s' % url
        dump_data = {
            'level': 1,
            'time': common.format_time(),
            'data': {
                'data': url
            },
            'brand_id': brand_id
        }
        common.dump(dump_data)
        return []

    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    store_list = []
    # try/finally so the connection is released even when parsing raises.
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

        # Strip HTML comments before looking for store blocks.
        sub_pat = re.compile(ur'<!--.*?-->', re.S)
        html = re.sub(sub_pat, '', html)
        split_pos = [
            m.start()
            for m in re.finditer(ur'<p><span class="contactboldtitle">', html)
        ]
        # End the last block at the real end of the page; the former -1
        # sentinel silently dropped the final character.
        split_pos.append(len(html))

        for block_start, block_end in zip(split_pos, split_pos[1:]):
            sub_html = html[block_start:block_end]
            entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
            m = re.findall(ur'<span class="contactboldtitle">(.+?)</span>',
                           sub_html)
            if m:
                entry[common.name_l] = m[0]
            m = re.findall(ur'<span class="storethinlines">(.+?)(?:</span>|</p>)',
                           sub_html, re.S)
            if len(m) >= 2:
                addr = common.reformat_addr(m[0])
                entry[common.addr_l] = addr
                # City, country and zip code: the last part is looked up as
                # the country, the part before it holds city + zip.
                addr_splits = addr.split(', ')

                ret = gs.look_up(addr_splits[-1], 1)
                if ret is None:
                    print 'Error in geo translating: %s' % addr_splits[-1]
                else:
                    entry[common.country_e] = ret['name_e']
                    # A one-part address has no city/zip part; skip it
                    # rather than raising IndexError on [-2].
                    if len(addr_splits) >= 2:
                        m1 = re.findall(ur'(.+?)(\d{3}-\d{4})', addr_splits[-2])
                        if m1:
                            common.update_entry(
                                entry, {
                                    common.city_e: common.extract_city(m1[0][0])[0],
                                    common.zip_code: m1[0][1]
                                })

                # Contact details
                tmp = m[1]
                m1 = re.findall(ur'[\d\-]{5,}', tmp)
                if m1:
                    entry[common.tel] = m1[0]
                m1 = re.findall(ur'href="mailto:(.+?@.+?)"', tmp)
                if m1:
                    entry[common.email] = m1[0].strip()

            gs.field_sense(entry)
            print '%s: Found store: %s, %s (%s, %s)' % (
                brandname_e, entry[common.name_l], entry[common.addr_l],
                entry[common.country_e], entry[common.continent_e])
            db.insert_record(entry, 'stores')
            store_list.append(entry)
    finally:
        db.disconnect_db()

    # The collected list was never returned, although the error path
    # returns a list.
    return store_list
Example #15
0
def fetch_stores(data):
    """Read the per-province store tables of a page and store every row.

    Each ``#fragment-N`` tab link names a province; the matching
    ``<div id="fragment-N">`` holds a table whose rows are stores.  A
    four-cell row starts with a city, which is carried over to the following
    three-cell rows of the same province.

    :param data: dict with ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: list of inserted entries; ``[]`` when the page cannot be fetched.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    # (fragment id, province label) for every tab link.
    tabs = []
    for frag, label in re.findall(ur'<li><a href="#(fragment-\d+)"><span>(.+?)</span></a></li>', html):
        tabs.append((frag.strip(), label.strip().upper()))

    comment_pat = re.compile(ur'<!--.*?-->', re.S)
    store_list = []

    for frag_id, province_label in tabs:
        div_at = html.find('<div id="%s">' % frag_id)
        if div_at == -1:
            continue
        p_sub, _, _ = cm.extract_closure(html[div_at:], ur'<tbody>', ur'</tbody>')
        p_sub = comment_pat.sub('', p_sub)

        city_c = ''
        city_e = ''
        while True:
            # Consume the table one <tr> at a time.
            row, _, row_end = cm.extract_closure(p_sub, ur'<tr>', ur'</tr>')
            if row_end == 0:
                break
            p_sub = p_sub[row_end:]
            # Skip rows that contain both column captions.
            if u'城市' in row and u'店铺名称' in row:
                continue

            cells = re.findall(ur'<td.*?>(.+?)</td>', row)
            if len(cells) < 3:
                continue

            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

            first = 0
            if len(cells) == 4:
                # Leading cell is the city; remember it for later rows.
                city_c = cells[0].strip()
                city_hit = gs.look_up(city_c, 3)
                if city_hit is not None:
                    city_e = city_hit['name_e']
                    city_c = city_hit['name_c']
                first = 1

            name_cell, tel_cell, addr_cell = cells[first:first + 3]
            entry[cm.name_c] = cm.html2plain(name_cell).strip()
            entry[cm.tel] = cm.html2plain(tel_cell).strip()
            entry[cm.addr_e] = cm.reformat_addr(addr_cell).strip()
            entry[cm.country_e] = 'CHINA'
            entry[cm.continent_e] = 'ASIA'

            p_name_c = province_label
            p_name_e = ''
            province_hit = gs.look_up(p_name_c, 2)
            if province_hit is not None:
                p_name_c = province_hit['name_c']
                p_name_e = province_hit['name_e']
            cm.update_entry(entry, {cm.province_e: p_name_e, cm.province_c: p_name_c,
                                    cm.city_e: city_e, cm.city_c: city_c})
            entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

            gs.field_sense(entry)
            sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            # Only fill in province / city when they are still blank.
            if sensed[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = sensed[1]
            if sensed[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = sensed[2]
            gs.field_sense(entry)
            print '(%s / %d) Found store: %s, %s (%s, %s)' % (
                data['brandname_e'], data['brand_id'],
                entry[cm.name_e], entry[cm.addr_e],
                entry[cm.country_e], entry[cm.continent_e])
            store_list.append(entry)
            db.insert_record(entry, 'stores')

    return store_list
Example #16
0
        dump_data = {'level': 2, 'time': cm.format_time(), 'data': data, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    # 可能有多个门店,拆分
    sub_html = []
    for m in re.finditer(ur'<li\s+class\s*=\s*"boutique-info-cadre-\d+"\s*>', html):
        start = m.start() + len(m.group())
        end = html.find('</li>', start)
        sub_html.append(html[start:end])

    stores = []
    # 针对每个门店:
    for s in sub_html:
        entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        cm.update_entry(entry, {cm.url: url, cm.name_e: data['name'], cm.lat: data['lat'], cm.lng: data['lng'],
                                cm.store_type: data['type']})
        for m in re.findall(ur'<p class="boutique-info-cadre-titre">(.*?)</p>', s):
            if len(m.strip()) >= 0:
                entry[cm.store_type] = m.strip()
            break
        for m in re.findall(ur'<p class="boutique-info-cadre-tel">(.*)</p>', s, re.S):
            if len(m.strip()) == 0:
                break
            for m1 in re.findall(ur'<span itemprop="telephone">(.*?)</span>', m):
                if len(m1.strip()) > 0:
                    entry[cm.tel] = m1.strip()
                break
            for m1 in re.findall(ur'<span itemprop="faxNumber">(.*?)</span>', m):
                if len(m1.strip()) > 0:
                    entry[cm.fax] = m1.strip()
                break
Example #17
0
def fetch_stores(data):
    """
    Fetch, parse and store every shop returned for one continent/country/city.

    POSTs the location filter to data['store_url'], turns each
    <div class="shop"> block of the response into a store entry, runs the geo
    helpers on it and inserts it into the 'stores' table.

    :param data: dict; the keys read here are 'store_url', 'continent',
                 'country', 'city', 'brand_id', 'brandname_e' and 'brandname_c'.
    :return: list of the entries that were inserted; [] when the request fails.
    """
    url = data['store_url']
    try:
        body = cm.post_data(url, {'continent': data['continent'],
                                  'country': data['country'], 'city': data['city'],
                                  'send': 1, 'page': 0})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<div class="shop">', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

        # Content of the shop <div>; end == 0 is treated as "nothing extracted".
        sub, start, end = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')
        if end == 0:
            continue
        m1 = re.search(ur'<h3>\s*(.+?)\s*</h3>', sub, re.S)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1)

        # The first paragraph lists the store types joined by '+'.
        m1 = re.search(ur'<p[^>]*>(.+?)</p>', sub, re.S)
        if m1 is not None:
            entry[cm.store_type] = re.sub(re.compile(ur'\s*\+\s*', re.S), ', ', m1.group(1).strip())

        # The first <ul> holds the address lines.
        addr_sub, start, end = cm.extract_closure(sub, ur'<ul\b', ur'</ul>')
        if end != 0:
            tmp = re.findall(ur'<li>\s*(.+?)\s*</li>', addr_sub)

            # With three or more items the last one is taken as the phone number.
            if len(tmp) >= 3:
                entry[cm.tel] = tmp[-1].strip()
                del tmp[-1]

            addr_list = []
            for term in tmp:
                term = cm.html2plain(term).strip()
                if term != '':
                    addr_list.append(term)
            entry[cm.addr_e] = ', '.join(addr_list)

        # Opening hours: the first <ul> after the "opening hours" caption.
        start = sub.lower().find(ur'opening hours')
        if start != -1:
            opening_sub, start, end = cm.extract_closure(sub[start:], ur'<ul\b', ur'</ul>')
            tmp = re.findall(ur'<li>\s*(.+?)\s*</li>', opening_sub)
            opening_list = []
            for term in tmp:
                term = cm.html2plain(term).strip()
                if term != '':
                    opening_list.append(term)
            entry[cm.hours] = ', '.join(opening_list)

        cm.update_entry(entry, {cm.continent_e: data['continent'].strip().upper(),
                                cm.country_e: data['country'].strip().upper()})
        entry[cm.city_e] = cm.extract_city(data['city'])[0]

        # Normalise the geo fields, fill province/city gaps from the address,
        # then normalise again so the newly filled fields are processed too.
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                          entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                          entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # Previously the collected list was dropped and None was returned on
    # success, while the failure path returned [].
    return store_list
Example #18
0
def fetch_stores(data):
    """
    Fetch the detailed information of the stores listed on one page.

    Downloads data['url'], parses every <li> inside <ul class="store-list">
    into a store entry, completes its geo fields and inserts it into the
    'stores' table.

    :param data: dict; read here: 'url' plus the cm.continent_e, cm.country_e
                 and cm.city_e keys.
    :rtype : [entries]
    :return: the entries that were inserted; [] when the page cannot be
             fetched or contains no store list.
    """
    # Bind the URL locally so that the error handler reports the page that was
    # actually requested (the handler used to reference an unbound `url`).
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        }
        cm.dump(dump_data)
        return []

    entries = []
    start = html.find(u'<ul class="store-list">')
    if start == -1:
        return entries
    start += len(u'<ul class="store-list">')
    end = html.find(u'</ul>', start)
    html = html[start:end]

    for m1 in re.findall(ur'<li class="(.*?)">(.*?)</li>', html, re.S):
        store = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        # The CSS class of the <li> is recorded as the store type.
        store[cm.store_type] = m1[0]
        sub_html = m1[1]
        m2 = re.findall(ur'<h3 class="store-name">(.*?)</h3>', sub_html)
        if len(m2) > 0:
            store[cm.name_e] = cm.reformat_addr(m2[0])
        m2 = re.findall(ur'<p class="store-address">(.*?)</p>', sub_html, re.S)
        if len(m2) > 0:
            store[cm.addr_e] = cm.reformat_addr(m2[0])

        cm.update_entry(
            store, {
                cm.continent_e: data[cm.continent_e].strip().upper(),
                cm.country_e: data[cm.country_e].strip().upper(),
                cm.city_e: data[cm.city_e].strip().upper()
            })

        # Normalise the geo fields, fill the gaps from the address, then
        # normalise again so the newly filled fields are processed too.
        entry = store
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, store[cm.name_e], store[cm.addr_e],
            store[cm.country_e], store[cm.continent_e])
        db.insert_record(store, 'stores')
        entries.append(store)

    # Return the collected entries, as promised by the :rtype: above.
    return entries
Example #19
0
    # 可能有多个门店,拆分
    sub_html = []
    for m in re.finditer(ur'<li\s+class\s*=\s*"boutique-info-cadre-\d+"\s*>',
                         html):
        start = m.start() + len(m.group())
        end = html.find('</li>', start)
        sub_html.append(html[start:end])

    stores = []
    # 针对每个门店:
    for s in sub_html:
        entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        cm.update_entry(
            entry, {
                cm.url: url,
                cm.name_e: data['name'],
                cm.lat: data['lat'],
                cm.lng: data['lng'],
                cm.store_type: data['type']
            })
        for m in re.findall(ur'<p class="boutique-info-cadre-titre">(.*?)</p>',
                            s):
            if len(m.strip()) >= 0:
                entry[cm.store_type] = m.strip()
            break
        for m in re.findall(ur'<p class="boutique-info-cadre-tel">(.*)</p>', s,
                            re.S):
            if len(m.strip()) == 0:
                break
            for m1 in re.findall(ur'<span itemprop="telephone">(.*?)</span>',
                                 m):
                if len(m1.strip()) > 0:
Example #20
0
def field_sense(entry):
    """
    Normalise the geographic fields of a store entry in place.

    Looks up the entry's city, province and country with look_up() and, when a
    record is found, overwrites the corresponding *_e / *_c fields (and the
    continent fields) with the names from that record. If the zip code is
    still empty it is guessed from the address.

    Judging by the arguments passed in this function, the second argument of
    look_up() selects the level: 0 continent, 1 country, 2 province, 3 city.

    :param entry: store entry (dict-like, indexed by the cm.* field keys);
                  modified in place.
    :return: None
    """
    # Geo
    country = entry[cm.country_e]
    city = entry[cm.city_e]
    ret = look_up(city, 3)
    ret1 = look_up(country, 1)
    if ret1 is not None:
        country = ret1['name_e']
    # Accept the city record only when it belongs to the entry's country.
    if ret is not None and ret['country']['name_e'] == country:
        entry[cm.city_e] = ret['name_e']
        entry[cm.city_c] = ret['name_c']

        # An empty string marks a city record without a province.
        prov = ret['province']
        if prov != '':
            ret1 = look_up(prov['name_e'], 2)
            if ret1 is not None:
                entry[cm.province_e] = ret1['name_e']
                entry[cm.province_c] = ret1['name_c']

    province = entry[cm.province_e]
    ret = look_up(province, 2)
    if ret is not None:
        entry[cm.province_e] = ret['name_e']
        entry[cm.province_c] = ret['name_c']

    ret = look_up(country, 1)
    if ret is not None:
        cm.update_entry(entry, {cm.country_e: ret['name_e'], cm.country_c: ret['name_c']})
        # NOTE(review): ret1 is used without a None check - assumes every
        # country record references a continent that look_up can resolve.
        ret1 = look_up(ret['continent']['name_e'], 0)
        cm.update_entry(entry, {cm.continent_e: ret1['name_e'], cm.continent_c: ret1['name_c']})

        if entry[cm.zip_code] == '':
            m = None
            # The greedy '.*' makes each pattern pick the LAST matching group
            # of digits in the address.
            if ret['name_e'] == look_up(u'CHINA', 1)['name_e']:
                # Chinese postal code (6 digits)
                m = re.match(ur'.*\b(\d{6})\b', entry[cm.addr_e])
            elif ret['name_e'] == look_up(u'UNITED STATES', 1)['name_e']:
                # US zip code (5 digits)
                m = re.match(ur'.*\b(\d{5})\b', entry[cm.addr_e])
            elif ret['name_e'] == look_up(u'JAPAN', 1)['name_e']:
                # Japanese postal code (NNN-NNNN)
                m = re.match(ur'.*\b(\d{3}\-\d{4})\b', entry[cm.addr_e])
            if m is not None:
                entry[cm.zip_code] = m.group(1)

    cm.chn_check(entry)

    if entry[cm.zip_code] == '':
        # A run of 5+ digits sitting next to a city or state name is probably
        # a zip code.
        m = re.match(ur'.*\s+(\d{5,})\b', entry[cm.addr_e])
        if m is not None:
            # Text following the number: accept the number when the first term
            # after it is a known province or city.
            tmp = entry[cm.addr_e][m.end() + 1:]
            terms = re.findall(ur'\b(\S+?)\b', tmp)
            if len(terms) > 0:
                if look_up(terms[0], 2) is not None or look_up(terms[0], 3) is not None:
                    entry[cm.zip_code] = m.group(1)
            else:
                # Nothing follows the number: take the text before it,
                # reversed, so that terms[0] is the term closest to the number
                # (un-reversed again before the lookup).
                tmp = entry[cm.addr_e][m.end() - len(m.group(1)) - 1::-1]
                terms = re.findall(ur'\b(\S+?)\b', tmp)
                if len(terms) > 0:
                    if look_up(terms[0][::-1], 2) is not None or look_up(terms[0][::-1], 3) is not None:
                        entry[cm.zip_code] = m.group(1)
Example #21
0
def fetch(level=1, data=None, user='******', passwd=''):
    """
    Re-crawl the store list of the brand and rewrite its rows in the database.

    Downloads the store page, deletes the rows already stored for `brand_id`,
    splits the page into one section per store and inserts one entry per
    section into the 'stores' table.

    :param level: not used by this function.
    :param data: optional dict with the 'url' to fetch; defaults to the
                 module-level `url`.
    :param user: database user.
    :param passwd: database password.
    :return: list of the entries that were inserted; [] when the page cannot
             be fetched.
    """
    try:
        if data is None:
            data = {'url': url}
        html = common.get_data(data['url'])
    except Exception:
        print 'Error occured in getting data: %s' % url
        dump_data = {'level': 1, 'time': common.format_time(), 'data': {'data': url}, 'brand_id': brand_id}
        common.dump(dump_data)
        return []

    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

    # Strip HTML comments so that commented-out stores are not parsed.
    sub_pat = re.compile(ur'<!--.*?-->', re.S)
    html = re.sub(sub_pat, '', html)
    split_pos = [m.start() for m in re.finditer(ur'<p><span class="contactboldtitle">', html)]
    # Close the last section at the end of the document; a -1 sentinel would
    # silently drop the final character of the page.
    split_pos.append(len(html))
    sub_list = [html[split_pos[i]:split_pos[i + 1]] for i in xrange(len(split_pos) - 1)]

    store_list = []
    for sub_html in sub_list:
        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        m = re.findall(ur'<span class="contactboldtitle">(.+?)</span>', sub_html)
        if len(m) > 0:
            entry[common.name_l] = m[0]
        # The first "storethinlines" span is the address, the second one the
        # contact details.
        m = re.findall(ur'<span class="storethinlines">(.+?)(?:</span>|</p>)', sub_html, re.S)
        if len(m) >= 2:
            addr = common.reformat_addr(m[0])
            entry[common.addr_l] = addr
            # City, country and zip code
            addr_splits = addr.split(', ')

            ret = gs.look_up(addr_splits[-1], 1)
            if ret is None:
                print 'Error in geo translating: %s' % addr_splits[-1]
            else:
                entry[common.country_e] = ret['name_e']
                # The component before the country holds "<city> NNN-NNNN";
                # it only exists when the address has at least two components.
                if len(addr_splits) >= 2:
                    m1 = re.findall(ur'(.+?)(\d{3}-\d{4})', addr_splits[-2])
                    if len(m1) > 0:
                        common.update_entry(entry, {common.city_e: common.extract_city(m1[0][0])[0],
                                                    common.zip_code: m1[0][1]})

            # Contact details
            tmp = m[1]
            m1 = re.findall(ur'[\d\-]{5,}', tmp)
            if len(m1) > 0:
                entry[common.tel] = m1[0]
            m1 = re.findall(ur'href="mailto:(.+?@.+?)"', tmp)
            if len(m1) > 0:
                entry[common.email] = m1[0].strip()

        gs.field_sense(entry)
        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, entry[common.name_l], entry[common.addr_l], entry[common.country_e],
            entry[common.continent_e])
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    db.disconnect_db()
    # Hand the collected entries back instead of dropping them.
    return store_list
Example #22
0
    def f(m):
        store_name = m[0].strip()
        addr_str = m[1].strip()

        spl = addr_str.split('<br/>')
        store_type = cm.html2plain(spl[0].strip())

        store_addr = spl[1].strip()
        hour_idx = 2
        store_tel = ''
        for i in xrange(2, len(spl)):
            # If this is not a phone number:
            tel = cm.extract_tel(spl[i])
            if tel == '':
                store_addr += ', ' + spl[i]
                hour_idx = i + 1
            else:
                store_tel = spl[i].strip()
                hour_idx = i + 1
                break

        if hour_idx < len(spl):
            store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip()
        else:
            store_hour = ''

        # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]]))
        store_addr = cm.reformat_addr(store_addr)

        store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        cm.update_entry(
            store_entry, {
                cm.continent_e: opt[cm.continent_e].strip().upper(),
                cm.city_e: opt[cm.city_e].strip().upper(),
                cm.country_e: opt[cm.country_e].strip().upper(),
                cm.name_e: cm.name_e,
                cm.addr_e: store_addr,
                cm.store_type: store_type,
                cm.hours: store_hour,
                cm.tel: store_tel
            })
        if opt.has_key(cm.province_e):
            store_entry[cm.province_e] = opt[cm.province_e]
        else:
            store_entry[cm.province_e] = ''
        store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0]

        gs.field_sense(store_entry)
        ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e])
        if ret[1] is not None and store_entry[cm.province_e] == '':
            store_entry[cm.province_e] = ret[1]
        if ret[2] is not None and store_entry[cm.city_e] == '':
            store_entry[cm.city_e] = ret[2]
        gs.field_sense(store_entry)

        print '%s Found store: %s, %s (%s, %s)' % (
            brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e],
            store_entry[cm.country_e], store_entry[cm.continent_e])
        db.insert_record(store_entry, 'stores')

        return store_entry
Example #23
0
def fetch_stores(data):
    """
    Query the Nine West store locator and store every shop it returns.

    Loads data['url'] to obtain the session cookies and the locator form URL,
    submits a fixed search (around 'kingman', postal code 67068, 30 mi) and
    turns each <div class="storeColumnOne"> block of the result into a store
    entry that is inserted into the 'stores' table.

    :param data: dict; read here: 'url', 'brand_id', 'brandname_e' and
                 'brandname_c'.
    :return: list of the entries that were inserted; [] when a request fails
             or the locator URL cannot be found.
    """
    url = data['url']
    try:
        html, cookie_map = cm.get_data_cookie(url)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    print 'SLEEPING>>>>'
    time.sleep(5)

    # The form target carries a per-session token ("C" + 10 digits).
    m = re.search(
        'http://www.ninewest.com/on/demandware.store/Sites-ninewest-Site/default/Stores-Find/C\d{10}',
        html)
    if m is None:
        return []
    url = m.group(0)

    # Forward the session cookies, minus the personalisation/token ones.
    cookie_map_new = {}
    for key in cookie_map:
        if 'dwpersonalization_' in key or key == 'sr_token':
            continue
        cookie_map_new[key] = cookie_map[key]
    cookie_map_new['invited_visitor_22225'] = '1'
    cookie_map = cookie_map_new

    try:
        html = cm.post_data(url, {
            'dwfrm_storelocator_startaddress': 'kingman',
            'dwfrm_storelocator_maxDistance': 30.00,
            'dwfrm_storelocator_outlet': 'true',
            'dwfrm_storelocator_retail': 'true',
            'dwfrm_storelocator_optical': 'true',
            'dwfrm_storelocator_eyewear': 'true',
            'dwfrm_storelocator_apparel': 'true',
            'dwfrm_storelocator_attire': 'true',
            'dwfrm_storelocator_department': 'true',
            'dwfrm_storelocator_IsMensFootwear': 'true',
            'dwfrm_storelocator_IsRRR': 'true',
            'dwfrm_storelocator_IsRRNY': 'true',
            'dwfrm_storelocator_IsRRS': 'true',
            'dwfrm_storelocator_wholesale': 'true',
            'dwfrm_storelocator_bba': 'true',
            'dwfrm_storelocator_ba': 'true',
            'dwfrm_storelocator_search.x': 0,
            'dwfrm_storelocator_search.y': 0,
            'dwfrm_storelocator_countryCode': 'US',
            'dwfrm_storelocator_postalCode': '67068',
            'dwfrm_storelocator_distanceUnit': 'mi',
            'dwfrm_storelocator_long': -98.117208,
            'dwfrm_storelocator_lat': 37.647131,
        },
                            cookie=cookie_map)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    store_list = []
    for m1 in re.finditer(ur'<div class="storeColumnOne">', html):
        sub, start, end = cm.extract_closure(html[m1.start():], ur'<div\b',
                                             ur'</div>')
        if end == 0:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        m2 = re.search(ur'<div class="storename">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.name_e] = m2.group(1).strip()

        addr_list = re.findall(ur'<div class="adddressline">([^<>]+)</div>',
                               sub)
        entry[cm.addr_e] = ', '.join(addr_list)

        # "<city>, <state> <zip>"
        m2 = re.search(ur'<div class="citystatezip">([^<>]+)</div>', sub)
        if m2 is not None:
            tmp = cm.reformat_addr(m2.group(1))
            terms = re.split('[, ]+', tmp)
            if len(terms) < 3:
                # Not in the expected three-part form: keep the raw text.
                entry[cm.addr_e] = tmp
            else:
                ret = gs.look_up(terms[0], 3)
                if ret is not None:
                    entry[cm.city_e] = ret['name_e']
                else:
                    entry[cm.city_e] = terms[0].strip().upper()

                ret = gs.look_up(terms[1], 2)
                if ret is not None:
                    entry[cm.province_e] = ret['name_e']
                else:
                    # Fall back to the raw state term (terms[1]); the city
                    # term (terms[0]) used to be written here by mistake.
                    entry[cm.province_e] = terms[1].strip().upper()

                if re.match('\s*\d{5,}\s*', terms[2]) is not None:
                    entry[cm.zip_code] = terms[2].strip()

        m2 = re.search(ur'<div class="storephone">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.tel] = m2.group(1)

        cm.update_entry(entry, {
            'country_e': 'UNITED STATES',
            'continent_e': 'NORTH AMERICA'
        })
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # Hand the collected entries back instead of dropping them.
    return store_list
Example #24
0
def get_stores(data):
    url = data["url"]
    print "Trying to get stores for %s" % data["name"]
    try:
        html = common.get_data(url)
    except Exception:
        print "Error occured: %s" % url
        dump_data = {"level": 1, "time": common.format_time(), "data": {"url": url}, "brand_id": brand_id}
        common.dump(dump_data)
        return []

    start = 0
    store_list = []
    while True:
        start = html.find('<li class="info-store clearfix">', start)
        if start == -1:
            break
        end = html.find('<li class="info-store clearfix">', start + 1)
        sub_html = html[start:end]
        start = end

        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        for m in re.findall(r'<h1><a href="(.*?)">(.*?)</a>', sub_html):
            entry[common.url] = host + m[0]
            entry[common.name_e] = common.html2plain(m[1].strip())
            break

        for m in re.findall(
            r'<span style="display:none" class="ll">\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)\s*</span>', sub_html
        ):
            common.update_entry(entry, {common.lat: string.atof(m[0]), common.lng: string.atof(m[1])})
            break

        for m in re.findall(r'<span class="map-address">(.*?)</span>', sub_html):
            entry[common.addr_e] = common.reformat_addr(m)
            break

        for m in re.findall(r'<span class="type">phone:</span>(.*?)<br />', sub_html):
            entry[common.tel] = m.strip()
            break

        for m in re.findall(r'<a class="email" href="mailto:(.*?@.*?)">', sub_html):
            entry[common.email] = m.strip()
            break

        opening_s = sub_html.find('<ul class="opening-hours')
        if opening_s != -1:
            opening_e = sub_html.find("</ul>", opening_s)
            o_str = sub_html[opening_s:opening_e]
            entry[common.hours] = ", ".join([m for m in re.findall(r"<li>(.+?)</li>", o_str)])

        brand_s = sub_html.find('<ul class="brands clearfix">')
        if brand_s != -1:
            brand_e = sub_html.find("</ul>", brand_s)
            b_str = sub_html[brand_s:brand_e]
            entry[common.store_type] = ", ".join(
                [common.html2plain(m) for m in re.findall(r'<li><a href=".*?">(.+?)</a></li>', b_str)]
            )

        # Geo
        if "state" in data:
            entry[common.province_e] = data["state"]
        country_e = data["name"].strip().upper()
        entry[common.country_e] = country_e
        gs.field_sense(entry)

        print "%s Found store: %s, %s (%s, %s)" % (
            brandname_e,
            entry[common.name_e],
            entry[common.addr_e],
            entry[common.country_e],
            entry[common.continent_e],
        )
        db.insert_record(entry, "stores")
        store_list.append(entry)

    return store_list
Example #25
0
def get_store_details(data):
    """
    Request the JSON record of a single store and save it.

    :param data: dict with 'url', 'country_id', 'city_id', 'store_id',
                 'country' and 'city'.
    :return: the entry inserted into the 'stores' table, or [] when the
             request fails.
    """
    url = data['url']
    form = {
        'country': data['country_id'],
        'city': data['city_id'],
        'recordid': data['store_id']
    }
    try:
        html = cm.post_data(url, form)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        })
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    info = json.loads(html)['elements']

    # Drop the backslashes and turn the paragraph tags into separators.
    raw_addr = info['address'].replace('\\', '')
    raw_addr = raw_addr.replace('<p>', ',').replace('</p>', ',')
    addr = cm.reformat_addr(raw_addr)
    # The first line is the store name
    pieces = addr.split(',')
    if pieces:
        entry[cm.name_e] = cm.reformat_addr(pieces[0])
    entry[cm.addr_e] = addr

    # Latitude/longitude: the first "lat,lng" pair of the map URL.
    coords = re.search(ur'(-?\d+\.\d+),(-?\d+\.\d+)', info['gmap'])
    if coords is not None:
        cm.update_entry(entry, {
            cm.lat: string.atof(coords.group(1)),
            cm.lng: string.atof(coords.group(2))
        })

    entry[cm.url] = info['shareurl'].replace('\\', '')
    entry[cm.hours] = info['openingtimes']
    entry[cm.comments] = info['other']

    # Geo
    cm.update_entry(entry, {cm.country_e: data['country'], cm.city_e: data['city']})
    entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    # Only fill province/city when they are still empty.
    if entry[cm.province_e] == '' and sensed[1] is not None:
        entry[cm.province_e] = sensed[1]
    if entry[cm.city_e] == '' and sensed[2] is not None:
        entry[cm.city_e] = sensed[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])

    db.insert_record(entry, 'stores')
    return entry
Example #26
0
def get_store_details(data):
    """
    Download the detail page of a single store and save it.

    :param data: dict with 'url', 'name', 'country' and 'city'.
    :return: the entry inserted into the 'stores' table; [] when the page
             cannot be fetched or has no breadcrumb list.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        })
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    entry[cm.name_e] = data['name']
    entry[cm.url] = data['url']

    crumbs_at = html.find(ur'<div class="storelocator-breadcrumbs">')
    if crumbs_at == -1:
        return []
    sub, start, end = cm.extract_closure(html[crumbs_at:], ur'<ul>', ur'</ul>')
    if end == 0:
        return []

    # The address is the last <li>...</li> of the breadcrumb list
    items = re.findall(ur'<li>(.+?)</li>', sub, re.S)
    if items:
        entry[cm.addr_e] = cm.reformat_addr(items[-1])

    # Latitude and longitude
    pos = re.search(
        ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)',
        html)
    if pos is not None:
        cm.update_entry(entry, {
            cm.lat: string.atof(pos.group(1)),
            cm.lng: string.atof(pos.group(2))
        })

    contact = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S)
    if contact is not None:
        contact_sub = contact.group(1)
        pat_tel = re.compile(ur'<p class="phone">(.+?)</p>')
        phone = pat_tel.search(contact_sub)
        if phone is not None:
            entry[cm.tel] = cm.extract_tel(phone.group(1))
            # What remains after removing the phone is the opening hours.
            contact_sub = pat_tel.sub('', contact_sub)
        hours_list = []
        for piece in cm.reformat_addr(contact_sub).split(','):
            hours_list.append(piece.strip())
        # Drop the "opening hours" caption itself.
        if 'opening hours' in hours_list[0].lower():
            hours_list = hours_list[1:]
        entry[cm.hours] = ', '.join(hours_list)

    # Geo
    cm.update_entry(entry, {cm.country_e: data['country'], cm.city_e: data['city']})
    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    # Only fill province/city when they are still empty.
    if entry[cm.province_e] == '' and sensed[1] is not None:
        entry[cm.province_e] = sensed[1]
    if entry[cm.city_e] == '' and sensed[2] is not None:
        entry[cm.city_e] = sensed[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return entry
Example #27
0
def fetch(level=1, data=None, host='localhost', port=3306, user='******', passwd='123456'):
    tot = 0
    start = 0
    store_list = []
    data = {'q': '*:*', 'pt': '0,0', 'd': 100000, 'start': 0, 'rows': 100}
    # data = {'q': '*:*', 'pt': '36.778261,-119.417932', 'd': 50, 'start': 0, 'rows': 100}

    db = cm.StoresDb()
    db.connect_db(host=host, port=port, user=user, passwd=passwd, db='brand_stores')
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

    while True:
        cm.dump('Fetching from %d' % start, 'triumph_log.txt')
        try:
            data['start'] = start
            html = cm.get_data(url, data)
            raw_list = json.loads(html)
            if tot == 0:
                tot = raw_list['response']['numFound']
                cm.dump('Found: %d' % tot, 'triumph_log.txt')
            raw_list = raw_list['response']['docs']
        except Exception:
            cm.dump('Error occured while fetching from %d' % data['start'], 'triumph_log.txt')
            dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
            cm.dump(dump_data)
            return []

        idx = 0
        if len(raw_list) < data['rows'] and start + len(raw_list) < tot:
            cm.dump('Cooling down...', 'triumph_log.txt')
            time.sleep(5)
            continue

        for v in raw_list:
            entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
            cm.update_entry(entry, {cm.store_type: v['class'],
                                    cm.zip_code: v['zip'], cm.tel: v['phone'], cm.fax: v['fax'],
                                    cm.url: v['web'], cm.email: v['email'], cm.hours: v['opening_hours']})
            entry[cm.name_e] = cm.reformat_addr(v['name'])

            entry[cm.city_e], tmp = cm.extract_city(v['city'])
            if not re.search(ur'\d', entry[cm.zip_code]) and tmp != '':
                entry[cm.zip_code] = tmp

            if v['location'] != '':
                terms = v['location'].split(',')
                cm.update_entry(entry, {cm.lat: string.atof(terms[0]), cm.lng: string.atof(terms[1])})
            addr = v['address']
            if v['address2'] != '':
                addr += ', ' + v['address2']
            entry[cm.addr_e] = cm.reformat_addr(addr)
            ret = gs.look_up(v['country'], 1)
            if ret is not None:
                entry[cm.country_e] = ret['name_e']
            else:
                cm.dump('Error in looking up country %s' % v['country'], 'triumph_log.txt')
            gs.field_sense(entry)

            cm.dump('(%s / %d) Found store at %d: %s, %s (%s, %s, %s)' % (
                brandname_e, brand_id, start + idx, entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e],
                entry[cm.country_e],
                entry[cm.continent_e]), 'triumph_log.txt')
            store_list.append(entry)
            db.insert_record(entry, 'stores')
            idx += 1

        if tot - start <= len(raw_list):
            break
        else:
            start += len(raw_list)
Example #28
0
def get_stores(url, data):
    """
    从json对象中获得商店信息
    """
    opener = urllib2.build_opener()
    opener.addheaders = [("User-Agent",
                          "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)"
                          "Chrome/27.0.1453.94 Safari/537.36"),
                         ('Accept', '*/*'), ('X-Requested-With', 'XMLHttpRequest'), ('Connection', 'keep-alive')]
    response = opener.open(url)
    html = response.read().encode('utf-8')
    jsonobj = json.loads(html)
    stores = jsonobj[u'Stores'][u'Items']
    region_list = jsonobj['Regions']
    region_id = jsonobj['Region']
    region = ''
    if len(region_list) > 0 and region_id != 0:
        for val in region_list:
            if val['RegionId'] == region_id:
                region = val['Name']
                break

    country = jsonobj['CurrentCountry']['Name']
    store_list = []
    for s in stores:
        # print('Found store: %s, %s. Tel: %s, lat=%s, lng=%s' % (
        #     s['Name'], s['Address'], s['Phone'], s['Latitude'], s['Longitude']))

        store_type = ['']
        # Some stores may have varioius store types
        if len(s['StoreTypes']) > 0:
            store_type = list(val['Name'] for val in s['StoreTypes'])
        if s['Url'] is not None:
            url = s['Url']
        else:
            url = ''
        if s['ZipCode'] is not None and not s['ZipCode'].__eq__(''):
            zip = s['ZipCode']
        else:
            zip = ''
        local_addr = s['Address']
        if local_addr[-1] == '.':
            local_addr = local_addr[:-1]
        if not zip.__eq__(''):
            addr = u'%s, %s, %s' % (local_addr, s['City'], zip)
        else:
            addr = u'%s, %s' % (local_addr, s['City'])
        if region.__eq__(''):
            addr = u'%s, %s' % (addr, country)
        else:
            addr = u'%s, %s, %s' % (addr, region, country)

        for t in store_type:
            entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
            cm.update_entry(entry, {'addr_e': addr, 'country_e': country, 'city_e': s['City'],
                                    'comments': s['Comments'],
                                    'province_e': region.strip().upper(), 'zip': zip,
                                    'email': s['Email'], 'fax': s['Fax'], 'lat': s['Latitude'], 'lng': s['Longitude'],
                                    'name_e': s['Name'], 'tel': s['Phone'], 'store_type': t, 'url': url})
            gs.field_sense(entry)

            print '%s Found store: %s, %s (%s, %s)' % (
                brandname_e, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                entry[cm.continent_e])
            db.insert_record(entry, 'stores')
            store_list.append(entry)

    return store_list
Example #29
0
def fetch_stores(data):
    """Scrape the Nine West store locator and persist every store found.

    The page at ``data['url']`` is fetched together with its cookies, the
    store-finder form URL is pulled out of that page, and a hard-coded search
    (start address 'kingman', postal code 67068, country US) is posted to it.
    Every ``storeColumnOne`` block in the response becomes one store entry,
    which is inserted into the 'stores' table.

    :param data: dict providing 'url', 'brand_id', 'brandname_e' and
        'brandname_c'.
    :return: list of the store entries found; an empty list when a request
        fails or the store-finder URL cannot be located in the page.
    """
    url = data['url']
    try:
        html, cookie_map = cm.get_data_cookie(url)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    print 'SLEEPING>>>>'
    time.sleep(5)

    # The form action URL carries a per-session token ("C" followed by ten digits).
    m = re.search(r'http://www.ninewest.com/on/demandware.store/Sites-ninewest-Site/default/Stores-Find/C\d{10}', html)
    if m is None:
        return []
    url = m.group(0)

    # Forward the received cookies except the personalization cookies and
    # 'sr_token', and add the 'invited_visitor_22225' flag.
    cookie_map_new = {}
    for key in cookie_map:
        if 'dwpersonalization_' in key or key == 'sr_token':
            continue
        cookie_map_new[key] = cookie_map[key]
    cookie_map_new['invited_visitor_22225'] = '1'
    cookie_map = cookie_map_new

    form = {'dwfrm_storelocator_startaddress': 'kingman',
            'dwfrm_storelocator_maxDistance': 30.00,
            'dwfrm_storelocator_outlet': 'true',
            'dwfrm_storelocator_retail': 'true',
            'dwfrm_storelocator_optical': 'true',
            'dwfrm_storelocator_eyewear': 'true',
            'dwfrm_storelocator_apparel': 'true',
            'dwfrm_storelocator_attire': 'true',
            'dwfrm_storelocator_department': 'true',
            'dwfrm_storelocator_IsMensFootwear': 'true',
            'dwfrm_storelocator_IsRRR': 'true',
            'dwfrm_storelocator_IsRRNY': 'true',
            'dwfrm_storelocator_IsRRS': 'true',
            'dwfrm_storelocator_wholesale': 'true',
            'dwfrm_storelocator_bba': 'true',
            'dwfrm_storelocator_ba': 'true',
            'dwfrm_storelocator_search.x': 0,
            'dwfrm_storelocator_search.y': 0,
            'dwfrm_storelocator_countryCode': 'US',
            'dwfrm_storelocator_postalCode': '67068',
            'dwfrm_storelocator_distanceUnit': 'mi',
            'dwfrm_storelocator_long': -98.117208,
            'dwfrm_storelocator_lat': 37.647131}
    try:
        html = cm.post_data(url, form, cookie=cookie_map)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m1 in re.finditer(ur'<div class="storeColumnOne">', html):
        sub, start, end = cm.extract_closure(html[m1.start():], ur'<div\b', ur'</div>')
        if end == 0:
            # No closed <div> block could be extracted for this match.
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m2 = re.search(ur'<div class="storename">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.name_e] = m2.group(1).strip()

        # "adddressline" (three d's) is the class name this pattern targets.
        addr_list = re.findall(ur'<div class="adddressline">([^<>]+)</div>', sub)
        entry[cm.addr_e] = ', '.join(addr_list)

        m2 = re.search(ur'<div class="citystatezip">([^<>]+)</div>', sub)
        if m2 is not None:
            tmp = cm.reformat_addr(m2.group(1))
            terms = re.split('[, ]+', tmp)
            if len(terms) < 3:
                # NOTE(review): this replaces the street address collected
                # above with the city/state line -- confirm that is intended.
                entry[cm.addr_e] = tmp
            else:
                # terms are treated as [city, state, zip].
                ret = gs.look_up(terms[0], 3)
                if ret is not None:
                    entry[cm.city_e] = ret['name_e']
                else:
                    entry[cm.city_e] = terms[0].strip().upper()

                ret = gs.look_up(terms[1], 2)
                if ret is not None:
                    entry[cm.province_e] = ret['name_e']
                else:
                    # Fall back to the raw state token, not the city token.
                    entry[cm.province_e] = terms[1].strip().upper()

                if re.match(r'\s*\d{5,}\s*', terms[2]) is not None:
                    entry[cm.zip_code] = terms[2].strip()

        m2 = re.search(ur'<div class="storephone">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.tel] = m2.group(1)

        cm.update_entry(entry, {'country_e': 'UNITED STATES', 'continent_e': 'NORTH AMERICA'})
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                          entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                          entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # Hand the collected entries back; the error paths already return a list.
    return store_list
Example #30
0
def fetch_stores(data):
    """Fetch the stores of one city from the store-finder form and persist them.

    Posts the continent/country/city selection in ``data`` to
    ``data['store_url']``, then turns every ``<div class="shop">`` block of
    the response into a store entry (name, store type, address, telephone,
    opening hours) which is inserted into the 'stores' table.

    :param data: dict providing 'store_url', 'continent', 'country', 'city',
        'brand_id', 'brandname_e' and 'brandname_c'.
    :return: list of the store entries found; an empty list when the request
        fails.
    """
    url = data['store_url']
    try:
        body = cm.post_data(
            url, {
                'continent': data['continent'],
                'country': data['country'],
                'city': data['city'],
                'send': 1,
                'page': 0
            })
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # Compiled once; both patterns are reused for every shop block.
    plus_pat = re.compile(ur'\s*\+\s*', re.S)
    li_pat = re.compile(ur'<li>\s*(.+?)\s*</li>')

    def plain_items(raw_items):
        # Convert each <li> body to plain text and drop the empty results.
        cleaned = [cm.html2plain(item).strip() for item in raw_items]
        return [item for item in cleaned if item != '']

    store_list = []
    for m in re.finditer(ur'<div class="shop">', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])

        # NOTE(review): the search starts after the matched opening tag, so
        # this picks up the first <div> that follows it -- confirm intended.
        sub, _, end = cm.extract_closure(body[m.end():], ur'<div\b',
                                         ur'</div>')
        if end == 0:
            continue

        m1 = re.search(ur'<h3>\s*(.+?)\s*</h3>', sub, re.S)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1)

        # The first <p> lists the store types joined by '+'; store them
        # comma-separated instead.
        m1 = re.search(ur'<p[^>]*>(.+?)</p>', sub, re.S)
        if m1 is not None:
            entry[cm.store_type] = plus_pat.sub(', ', m1.group(1).strip())

        addr_sub, _, addr_end = cm.extract_closure(sub, ur'<ul\b', ur'</ul>')
        if addr_end != 0:
            items = li_pat.findall(addr_sub)
            # With three or more items the last one is taken as the phone
            # number and excluded from the address.
            if len(items) >= 3:
                entry[cm.tel] = items[-1].strip()
                del items[-1]
            entry[cm.addr_e] = ', '.join(plain_items(items))

        hours_pos = sub.lower().find(ur'opening hours')
        if hours_pos != -1:
            # The first <ul> after the "opening hours" label holds the hours.
            opening_sub = cm.extract_closure(
                sub[hours_pos:], ur'<ul\b', ur'</ul>')[0]
            entry[cm.hours] = ', '.join(plain_items(li_pat.findall(opening_sub)))

        cm.update_entry(
            entry, {
                cm.continent_e: data['continent'].strip().upper(),
                cm.country_e: data['country'].strip().upper()
            })
        entry[cm.city_e] = cm.extract_city(data['city'])[0]

        gs.field_sense(entry)
        # Fill province/city from the address only where still empty.
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # Hand the collected entries back; the error path already returns a list.
    return store_list
Example #31
0
def fetch_stores(data):
    """Parse the per-province store tables of a single page and persist them.

    The page at ``data['url']`` lists provinces as tab links
    (``#fragment-N``); each tab's ``<tbody>`` holds one ``<tr>`` per store.
    A four-cell row starts with a city cell; other rows reuse the city of the
    previous row within the same province.  Every store is inserted into the
    'stores' table with country CHINA / continent ASIA.

    :param data: dict providing 'url', 'brand_id', 'brandname_e' and
        'brandname_c'.
    :return: list of the store entries found; an empty list when the page
        cannot be fetched.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    # (fragment id, upper-cased province name) for every province tab.
    tabs = [(fragment.strip(), name.strip().upper())
            for fragment, name in
            re.findall(ur'<li><a href="#(fragment-\d+)"><span>(.+?)</span></a></li>', html)]

    comment_pat = re.compile(ur'<!--.*?-->', re.S)
    store_list = []

    for fragment_id, province_name in tabs:
        pos = html.find('<div id="%s">' % fragment_id)
        if pos == -1:
            continue
        table_html, _, _ = cm.extract_closure(html[pos:], ur'<tbody>', ur'</tbody>')
        # Strip HTML comments before walking the rows.
        table_html = comment_pat.sub('', table_html)

        # The current city carries over to following rows without a city cell.
        city_c = city_e = ''
        while True:
            row_html, _, row_end = cm.extract_closure(table_html, ur'<tr>', ur'</tr>')
            if row_end == 0:
                break
            table_html = table_html[row_end:]

            # Skip the header row (city / store-name column titles).
            if u'城市' in row_html and u'店铺名称' in row_html:
                continue

            cells = re.findall(ur'<td.*?>(.+?)</td>', row_html)
            if len(cells) < 3:
                continue

            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

            first = 0
            if len(cells) == 4:
                # Leading cell names the city; the store cells follow it.
                city_c = cells[0].strip()
                city_info = gs.look_up(city_c, 3)
                if city_info is not None:
                    city_e = city_info['name_e']
                    city_c = city_info['name_c']
                first = 1

            entry[cm.name_c] = cm.html2plain(cells[first]).strip()
            entry[cm.tel] = cm.html2plain(cells[first + 1]).strip()
            entry[cm.addr_e] = cm.reformat_addr(cells[first + 2]).strip()
            entry[cm.country_e] = 'CHINA'
            entry[cm.continent_e] = 'ASIA'

            # Use the canonical province names when the tab label is known.
            province_c, province_e = province_name, ''
            province_info = gs.look_up(province_c, 2)
            if province_info is not None:
                province_c = province_info['name_c']
                province_e = province_info['name_e']
            cm.update_entry(entry, {cm.province_e: province_e, cm.province_c: province_c,
                                    cm.city_e: city_e, cm.city_c: city_c})
            entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

            gs.field_sense(entry)
            # Fill province/city from the address only where still empty.
            guess = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if guess[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = guess[1]
            if guess[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = guess[2]
            gs.field_sense(entry)

            print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                              entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                              entry[cm.continent_e])
            store_list.append(entry)
            db.insert_record(entry, 'stores')

    return store_list
Example #32
0
            break

        for m1 in re.findall(ur'<div class="store-tel">(.+?)</div>', sub_html, re.S):
            entry[common.tel] = common.extract_tel(m1)
            break

        for m1 in re.findall(ur'<div class="store-opening-hour">\s*?(?:Opening Hours:)?(.+?)</div>', sub_html,
                             re.S):
            entry[common.hours] = common.reformat_addr(m1)
            break

        m1 = re.findall(ur'href="/(.+?)" title="View on map"', sub_html)
        if len(m1) > 0:
            entry[common.url] = host + '/' + m1[0]
            lat, lng = get_coordinates(entry[common.url])
            common.update_entry(entry, {common.lat: lat, common.lng: lng})

        # geo
        city_e = cities[city_id]['name'].strip()
        country_e = cities[city_id]['country']['name'].strip().upper()
        continent_e = cities[city_id]['country']['continent'].strip().upper()
        common.update_entry(entry,
                            {common.city_e: common.extract_city(city_e)[0], common.country_e: country_e, common.continent_e: continent_e})
        gs.field_sense(entry)

        # ret = common.geo_translate(country_e.strip())
        # if len(ret) > 0:
        #     common.update_entry(entry, {common.continent_c: ret[common.continent_c],
        #                                 common.continent_e: ret[common.continent_e],
        #                                 common.country_c: ret[common.country_c],
        #                                 common.country_e: ret[common.country_e]})
Example #33
0
def field_sense(entry):
    """Normalise the geographic fields of a store entry in place.

    City, province, country and continent names are replaced by the names
    returned from ``look_up`` where a match exists, and an empty zip code is
    filled in from the address when a plausible candidate is found.

    Judging by the calls below, the second argument of ``look_up`` selects
    the level: 0 continent, 1 country, 2 province, 3 city.

    :param entry: store entry (dict-like, keyed by the ``cm`` field
        constants); modified in place.
    """
    # Geo
    country = entry[cm.country_e]
    city = entry[cm.city_e]
    ret = look_up(city, 3)
    ret1 = look_up(country, 1)
    if ret1 is not None:
        country = ret1['name_e']
    # Only trust the city match when it belongs to the entry's country.
    if ret is not None and ret['country']['name_e'] == country:
        entry[cm.city_e] = ret['name_e']
        entry[cm.city_c] = ret['name_c']

        # A matched city may carry its province; '' means none is recorded.
        prov = ret['province']
        if prov != '':
            ret1 = look_up(prov['name_e'], 2)
            if ret1 is not None:
                entry[cm.province_e] = ret1['name_e']
                entry[cm.province_c] = ret1['name_c']

    # Replace the province with the looked-up names when it is known.
    province = entry[cm.province_e]
    ret = look_up(province, 2)
    if ret is not None:
        entry[cm.province_e] = ret['name_e']
        entry[cm.province_c] = ret['name_c']

    ret = look_up(country, 1)
    if ret is not None:
        cm.update_entry(entry, {
            cm.country_e: ret['name_e'],
            cm.country_c: ret['name_c']
        })
        # NOTE(review): the continent lookup result is used without a None
        # check -- presumably every known country has a known continent.
        ret1 = look_up(ret['continent']['name_e'], 0)
        cm.update_entry(entry, {
            cm.continent_e: ret1['name_e'],
            cm.continent_c: ret1['name_c']
        })

        # Country-specific zip code patterns; the greedy '.*' makes each
        # pattern pick the last candidate in the address.
        if entry[cm.zip_code] == '':
            m = None
            if ret['name_e'] == look_up(u'CHINA', 1)['name_e']:
                # Chinese postal code: six digits
                m = re.match(ur'.*\b(\d{6})\b', entry[cm.addr_e])
            elif ret['name_e'] == look_up(u'UNITED STATES', 1)['name_e']:
                # US ZIP code: five digits
                m = re.match(ur'.*\b(\d{5})\b', entry[cm.addr_e])
            elif ret['name_e'] == look_up(u'JAPAN', 1)['name_e']:
                # Japanese postal code: NNN-NNNN
                m = re.match(ur'.*\b(\d{3}\-\d{4})\b', entry[cm.addr_e])
            if m is not None:
                entry[cm.zip_code] = m.group(1)

    cm.chn_check(entry)

    if entry[cm.zip_code] == '':
        # A number of five or more digits that sits next to a city or
        # state/province name is probably a zip code.
        m = re.match(ur'.*\s+(\d{5,})\b', entry[cm.addr_e])
        if m is not None:
            # Text following the number (skipping one separator character).
            tmp = entry[cm.addr_e][m.end() + 1:]
            terms = re.findall(ur'\b(\S+?)\b', tmp)
            if len(terms) > 0:
                # Accept when the next token is a known province or city.
                if look_up(terms[0], 2) is not None or look_up(terms[0],
                                                               3) is not None:
                    entry[cm.zip_code] = m.group(1)
            else:
                # Nothing follows the number: walk backwards instead. The
                # slice is the text before the number, reversed, so the
                # first token (reversed back) is the word preceding it.
                tmp = entry[cm.addr_e][m.end() - len(m.group(1)) - 1::-1]
                terms = re.findall(ur'\b(\S+?)\b', tmp)
                if len(terms) > 0:
                    if look_up(terms[0][::-1], 2) is not None or look_up(
                            terms[0][::-1], 3) is not None:
                        entry[cm.zip_code] = m.group(1)