Esempio n. 1
0
def fetch_uk(body, data):
    """Parse the ``fableft`` store listing in ``body`` and record every store.

    Each inner ``<div>`` of the listing is handled as one store whose ``<p>``
    elements are its address lines.  When ``cm.extract_tel`` returns a
    non-empty string for the last line, that line is stored as the telephone
    number and removed from the address.  Every store is written with
    ``db.insert_record`` and collected in the returned list.

    :param body: page markup containing ``<div class="fableft">``.
    :param data: dict read for ``name`` (used as the country name),
        ``brand_id``, ``brandname_e`` and ``brandname_c``.
    :return: list of store entries; empty when the listing cannot be located.
    """
    start = body.find(u'<div class="fableft">')
    if start == -1:
        print "Error in finding %s stores" % data["name"]
        return []
    body, start, end = cm.extract_closure(body[start:], ur"<div\b", ur"</div>")
    if end == 0:
        print "Error in finding %s stores" % data["name"]
        return []

    store_list = []
    for m in re.findall(ur"<div>\s*(.+?)\s*</div>", body, re.S):
        addr_list = re.findall(ur"<p>\s*(.+?)\s*</p>", m)
        if not addr_list:
            # No <p> lines: there is no address to record, and indexing the
            # empty list below would raise IndexError.
            continue

        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])
        entry[cm.country_e] = data["name"]

        tel = cm.extract_tel(addr_list[-1])
        if tel != "":
            entry[cm.tel] = tel
            del addr_list[-1]

        if data["name"] == "AUSTRALIA":
            country, province, city = gs.addr_sense(", ".join(addr_list), data["name"])
            if city is not None:
                entry[cm.city_e] = city
            if province is not None:
                entry[cm.province_e] = province
        else:
            # The remaining lines are read as "..., city, zip code".  Shorter
            # addresses only fill the fields they actually provide.
            if len(addr_list) >= 2:
                city = addr_list[-2].strip().upper()
                entry[cm.city_e] = city
                ret = gs.look_up(city, 3)
                if ret is not None and ret["country"]["name_e"] == gs.look_up("UK", 1)["name_e"]:
                    entry[cm.city_e] = ret["name_e"]
            if addr_list:
                entry[cm.zip_code] = addr_list[-1].strip().upper()
        entry[cm.addr_e] = ", ".join(addr_list)
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        # Fill still-empty province/city from the full address, then let
        # field_sense run again on the completed entry.
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == "":
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == "":
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print "(%s / %d) Found store: %s, %s (%s, %s, %s)" % (
            data["brandname_e"],
            data["brand_id"],
            entry[cm.name_e],
            entry[cm.addr_e],
            entry[cm.city_e],
            entry[cm.country_e],
            entry[cm.continent_e],
        )

        db.insert_record(entry, "stores")
        store_list.append(entry)

    # The error paths return a list, so the success path must as well.
    return store_list
Esempio n. 2
0
def fetch_uk(body, data):
    """Extract the stores listed inside ``<div class="fableft">`` of ``body``.

    One store is created per inner ``<div>``; its ``<p>`` elements are the
    address lines.  If ``cm.extract_tel`` yields a non-empty value for the
    last line, that line becomes the telephone number and is dropped from
    the address.  Each store is saved through ``db.insert_record``.

    :param body: page markup to scan.
    :param data: dict read for ``name`` (used as the country name),
        ``brand_id``, ``brandname_e`` and ``brandname_c``.
    :return: list of store entries; empty when the listing cannot be located.
    """
    start = body.find(u'<div class="fableft">')
    if start == -1:
        print 'Error in finding %s stores' % data['name']
        return []
    body, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')
    if end == 0:
        print 'Error in finding %s stores' % data['name']
        return []

    store_list = []
    for m in re.findall(ur'<div>\s*(.+?)\s*</div>', body, re.S):
        addr_list = re.findall(ur'<p>\s*(.+?)\s*</p>', m)
        if not addr_list:
            # Skip blocks without address lines instead of failing on
            # addr_list[-1] with an IndexError.
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['name']

        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]

        if data['name'] == 'AUSTRALIA':
            country, province, city = gs.addr_sense(', '.join(addr_list), data['name'])
            if city is not None:
                entry[cm.city_e] = city
            if province is not None:
                entry[cm.province_e] = province
        else:
            # Expected layout is "..., city, zip code"; guard both indexes so
            # a one-line (or now empty) address does not raise IndexError.
            if len(addr_list) >= 2:
                city = addr_list[-2].strip().upper()
                entry[cm.city_e] = city
                ret = gs.look_up(city, 3)
                if ret is not None and ret['country']['name_e'] == gs.look_up('UK', 1)['name_e']:
                    entry[cm.city_e] = ret['name_e']
            if addr_list:
                entry[cm.zip_code] = addr_list[-1].strip().upper()
        entry[cm.addr_e] = ', '.join(addr_list)
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        # Only fields that are still empty are filled from addr_sense.
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'],
                                                              entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e],
                                                              entry[cm.country_e], entry[cm.continent_e])

        db.insert_record(entry, 'stores')
        store_list.append(entry)

    # Return the collected stores, matching the list returned on error paths.
    return store_list
Esempio n. 3
0
def fetch_stores(data):
    url = data['home_url']
    try:
        body = cm.post_data(url, {'lz_sf': data['province'], 'lz_sx': data['city']})
    except Exception:
        cm.dump('Error in fetching stores: %s, %s, %s' % (url, data['province'], data['city']),
                'samsonite_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = body.find(u'搜索结果')
    if start == -1:
        cm.dump('Error in fetching stores: %s, %s, %s' % (url, data['province'], data['city']),
                'samsonite_log.txt')
        return []

    body = body[start + 4:]

    store_list = []
    for m in re.findall(ur'</script>\s*(\S+)\s*</span>', body, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.name_e] = m.strip()
        entry[cm.addr_e] = m.strip()
        entry[cm.city_c] = data['city']
        ret = gs.look_up(data['city'], 3)
        if ret is not None:
            entry[cm.city_e] = cm.extract_city(ret['name_e'])[0]
            if ret['province'] != '':
                entry[cm.province_e] = ret['province']['name_e']
        entry[cm.province_c] = data['province']
        ret = gs.look_up(data['province'], 2)
        if ret is not None:
            entry[cm.province_e] = ret['name_e']
        entry[cm.country_e] = u'CHINA'

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), 'benetton_log.txt', False)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 4
0
def fetch_countries(data):
    """Fetch the countries of one continent and register their ISO3 codes.

    For every country in the response a copy of ``data`` is extended with
    ``country_id`` and ``country_code``.  When ``gs.look_up`` knows the ISO2
    code, the ISO3 code from the response is added to ``gs.country_map``
    (both to the country record and to the lookup table).

    :param data: dict read for ``url`` and ``continent_id``.
    :return: list of extended copies of ``data``; empty when the request
        fails or the response does not have the expected JSON structure.
    """
    url = data['url']
    param = {
        'action': 'getCountriesByContinent',
        'idContinent': data['continent_id'],
        'filter': 'clothing;lacoste%20l!ve'
    }
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []

    # A malformed or unexpected response is handled like a failed request
    # instead of propagating ValueError/KeyError/TypeError to the caller.
    try:
        raw = json.loads(body)['root']['DATA']['countries']
    except (ValueError, KeyError, TypeError):
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []

    results = []
    for c in raw:
        d = data.copy()
        code = c['country']['iso2']
        d['country_id'] = c['country']['id']
        d['country_code'] = code
        ret = gs.look_up(code, 1)
        if ret is not None:
            # Make the ISO3 code usable as a lookup key for the same country.
            uid = gs.country_map['lookup'][code]
            gs.country_map['data'][uid]['iso3'] = c['country']['iso3']
            gs.country_map['lookup'][c['country']['iso3']] = uid
        results.append(d)
    return results
Esempio n. 5
0
def fetch_countries(data):
    """Read the country ``<select>`` of the page at ``data['url']``.

    Every ``<option>`` with a two-letter upper-case value yields a copy of
    ``data`` carrying ``country_code`` and the country/continent names.  The
    names come from ``gs.look_up`` when the code is known; otherwise only the
    option text is kept as ``cm.country_c``.

    :param data: dict read for ``url`` and ``brand_id``.
    :return: list of extended copies of ``data``; empty when the page cannot
        be fetched or the select element is not found.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = html.find('<select name="country" id="inp-country"')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select>')
    if end == 0:
        return []
    country_list = []
    for m in re.findall(ur'<option value="([A-Z]{2})">(.*?)</option>', sub):
        d = data.copy()
        d['country_code'] = m[0]
        d[cm.country_c] = m[1].strip()
        # Default the remaining name fields so they exist for unknown codes.
        for key in [cm.country_e, cm.continent_e, cm.continent_c]:
            d[key] = ''
        ret = gs.look_up(d['country_code'], 1)
        if ret is not None:
            d[cm.country_e] = ret['name_e']
            d[cm.country_c] = ret['name_c']
            d[cm.continent_c] = ret['continent']['name_c']
            d[cm.continent_e] = ret['continent']['name_e']

        country_list.append(d)

    # Return the collected countries, matching the list returned on error paths.
    return country_list
Esempio n. 6
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<item id="\d+">', body):
        sub = cm.extract_closure(body[m.start():], ur'<item\b', ur'</item>')[0]
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m1 = re.search(ur'<country>([^<>]+)</country>', sub)
        if m1 is not None:
            tmp = m1.group(1).split('/')
            for v in tmp:
                ret = gs.look_up(v.strip().upper(), 1)
                if ret is not None:
                    entry[cm.country_e] = ret['name_e']
                    break
        m1 = re.search(ur'<city>([^<>]+)</city>', sub)
        if m1 is not None:
            val = cm.reformat_addr(m1.group(1))
            if entry[cm.country_e] == 'UNITED STATES':
                tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(val).strip(','))
                if len(tmp_list) == 2:
                    if re.search('[A-Z]{2}', tmp_list[1]):
                        entry[cm.province_e] = tmp_list[1]
            entry[cm.city_e] = cm.extract_city(m1.group(1))[0]
        m1 = re.search(ur'<brands>([^<>]+)</brands>', sub)
        if m1 is not None:
            tmp = m1.group(1).split('/')
            brand_list = []
            for v in tmp:
                if v.strip() != '':
                    brand_list.append(v)
            entry[cm.store_type] = ', '.join(brand_map[key] for key in brand_list)
        m1 = re.search(ur'<name>([^<>]+)</name>', sub)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1).strip()
        m1 = re.search(ur'<address>([^<>]+)</address>', sub)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'<tel>([^<>]+)</tel>', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).strip()
        m1 = re.search(ur'sll=(-?\d+\.\d+),(-?\d+\.\d+)', sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None:
            entry[cm.province_e] = ret[1]
            gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 7
0
def fetch_countries(data):
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # 处理重定向
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)

    m = re.search('<span class="country">Choose a country</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        d = data.copy()
        country_e = cm.html2plain(m[1]).strip().upper()
        ret = gs.look_up(country_e, 1)
        if ret is not None:
            country_e=ret['name_e']
        d['country_e'] = country_e
        d['province_e'] = ''
        d['url'] = data['host'] + m[0]
        country_list.append(d)
    return country_list
Esempio n. 8
0
def fetch_stores(data):
    body = data['body']
    start = body.find(u'<ul class="storelist storelist_%s' % data['code'])
    if start == -1:
        cm.dump('Error in finding stores for %s' % data['code'])
        return []
    body = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0]

    store_list = []
    for m in re.findall(ur'<li class="sitem">(.+?)</li>', body, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m1 = re.search(ur'<h3>(.+?)</h3>', m)
        if m1 is not None:
            entry[cm.name_c] = m1.group(1).strip()
        m1 = re.search(ur'<div class="addr">(.+?)</div>', m)
        if m1 is not None:
            entry[cm.addr_e] = m1.group(1).replace(u'地址:', '').replace(u'地址:', '').strip()
        m1 = re.search(ur'<div class="tel">(.+?)</div>', m)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).replace(u'电话:', '').replace(u'电话:', '').strip()
        entry[cm.city_c] = data['city']
        ret = gs.look_up(data['city'], 3)
        if ret is not None:
            entry[cm.city_e] = ret['name_e']
            entry[cm.city_c] = ret['name_c']
            if ret['province'] != '':
                entry[cm.province_e] = ret['province']['name_e']
        entry[cm.country_e] = u'CHINA'
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), 'canali_log.txt')
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 9
0
def fetch_countries(data):
    """List the countries offered as ``<option>`` values on the home page.

    The page is requested with fixed ``brand``/``countryISO`` parameters.
    Every option with a two-letter upper-case value yields a copy of ``data``
    with ``country_code`` and ``country_e``.  ``country_e`` is the name from
    ``gs.look_up`` when there is one, otherwise the upper-cased option text.

    :param data: dict read for ``home_url`` and ``brand_id``.
    :return: list of extended copies of ``data``; empty when the request
        fails.
    """
    url = data['home_url']
    try:
        html = cm.get_data(url, {'brand': 'oasis', 'countryISO': 'GB'})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    country_list = []
    for m in re.findall(ur'<option value="([A-Z]{2})">(.+?)</option>', html):
        d = data.copy()
        d['country_code'] = m[0]
        country = m[1].strip().upper()
        ret = gs.look_up(country, 1)
        if ret is not None:
            country = ret['name_e']
        d['country_e'] = country
        country_list.append(d)

    # Return the collected countries, matching the list returned on error paths.
    return country_list
Esempio n. 10
0
def fetch_stores(data):
    """
    Fetch the store information for one search term.

    The JSON response of ``data['url']`` is expected to carry a ``shops``
    list.  Each shop becomes one store entry, is written with
    ``db.insert_record`` and is returned exactly once.

    :param data: dict read for ``url``, ``key_term``, ``country_e``,
        ``brand_id``, ``brandname_e`` and ``brandname_c``.
    :return: list of store entries; empty when the request or the JSON
        decoding fails.
    """
    url = data['url']
    try:
        info = json.loads(cm.get_data(url, {'tskay': data['key_term']}))
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    raw_list = info['shops']
    store_list = []
    for s in raw_list:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.city_e] = s['city'].strip().upper()
        entry[cm.country_e] = data['country_e'].strip().upper()
        entry[cm.name_e] = s['name'].strip()
        addr = s['address']
        entry[cm.addr_e] = addr

        # Only look for a zip code when the last address term names the city.
        terms = addr.split(',')
        if len(terms) > 1 and entry[cm.city_e] in terms[-1].strip().upper():
            country = entry['country_e']
            tmp = gs.look_up(country, 1)
            if tmp is not None:
                country = tmp['name_e']
            if country == 'JAPAN':
                # Japanese postal code.
                # NOTE(review): "[ -\.]" is a character range from space to
                # '.', not the three characters space, '-' and '.'; confirm
                # that this is intended.
                m = re.search(ur'\d{3,}[ -\.]+?\d{3,}', terms[-1])
                if m is not None:
                    entry[cm.zip_code] = m.group(0)
            else:
                m = re.search(ur'\d{4,}', terms[-1])
                if m is not None:
                    entry[cm.zip_code] = m.group(0)

        entry[cm.tel] = s['tel']
        entry[cm.fax] = s['fax']
        entry[cm.email] = s['email']
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                          entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                          entry[cm.continent_e])
        db.insert_record(entry, 'stores')
        # Append once; the entry used to be appended both before and after
        # the insert, which duplicated every store in the result.
        store_list.append(entry)

    return store_list
Esempio n. 11
0
def fetch_stores(data):
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    code = data['country_code']
    if gs.look_up(code, 1) is None:
        entry[cm.country_e] = cm.html2plain(data['country']).strip().upper()
    else:
        entry[cm.country_e] = code
    entry[cm.name_e] = data['store_name']
    entry[cm.city_e] = cm.extract_city(data['city'])[0]
    entry[cm.lat] = data['lat'] if data['lat'] is not None else ''
    entry[cm.lng] = data['lng'] if data['lng'] is not None else ''

    m = re.search(ur'data-boutique\s*=\s*"%s"' % data['store_id'], data['content'])
    sub = data['content'][m.end():]

    m1 = re.search(ur'<li class="isDistributeur[^<>]+>(.+?)</li>', sub)
    if m1 is not None:
        entry[cm.store_class] = cm.reformat_addr(m1.group(1))

    m1 = re.search(ur'<li class="place-title[^<>]+>(.+?)</li>', sub, re.S)
    if m1 is not None:
        entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

    m1 = re.search(ur'<li class="contacts[^<>]+>(.+?)</li>', sub, re.S)
    if m1 is not None:
        m2 = re.search(ur'<a class="popupLaunch" href="([^"]+)"', m1.group(1))
        if m2:
            entry = fetch_details(data, m2.group(1), entry)

        m2 = re.search(ur'<p>(.+?)</p>', m1.group(1), re.S)
        if m2:
            ct_list = tuple(tmp.strip() for tmp in cm.reformat_addr(m2.group(1)).split(','))
            entry[cm.tel] = cm.extract_tel(ct_list[0])
            if len(ct_list) > 1:
                entry[cm.email] = ct_list[1].strip()

    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e]), log_name)
    db.insert_record(entry, 'stores')

    return tuple(entry)
Esempio n. 12
0
def fetch_cities(data):
    if data['province_e'] != '':
        print '(%s/%d) Found province: %s' % (
            data['brandname_e'], data['brand_id'], data['province_e'])

    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # 处理重定向
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)

    m = re.search('<span class="city">Choose a city</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    city_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        city_e = cm.html2plain(m[1]).strip().upper()
        if data['country_e'] == 'CHINA':
            # 去掉省中间的空格
            city_e = city_e.replace(' ', '')
        ret = gs.look_up(city_e, 3)
        if ret is not None:
            city_e = ret['name_e']

        d = data.copy()
        d['city_e'] = city_e
        d['url'] = data['host'] + m[0]
        city_list.append(d)

    return city_list
Esempio n. 13
0
def fetch_store_details(data):
    """Fetch one store page and record the store it describes.

    Address, phone, opening hours and store type are read from the labelled
    ``<span class="type">`` sections.  Coordinates come from a
    ``google.maps.LatLng(...)`` call in the page.  The entry is written with
    ``db.insert_record``.

    :param data: dict read for ``url``, ``country``, ``brand_id``,
        ``brandname_e`` and ``brandname_c``.
    :return: one-element list holding the store entry; empty list when the
        page cannot be fetched.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    ret = gs.look_up(data['country'], 1)
    if ret is not None:
        entry[cm.country_e] = ret['name_e']
    m = re.search(ur'<span class="type">Address</span>\s*<p>(.+?)</p>', body, re.S)
    if m is not None:
        addr = cm.reformat_addr(m.group(1))
        country, province, city = gs.addr_sense(addr)
        # The country from the lookup above wins over the sensed one.
        if country is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = country
        if province is not None:
            entry[cm.province_e] = province
        if city is not None:
            entry[cm.city_e] = city
        entry[cm.addr_e] = addr

    m = re.search(ur'<span class="type">Phone</span>\s*<p>(.+?)</p>', body, re.S)
    if m is not None:
        entry[cm.tel] = m.group(1)

    m = re.search(ur'<span class="type">Opening hours</span>\s*<p>(.+?)</p>', body, re.S)
    if m is not None:
        entry[cm.hours] = cm.reformat_addr(m.group(1))

    m = re.search(ur'<span class="type">You can find</span>\s*<p>(.+?)</p>', body, re.S)
    if m is not None:
        entry[cm.store_type] = cm.reformat_addr(m.group(1))

    # Coordinates are optional.  A page without the map call used to raise
    # AttributeError on m.group, which lost the whole store.
    m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', body, re.S)
    if m is not None:
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))

    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                      entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                      entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
Esempio n. 14
0
def fetch_cities(data):
    """Expand ``data`` into one dict per city of its country.

    The country is resolved with ``gs.look_up`` and its cities are taken
    from ``gen_city_map()``.

    :param data: dict read for ``country``.
    :return: list of copies of ``data`` with ``country``, ``city``,
        ``city_lat`` and ``city_lng`` set; empty when the country is unknown
        to ``gs.look_up`` or has no cities in the map.
    """
    found = gs.look_up(data['country'], 1)
    if found is None:
        return []

    country_name = found['name_e']
    all_cities = gen_city_map()
    if country_name not in all_cities:
        return []

    expanded = []
    for city_name in all_cities[country_name]:
        coords = all_cities[country_name][city_name]
        item = data.copy()
        item['country'] = country_name
        item['city'] = city_name
        item['city_lat'] = coords['lat']
        item['city_lng'] = coords['lng']
        expanded.append(item)
    return expanded
Esempio n. 15
0
def fetch_cities(data):
    """Expand ``data`` into one dict per city listed for its country.

    The country is resolved with ``gs.look_up`` and its cities are read from
    ``data['city_map']``.

    :param data: dict read for ``country`` and ``city_map``.
    :return: tuple of copies of ``data`` with ``country``, ``city``,
        ``city_lat`` and ``city_lng`` set; empty when the country is unknown
        to ``gs.look_up`` or absent from the map.
    """
    match = gs.look_up(data['country'], 1)
    if match is None:
        return ()

    name = match['name_e']
    lookup = data['city_map']
    collected = []
    if name in lookup:
        for town in lookup[name]:
            clone = data.copy()
            clone.update({
                'country': name,
                'city': town,
                'city_lat': lookup[name][town]['lat'],
                'city_lng': lookup[name][town]['lng'],
            })
            collected.append(clone)
    return tuple(collected)
Esempio n. 16
0
def fetch_cities(data):
    """Expand ``data`` into one dict per city listed for its country code.

    The country is resolved from ``data["country_code"]`` with
    ``gs.look_up`` and its cities are read from ``data["city_map"]``.

    :param data: dict read for ``country_code`` and ``city_map``.
    :return: tuple of copies of ``data`` with ``country``, ``city``,
        ``city_lat`` and ``city_lng`` set; empty when the code is unknown to
        ``gs.look_up`` or the country is absent from the map.
    """
    resolved = gs.look_up(data["country_code"], 1)
    if resolved is None:
        return ()

    country_name = resolved["name_e"]
    cities_by_country = data["city_map"]
    if country_name not in cities_by_country:
        return tuple([])

    out = []
    for city_name in cities_by_country[country_name]:
        position = cities_by_country[country_name][city_name]
        record = data.copy()
        record["country"] = country_name
        record["city"] = city_name
        record["city_lat"] = position["lat"]
        record["city_lng"] = position["lng"]
        out.append(record)
    return tuple(out)
Esempio n. 17
0
def fetch_countries(data):
    """List the countries offered as ``<option>`` values on the home page.

    The page is requested with fixed ``brand``/``countryISO`` parameters.
    Each option with a two-letter upper-case value yields a copy of ``data``
    with ``country_code`` and ``country_e`` (the ``gs.look_up`` name when
    available, otherwise the upper-cased option text).

    :param data: dict read for ``home_url`` and ``brand_id``.
    :return: list of extended copies of ``data``; empty when the request
        fails.
    """
    url = data['home_url']
    try:
        html = cm.get_data(url, {'brand': 'oasis', 'countryISO': 'GB'})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    country_list = []
    for m in re.findall(ur'<option value="([A-Z]{2})">(.+?)</option>', html):
        d = data.copy()
        d['country_code'] = m[0]
        country = m[1].strip().upper()
        ret = gs.look_up(country, 1)
        if ret is not None:
            country = ret['name_e']
        d['country_e'] = country
        country_list.append(d)

    # Return the collected countries, matching the list returned on error paths.
    return country_list
Esempio n. 18
0
def fetch_stores(data):
    body = data['body']
    start = body.find(u'<ul class="storelist storelist_%s' % data['code'])
    if start == -1:
        cm.dump('Error in finding stores for %s' % data['code'])
        return []
    body = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0]

    store_list = []
    for m in re.findall(ur'<li class="sitem">(.+?)</li>', body, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        m1 = re.search(ur'<h3>(.+?)</h3>', m)
        if m1 is not None:
            entry[cm.name_c] = m1.group(1).strip()
        m1 = re.search(ur'<div class="addr">(.+?)</div>', m)
        if m1 is not None:
            entry[cm.addr_e] = m1.group(1).replace(u'地址:',
                                                   '').replace(u'地址:',
                                                               '').strip()
        m1 = re.search(ur'<div class="tel">(.+?)</div>', m)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).replace(u'电话:',
                                                '').replace(u'电话:',
                                                            '').strip()
        entry[cm.city_c] = data['city']
        ret = gs.look_up(data['city'], 3)
        if ret is not None:
            entry[cm.city_e] = ret['name_e']
            entry[cm.city_c] = ret['name_c']
            if ret['province'] != '':
                entry[cm.province_e] = ret['province']['name_e']
        entry[cm.country_e] = u'CHINA'
        gs.field_sense(entry)
        cm.dump(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
            'canali_log.txt')
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 19
0
def get_countries(data):
    """
    Return the list of countries.

    NOTE(review): the visible body only builds ``state_list`` and contains
    no ``return`` statement; the remainder of the function is presumably
    outside this excerpt -- confirm against the full source.

    :rtype : [{'country_code':**, 'country':**}, ...]
    :param data: dict read for ``url``.
    :return:
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        # NOTE(review): brand_id is a bare name here, not data['brand_id'];
        # it has to come from an enclosing scope -- confirm it is defined.
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    # Cut the page into segments, each starting at a "Choose a country"
    # placeholder option.  The trailing -1 makes the last segment stop one
    # character before the end of html.
    pat = '<option value="0">Choose a country</option>'
    splits = [m.start() for m in re.finditer(pat, html)]
    splits.append(-1)
    sub_html = []
    for i in xrange(len(splits) - 1):
        sub_html.append(html[splits[i]:splits[i + 1]])

    # 1: state information
    # s_map = [{'state_code':m[0], 'state':m[1].strip}
    state_list = []
    # sub_html[0] raises IndexError when the placeholder option is absent.
    for m in re.findall(ur'<option value="(.+?)"\s*?>(.+?)</option>', sub_html[0][len(pat):]):
        code = m[0].strip().upper()
        state = m[1].strip().upper()
        ret = gs.look_up(state, 2)
        if ret is not None:
            # state_list.append({'state': ret[0]['province_e'], 'state_code': ret[0]['state_code']})
            state_list.append({'state': ret['name_e'], 'state_code': ret['code']})
        else:
            # The state is actually written as a code: search the province
            # records for one whose code equals the option value.
            for key in gs.province_map['data']:
                state = gs.province_map['data'][key]
                if state['code'] == code:
                    state = state['name_e']
                    state_list.append({'state': state, 'state_code': code})
                    break
Esempio n. 20
0
def fetch_cities(data):
    url = data['post_city']
    try:
        html = cm.post_data(url, {'country': data['country_id']})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    city_list = []
    for item in pq(html)('#cities option[value!="0"]'):
        d = data.copy()
        city_e = cm.html2plain(item.text).strip().upper()
        ret = gs.look_up(city_e, 3)
        if ret is not None:
            city_e = ret['name_e']
        d['city_e'] = city_e
        city_list.append(d)

    return city_list
Esempio n. 21
0
def fetch_states(data):
    print '(%s/%d) Found country: %s' % (data['brandname_e'], data['brand_id'], data['country_e'])
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # 处理重定向
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)

    m = re.search('<span class="state">Choose a state/provence</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    state_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        province_e = cm.html2plain(m[1]).strip().upper()
        if data['country_e'] == 'CHINA':
            # 去掉省中间的空格
            province_e = province_e.replace(' ', '')
        ret = gs.look_up(province_e, 2)
        if ret is not None:
            province_e=ret['name_e']
        d = data.copy()
        d['province_e'] = province_e
        d['url'] = data['host'] + m[0]
        state_list.append(d)

    return state_list
Esempio n. 22
0
def fetch_countries(data):
    """Read the ``inp-country`` ``<select>`` of the page at ``data['url']``.

    Each ``<option>`` with a two-letter upper-case value yields a copy of
    ``data`` carrying ``country_code`` plus country and continent names.
    The names come from ``gs.look_up`` when the code is known; otherwise
    only the option text is kept as ``cm.country_c``.

    :param data: dict read for ``url`` and ``brand_id``.
    :return: list of extended copies of ``data``; empty when the page cannot
        be fetched or the select element is not found.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    start = html.find('<select name="country" id="inp-country"')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<select\b',
                                         ur'</select>')
    if end == 0:
        return []
    country_list = []
    for m in re.findall(ur'<option value="([A-Z]{2})">(.*?)</option>', sub):
        d = data.copy()
        d['country_code'] = m[0]
        d[cm.country_c] = m[1].strip()
        # Default the remaining name fields so they exist for unknown codes.
        for key in [cm.country_e, cm.continent_e, cm.continent_c]:
            d[key] = ''
        ret = gs.look_up(d['country_code'], 1)
        if ret is not None:
            d[cm.country_e] = ret['name_e']
            d[cm.country_c] = ret['name_c']
            d[cm.continent_c] = ret['continent']['name_c']
            d[cm.continent_e] = ret['continent']['name_e']

        country_list.append(d)

    # Return the collected countries, matching the list returned on error paths.
    return country_list
Esempio n. 23
0
def fetch_countries(data):
    """
    Read the country choices from the '#country' drop-down of the home page.

    :param data: dict with at least 'home_url' and 'brand_id'.
    :return: one copy of ``data`` per option (the "reset" option excluded),
        extended with 'country_id' and 'country_e'; [] if the page cannot
        be fetched.
    """
    url = data['home_url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    def describe(option):
        # One <option> element -> one country record.
        info = data.copy()
        info['country_id'] = string.atoi(option.attrib['value'])

        name = cm.html2plain(option.text).strip().upper()
        hit = gs.look_up(name, 1)
        # Prefer the name known to the geo lookup; otherwise keep the
        # upper-cased label from the page.
        info['country_e'] = name if hit is None else hit['name_e']
        return info

    return [describe(option) for option in pq(html)('#country option[value!="reset"]')]
Esempio n. 24
0
def fetch_countries(data):
    """
    Ask the locator service for the countries of one continent.

    As a side effect, every country already known to ``gs`` gets its ISO3
    code recorded in ``gs.country_map`` (both in the data record and as an
    additional lookup key).

    :param data: dict with at least 'url' and 'continent_id'.
    :return: one copy of ``data`` per country, extended with 'country_id'
        and 'country_code'; [] if the request fails.
    """
    url = data['url']
    param = {'action': 'getCountriesByContinent', 'idContinent': data['continent_id'],
             'filter': 'clothing;lacoste%20l!ve'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []

    results = []
    for record in json.loads(body)['root']['DATA']['countries']:
        country = record['country']
        iso2 = country['iso2']

        info = data.copy()
        info['country_id'] = country['id']
        info['country_code'] = iso2

        if gs.look_up(iso2, 1) is not None:
            # Register the ISO3 code as an alias of the same country uid.
            lookup = gs.country_map['lookup']
            uid = lookup[iso2]
            iso3 = country['iso3']
            gs.country_map['data'][uid]['iso3'] = iso3
            lookup[iso3] = uid

        results.append(info)
    return results
Esempio n. 25
0
def fetch(level=1, data=None, host='localhost', port=3306, user='******', passwd='123456'):
    tot = 0
    start = 0
    store_list = []
    data = {'q': '*:*', 'pt': '0,0', 'd': 100000, 'start': 0, 'rows': 100}
    # data = {'q': '*:*', 'pt': '36.778261,-119.417932', 'd': 50, 'start': 0, 'rows': 100}

    db = cm.StoresDb()
    db.connect_db(host=host, port=port, user=user, passwd=passwd, db='brand_stores')
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

    while True:
        cm.dump('Fetching from %d' % start, 'triumph_log.txt')
        try:
            data['start'] = start
            html = cm.get_data(url, data)
            raw_list = json.loads(html)
            if tot == 0:
                tot = raw_list['response']['numFound']
                cm.dump('Found: %d' % tot, 'triumph_log.txt')
            raw_list = raw_list['response']['docs']
        except Exception:
            cm.dump('Error occured while fetching from %d' % data['start'], 'triumph_log.txt')
            dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
            cm.dump(dump_data)
            return []

        idx = 0
        if len(raw_list) < data['rows'] and start + len(raw_list) < tot:
            cm.dump('Cooling down...', 'triumph_log.txt')
            time.sleep(5)
            continue

        for v in raw_list:
            entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
            cm.update_entry(entry, {cm.store_type: v['class'],
                                    cm.zip_code: v['zip'], cm.tel: v['phone'], cm.fax: v['fax'],
                                    cm.url: v['web'], cm.email: v['email'], cm.hours: v['opening_hours']})
            entry[cm.name_e] = cm.reformat_addr(v['name'])

            entry[cm.city_e], tmp = cm.extract_city(v['city'])
            if not re.search(ur'\d', entry[cm.zip_code]) and tmp != '':
                entry[cm.zip_code] = tmp

            if v['location'] != '':
                terms = v['location'].split(',')
                cm.update_entry(entry, {cm.lat: string.atof(terms[0]), cm.lng: string.atof(terms[1])})
            addr = v['address']
            if v['address2'] != '':
                addr += ', ' + v['address2']
            entry[cm.addr_e] = cm.reformat_addr(addr)
            ret = gs.look_up(v['country'], 1)
            if ret is not None:
                entry[cm.country_e] = ret['name_e']
            else:
                cm.dump('Error in looking up country %s' % v['country'], 'triumph_log.txt')
            gs.field_sense(entry)

            cm.dump('(%s / %d) Found store at %d: %s, %s (%s, %s, %s)' % (
                brandname_e, brand_id, start + idx, entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e],
                entry[cm.country_e],
                entry[cm.continent_e]), 'triumph_log.txt')
            store_list.append(entry)
            db.insert_record(entry, 'stores')
            idx += 1

        if tot - start <= len(raw_list):
            break
        else:
            start += len(raw_list)
Esempio n. 26
0
def fetch_stores(data):
    """
    Fetch the store page and turn every <li> of its second <ul> into a store.

    Each store is inserted into the 'stores' table through the module-level
    ``db`` and collected in the returned list.

    :param data: dict with at least 'url'.
    :return: list of store entries; [] if the page cannot be fetched or has
        fewer than two <ul> elements.
    """
    url = data['url']
    try:
        html = common.get_data(data['url'])
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': common.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        common.dump(dump_data)
        return []

    # The stores live in the second <ul>...</ul> of the page.
    start = 0
    for i in xrange(2):
        start = html.find('<ul>', start)
        if start == -1:
            return []
        start += len('<ul>')
    end = html.find('</ul>', start)
    html = html[start:end]

    store_list = []
    for m in re.findall(ur'<li>(.+?)</li>', html, re.S):
        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        entry[common.store_type] = 'FASHION'
        m1 = re.findall(ur'<h2>(.+?)</h2>', m)
        if len(m1) > 0:
            entry[common.name_e] = common.reformat_addr(m1[0])

        # Google Maps link
        m1 = re.findall(ur'href="(https://maps.google.com/maps[^\s]+?)"', m)
        if len(m1) > 0:
            entry[common.url] = m1[0]

        addr = common.reformat_addr('\n\r'.join(re.findall(ur'<p>(.+?)</p>', m)))
        entry[common.addr_e] = addr
        terms = addr.split(',')

        # Stays False when none of the geo lookups below hits.
        hit_flag = False

        # Is the last term a country?
        ret = gs.look_up(terms[-1], 1)
        if ret is not None:
            entry[common.country_e] = ret['name_e']
            terms = terms[:-1]
            hit_flag = True

        # Look for province and city.  ``terms`` may be empty by now (an
        # address consisting of the country only), which used to raise
        # IndexError.
        zip_cdt = ''
        if terms:
            # Take the last complete run of 5+ digits as zip candidate; the
            # former greedy '.*(\d{5,})' kept only the final five digits of
            # longer postal codes.
            zip_runs = re.findall(ur'\d{5,}', terms[-1])
            if zip_runs:
                zip_cdt = zip_runs[-1]
            # Look the term up without the zip code.  This normalised value
            # used to be computed and then ignored in favour of the raw term.
            province = re.sub(ur'\d{5,}', '', terms[-1]).strip().upper()
            ret = gs.look_up(province, 2)
            if ret is not None:
                entry[common.province_e] = ret['name_e']
                entry[common.zip_code] = zip_cdt
                terms = terms[:-1]
                hit_flag = True

        if terms:
            ret = gs.look_up(terms[-1], 3)
            if ret is not None:
                entry[common.city_e] = ret['name_e']
                entry[common.zip_code] = zip_cdt
                hit_flag = True

        if not hit_flag:
            # Nothing was recognised: log the address.
            common.write_log('Failed in geosensing: %s' % addr)

        gs.field_sense(entry)

        print '%s Found store: %s, %s (%s, %s)' % (
            brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.country_e],
            entry[common.continent_e])
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    # Previously the function ended here and returned None on success.
    return store_list
Esempio n. 27
0
            continue
        sub1 = cm.extract_closure(sub[start:], ur'<div\b', ur'</div>')[0]

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.store_type] = store_type

        m1 = re.search(ur'<p class="store-item-name">(.+?)</p>', sub1, re.S)
        if m1 is not None:
            entry[cm.name_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'<p class="store-item-adress">(.+?)</p>', sub1, re.S)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

        entry[cm.tel] = cm.extract_tel(sub1)
        ret = gs.look_up(data['country_code'], 1)
        if ret is not None:
            entry[cm.country_e] = ret['name_e']
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None:
            entry[cm.province_e] = ret[1]
        if ret[2] is not None:
            entry[cm.city_e] = ret[2]
        else:
            entry[cm.city_e] = data['city'].strip().upper()

        if entry[cm.name_e] in latlng_map:
            tmp = latlng_map[entry[cm.name_e]]
            entry[cm.lat] = tmp['lat']
            entry[cm.lng] = tmp['lng']
Esempio n. 28
0
def fetch_stores(data):
    """
    Fetch the shops matching one search term and store them.

    Each shop is inserted into the 'stores' table through the module-level
    ``db`` and collected in the returned list.

    :param data: dict with at least 'url', 'key_term', 'country_e',
        'brand_id', 'brandname_e' and 'brandname_c'.
    :return: list of store entries, one per shop; [] if the request or the
        JSON decoding fails.
    """
    url = data['url']
    try:
        info = json.loads(cm.get_data(url, {'tskay': data['key_term']}))
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    raw_list = info['shops']
    store_list = []
    for s in raw_list:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.city_e] = s['city'].strip().upper()
        entry[cm.country_e] = data['country_e'].strip().upper()
        entry[cm.name_e] = s['name'].strip()
        addr = s['address']
        entry[cm.addr_e] = addr

        # Only look for a zip code when the last address term mentions the
        # city.
        terms = addr.split(',')
        if len(terms) > 1 and entry[cm.city_e] in terms[-1].strip().upper():
            # Read the country through the same key constant it was stored
            # under a few lines above.
            country = entry[cm.country_e]
            tmp = gs.look_up(country, 1)
            if tmp is not None:
                country = tmp['name_e']
            if country == 'JAPAN':
                # Japanese postal code
                m = re.search(ur'\d{3,}[ -\.]+?\d{3,}', terms[-1])
                if m is not None:
                    entry[cm.zip_code] = m.group(0)
            else:
                m = re.search(ur'\d{4,}', terms[-1])
                if m is not None:
                    entry[cm.zip_code] = m.group(0)

        entry[cm.tel] = s['tel']
        entry[cm.fax] = s['fax']
        entry[cm.email] = s['email']
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        db.insert_record(entry, 'stores')
        # Append exactly once; the entry used to be appended both before and
        # after the insert, so every store appeared twice in the result.
        store_list.append(entry)

    return store_list
Esempio n. 29
0
    def register_city(geocoded_info):
        """
        Choose one geocoding result for the current city and register its
        standardized form in ``city_std``.

        Reads ``country_e``, ``city_e``, ``sig``, ``city_std`` and
        ``log_name`` from the enclosing scope.

        :param geocoded_info: iterable of geocoding results, each with an
            'administrative_info' dict and a 'formatted_address'.
        :return: True when a result was registered, False when no result
            carried both country and city information.
        """
        candidate_geo = None
        # Raw geocoding result that candidate_geo was derived from.
        candidate_raw = None
        for geo_info in geocoded_info:
            admin_info = geo_info['administrative_info']
            if 'country' not in admin_info:
                common.dump(u'Country info does not exist: %s' % admin_info)
                continue

            if 'locality' in admin_info:
                city = admin_info['locality']
            elif 'sublocality' in admin_info:
                city = admin_info['sublocality']
            elif 'administrative_area_level_3' in admin_info:
                city = admin_info['administrative_area_level_3']
            elif 'administrative_area_level_2' in admin_info:
                city = admin_info['administrative_area_level_2']
            else:
                common.dump(u'City info does not exist: %s' % admin_info)
                continue

            tmp_geo = {'city_e': city, 'country_e': admin_info['country']}
            if 'administrative_area_level_1' in admin_info:
                tmp_geo['region_e'] = admin_info['administrative_area_level_1']
            else:
                tmp_geo['region_e'] = ''
            tmp_geo['formatted_address'] = geo_info['formatted_address']

            # The first usable result is the fallback in case no result
            # passes the consistency check below.
            if not candidate_geo:
                candidate_geo, candidate_raw = tmp_geo, geo_info

            # Consistency check: either the country or the city must match.
            ret1 = gs.look_up(country_e, 1)
            ret2 = gs.look_up(admin_info['country'], 1)
            if (ret1['name_e'] if ret1 else country_e) != (ret2['name_e'] if ret2 else admin_info['country']):
                common.dump(u'Countries does not match.', log_name)
                # NOTE(review): cities are looked up with level 1 here, while
                # other code in this file uses level 3 for cities - confirm.
                ret3 = gs.look_up(city_e, 1)
                ret4 = gs.look_up(city, 1)
                if (ret3['name_e'] if ret3 else city_e) != (ret4['name_e'] if ret4 else city):
                    common.dump(u'Cities does not match.', log_name)
                    continue

            # Reaching this point means geo_info passed the check: use it.
            candidate_geo, candidate_raw = tmp_geo, geo_info
            break

        # candidate_geo now holds the geo information to trust, if any.
        if not candidate_geo:
            return False

        # Register the standardized city information.
        std_info = candidate_geo

        # Fetch the Chinese names.
        std_info['country_c'] = ''
        std_info['region_c'] = ''
        std_info['city_c'] = ''
        geocoded_info_zh = gs.geocode(addr=candidate_geo['formatted_address'], lang='zh')
        if geocoded_info_zh:
            admin_info_zh = geocoded_info_zh[0]['administrative_info']
            if 'country' in admin_info_zh:
                std_info['country_c'] = admin_info_zh['country']
            if 'locality' in admin_info_zh:
                std_info['city_c'] = admin_info_zh['locality']
            elif 'sublocality' in admin_info_zh:
                std_info['city_c'] = admin_info_zh['sublocality']
            elif 'administrative_area_level_3' in admin_info_zh:
                std_info['city_c'] = admin_info_zh['administrative_area_level_3']
            elif 'administrative_area_level_2' in admin_info_zh:
                std_info['city_c'] = admin_info_zh['administrative_area_level_2']
            if 'administrative_area_level_1' in admin_info_zh:
                std_info['region_c'] = admin_info_zh['administrative_area_level_1']

        std_sig = u'|'.join((std_info['city_e'], std_info['region_e'], std_info['country_e']))
        city_std[sig] = {'std_sig': std_sig}
        # NOTE(review): this tests the literal key 'std_sig', not the value of
        # std_sig, so the standardized record is presumably rewritten on every
        # call. Left as is because the intended behaviour for
        # sig == std_sig is unclear - confirm before changing.
        if 'std_sig' not in city_std:
            # Store the raw result that belongs to the chosen candidate; the
            # loop variable may point at a later, rejected result when the
            # fallback candidate is used.
            city_std[std_sig] = {'std_info': std_info, 'geo_info': candidate_raw}
        common.dump(u'%s => %s' % (sig, std_sig))
        return True
Esempio n. 30
0
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return ()
    sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0]

    store_list = []
    for city_sub in re.findall(ur'<tr>(.+?)</tr>', sub, re.S):
        m = re.search(ur"<td[^<>]+class='shopLocation'\s*>([^<>]+)</td>",
                      city_sub)
        city_c = m.group(1).strip()
        city_e = ''
        if city_c == u'吉隆坡':
            city_e = 'KUALA LUMPUR'
        elif city_c == u'槟城':
            city_e = 'PENANG'
        else:
            ret = gs.look_up(city_c, 3)
            if ret:
                city_e = ret['name_e']
                city_c = ret['name_c']

        m = re.search(ur"<td class='storeName'>(.+?)</td>", city_sub, re.S)
        if not m:
            continue

        for name in (tmp.strip()
                     for tmp in cm.reformat_addr(m.group(1)).split(',')):
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                        data['brandname_c'])
            entry[cm.country_e] = data['country_code']
            entry[cm.city_e], entry[cm.city_c] = city_e, city_c
            entry[cm.name_e] = name
Esempio n. 31
0
         zip_code = m1
     break
 for m1 in re.findall(ur'<span itemprop="addressLocality">(.*?)</span>', m):
     if len(m1.strip()) > 0:
         city = cm.extract_city(m1)[0]
     break
 for m1 in re.findall(ur'<span itemprop="addressCountry">(.*?)</span>', m):
     if len(m1.strip()) > 0:
         country = m1
     break
 entry[cm.zip_code] = zip_code
 # 没有上述标签的情况
 if street_addr == '':
     tmp = cm.reformat_addr(m)
     terms = tmp.split(',')
     ret = gs.look_up(terms[-1], 1)
     if ret is not None:
     # t2 = cm.geo_translate(terms[-1])
     # if len(t2) != 0:
         # 这是一个国家
         # 把最后的国家项分离出来
         street_addr = ', '.join(terms[:-1])
         entry[cm.addr_e] = cm.reformat_addr(street_addr)
         entry[cm.country_c] = ret['name_c']
         entry[cm.country_e] = ret['name_e']
         entry[cm.continent_c] = ret['continent']['name_c']
         entry[cm.continent_e] = ret['continent']['name_e']
     else:
         if cm.is_chinese(tmp):
             entry[cm.addr_c] = tmp
         else:
Esempio n. 32
0
def _build_store_entry(s):
    """
    Convert one raw store record of the locator JSON into a store entry.

    Record fields whose value is None leave the corresponding entry field at
    whatever init_store_entry() set it to.
    """
    entry = common.init_store_entry(brand_id, brandname_e, brandname_c)

    if s['country'] is not None:
        country_c = s['country'].strip().upper()
        ret = gs.look_up(country_c, 1)
        if ret is not None:
            entry[common.country_e] = ret['name_e']
            entry[common.country_c] = ret['name_c']
        elif common.is_chinese(country_c):
            entry[common.country_c] = country_c
        else:
            entry[common.country_e] = country_c

    if s['address'] is not None:
        addr = common.reformat_addr(s['address'])
        if common.is_chinese(addr):
            entry[common.addr_c] = addr
        else:
            entry[common.addr_e] = addr

    city = s['city']
    if city is not None:
        city = city.strip().upper()
        ret = gs.look_up(city, 3)
        if ret is not None:
            entry[common.city_c] = ret['name_c']
            entry[common.city_e] = ret['name_e']
        elif common.is_chinese(city):
            entry[common.city_c] = city
        else:
            entry[common.city_e] = city

    entry[common.city_e] = common.extract_city(entry[common.city_e])[0]

    if s['email'] is not None:
        entry[common.email] = s['email']
    if s['fax'] is not None:
        entry[common.fax] = s['fax']
    if s['latitude'] is not None:
        entry[common.lat] = string.atof(s['latitude'])
    if s['longitude'] is not None:
        entry[common.lng] = string.atof(s['longitude'])
    if s['phone'] is not None:
        entry[common.tel] = s['phone']
    if s['postal_code'] is not None:
        entry[common.zip_code] = s['postal_code']

    if s['title'] is not None:
        name = s['title']
        if common.is_chinese(name):
            entry[common.name_c] = name
        else:
            entry[common.name_e] = name

    if s['operating_hours'] is not None:
        entry[common.hours] = s['operating_hours']
    if s['url'] is not None:
        entry[common.url] = host + s['url']

    gs.field_sense(entry)
    return entry


def fetch(level=1, data=None, user='******', passwd=''):
    """
    Download every store of the brand through the map service and store it.

    All existing 'stores' rows of ``brand_id`` are deleted first. The service
    is queried once through ``url_init`` and then through ``url_more`` for as
    long as a response carries a truthy 'more' marker.

    :param level: not used by this implementation.
    :param data: ignored - it is replaced by the fixed query defined below.
    :param user: database user.
    :param passwd: database password.
    :return: list of store entries; [] if the initial request fails.
    """
    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    # try/finally closes the connection on the early-return path and on
    # unexpected errors too; it used to be closed on full success only.
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

        data = {'s': -89, 'w': -179, 'n': 89, 'e': 179, 'chinese': 0, 'repair': 1, 'store': 1}
        try:
            html = common.get_data(url_init, data)
        except Exception:
            print 'Error occured in getting the list of countries: %s' % url_init
            dump_data = {'level': 1, 'time': common.format_time(), 'data': {'data': url_init}, 'brand_id': brand_id}
            common.dump(dump_data)
            return []

        store_list = []

        store_map = json.loads(html)
        tot = 0
        while True:
            # 'lists' arrives either as a {uid: record} dict or as a list of
            # records; normalise it to a dict.
            tmp = store_map['lists']
            # Set from a 'more' marker record when the response carries one.
            flag = False
            if isinstance(tmp, dict):
                # Marker records stay in the dict and are picked up by the
                # loop below.
                raw_stores = tmp
            else:
                raw_stores = {}
                for item in tmp:
                    if 'more' in item:
                        flag = item['more']
                    else:
                        raw_stores[item['nid']] = item

            # Process the raw stores.
            for k in raw_stores:
                s = raw_stores[k]
                if 'more' in s:
                    flag = s['more']
                    continue

                entry = _build_store_entry(s)
                print '%s: Found store: %s, %s (%s, %s)' % (
                    brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.country_e],
                    entry[common.continent_e])
                db.insert_record(entry, 'stores')
                store_list.append(entry)

            if not flag:
                tot += len(store_map['lists'])
                break

            # One element of 'lists' is the 'more' marker, not a store.
            tot += len(store_map['lists']) - 1
            data['offset'] = tot
            store_map = json.loads(common.get_data(url_more, data))

        print 'Found a total of %d stores.' % tot
        return store_list
    finally:
        db.disconnect_db()
Esempio n. 33
0
    try:
        body = cm.get_data(url, param)
    except Exception, e:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []

    results = []
    for c in json.loads(body)['geoEntityLocaleList']:
        d = data.copy()
        d['country_id'] = string.atoi(c['geoEntity']['id'])
        d['country'] = cm.html2plain(c['geoEntity']['name']).strip()
        results.append(d)

    for item in results:
        if gs.look_up(item['country'].upper(), 1) is None:
            print 'Cannot look up %s' % item['country']
    return results


def fetch_states(data):
    """
    Request the state list of one country from the geo service.

    :param data: dict with at least 'host', 'geo_url' and 'country_id'.
    :return: [] when the request fails.

    NOTE(review): the body stops right after ``results`` is initialised - the
    response held in ``body`` is never parsed and nothing is returned on
    success. The remainder of this function appears to be missing and must
    be restored before it can be used.
    """
    url = data['host'] + data['geo_url']
    param = {'lang': 'EN_US', 'geo_id': data['country_id']}

    try:
        body = cm.get_data(url, param)
    except Exception:
        # The exception object itself is not needed, so it is no longer bound.
        cm.dump('Error in fetching states: %s, %s' % (url, param), log_name)
        return []

    results = []
Esempio n. 34
0
def fetch(level=1, data=None, user='******', passwd=''):
    """
    Download the brand's contact page and store every shop listed on it.

    All existing 'stores' rows of ``brand_id`` are deleted before the page is
    parsed; each shop is inserted as soon as it has been built.

    :param level: not used by this implementation.
    :param data: optional dict whose 'url' is the page to fetch; when None
        the module-level ``url`` is used.
    :param user: database user.
    :param passwd: database password.
    :return: list of store entries; [] if the page cannot be fetched.
    """
    try:
        if data is None:
            data = {'url': url}
        html = common.get_data(data['url'])
    except Exception:
        print 'Error occured in getting data: %s' % url
        dump_data = {'level': 1, 'time': common.format_time(), 'data': {'data': url}, 'brand_id': brand_id}
        common.dump(dump_data)
        return []

    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

    # Drop HTML comments before looking for the store blocks.
    sub_pat = re.compile(ur'<!--.*?-->', re.S)
    html = re.sub(sub_pat, '', html)
    split_pos = [m.start() for m in re.finditer(ur'<p><span class="contactboldtitle">', html)]
    # A store block runs from its title to the next title; the last one runs
    # to the end of the page. (The former -1 end marker cut off the page's
    # final character.)
    block_ends = split_pos[1:] + [len(html)]
    sub_list = [html[begin:end] for begin, end in zip(split_pos, block_ends)]

    store_list = []
    for sub_html in sub_list:
        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        m = re.findall(ur'<span class="contactboldtitle">(.+?)</span>', sub_html)
        if len(m) > 0:
            entry[common.name_l] = m[0]
        m = re.findall(ur'<span class="storethinlines">(.+?)(?:</span>|</p>)', sub_html, re.S)
        if len(m) >= 2:
            addr = common.reformat_addr(m[0])
            entry[common.addr_l] = addr
            # City, country and zip code
            addr_splits = addr.split(', ')

            ret = gs.look_up(addr_splits[-1], 1)
            if ret is None:
                print 'Error in geo translating: %s' % addr_splits[-1]
            else:
                entry[common.country_e] = ret['name_e']
                # The term before the country is searched for city and zip
                # code. An address made of the country alone has no such
                # term (this used to raise IndexError).
                if len(addr_splits) >= 2:
                    m1 = re.findall(ur'(.+?)(\d{3}-\d{4})', addr_splits[-2])
                    if len(m1) > 0:
                        common.update_entry(entry, {common.city_e: common.extract_city(m1[0][0])[0],
                                                    common.zip_code: m1[0][1]})

            # Contact details
            tmp = m[1]
            m1 = re.findall(ur'[\d\-]{5,}', tmp)
            if len(m1) > 0:
                entry[common.tel] = m1[0]
            m1 = re.findall(ur'href="mailto:(.+?@.+?)"', tmp)
            if len(m1) > 0:
                entry[common.email] = m1[0].strip()

        gs.field_sense(entry)
        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, entry[common.name_l], entry[common.addr_l], entry[common.country_e],
            entry[common.continent_e])
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    db.disconnect_db()
    # Previously the function ended here and returned None on success.
    return store_list
Esempio n. 35
0
def fetch(level=1, data=None, user='******', passwd=''):
    """
    Download the brand's contact page and store every shop listed on it.

    All existing 'stores' rows of ``brand_id`` are deleted before the page is
    parsed; each shop is inserted as soon as it has been built.

    :param level: not used by this implementation.
    :param data: optional dict whose 'url' is the page to fetch; when None
        the module-level ``url`` is used.
    :param user: database user.
    :param passwd: database password.
    :return: list of store entries; [] if the page cannot be fetched.
    """
    try:
        if data is None:
            data = {'url': url}
        html = common.get_data(data['url'])
    except Exception:
        print 'Error occured in getting data: %s' % url
        dump_data = {
            'level': 1,
            'time': common.format_time(),
            'data': {
                'data': url
            },
            'brand_id': brand_id
        }
        common.dump(dump_data)
        return []

    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

    # Drop HTML comments before looking for the store blocks.
    sub_pat = re.compile(ur'<!--.*?-->', re.S)
    html = re.sub(sub_pat, '', html)
    split_pos = [
        m.start()
        for m in re.finditer(ur'<p><span class="contactboldtitle">', html)
    ]
    # A store block runs from its title to the next title; the last one runs
    # to the end of the page. (The former -1 end marker cut off the page's
    # final character.)
    block_ends = split_pos[1:] + [len(html)]
    sub_list = [
        html[begin:end] for begin, end in zip(split_pos, block_ends)
    ]

    store_list = []
    for sub_html in sub_list:
        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        m = re.findall(ur'<span class="contactboldtitle">(.+?)</span>',
                       sub_html)
        if len(m) > 0:
            entry[common.name_l] = m[0]
        m = re.findall(ur'<span class="storethinlines">(.+?)(?:</span>|</p>)',
                       sub_html, re.S)
        if len(m) >= 2:
            addr = common.reformat_addr(m[0])
            entry[common.addr_l] = addr
            # City, country and zip code
            addr_splits = addr.split(', ')

            ret = gs.look_up(addr_splits[-1], 1)
            if ret is None:
                print 'Error in geo translating: %s' % addr_splits[-1]
            else:
                entry[common.country_e] = ret['name_e']
                # The term before the country is searched for city and zip
                # code. An address made of the country alone has no such
                # term (this used to raise IndexError).
                if len(addr_splits) >= 2:
                    m1 = re.findall(ur'(.+?)(\d{3}-\d{4})', addr_splits[-2])
                    if len(m1) > 0:
                        common.update_entry(
                            entry, {
                                common.city_e:
                                common.extract_city(m1[0][0])[0],
                                common.zip_code: m1[0][1]
                            })

            # Contact details
            tmp = m[1]
            m1 = re.findall(ur'[\d\-]{5,}', tmp)
            if len(m1) > 0:
                entry[common.tel] = m1[0]
            m1 = re.findall(ur'href="mailto:(.+?@.+?)"', tmp)
            if len(m1) > 0:
                entry[common.email] = m1[0].strip()

        gs.field_sense(entry)
        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, entry[common.name_l], entry[common.addr_l],
            entry[common.country_e], entry[common.continent_e])
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    db.disconnect_db()
    # Previously the function ended here and returned None on success.
    return store_list
Esempio n. 36
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<item id="\d+">', body):
        sub = cm.extract_closure(body[m.start():], ur'<item\b', ur'</item>')[0]
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        m1 = re.search(ur'<country>([^<>]+)</country>', sub)
        if m1 is not None:
            tmp = m1.group(1).split('/')
            for v in tmp:
                ret = gs.look_up(v.strip().upper(), 1)
                if ret is not None:
                    entry[cm.country_e] = ret['name_e']
                    break
        m1 = re.search(ur'<city>([^<>]+)</city>', sub)
        if m1 is not None:
            val = cm.reformat_addr(m1.group(1))
            if entry[cm.country_e] == 'UNITED STATES':
                tmp_list = tuple(tmp.strip()
                                 for tmp in cm.reformat_addr(val).strip(','))
                if len(tmp_list) == 2:
                    if re.search('[A-Z]{2}', tmp_list[1]):
                        entry[cm.province_e] = tmp_list[1]
            entry[cm.city_e] = cm.extract_city(m1.group(1))[0]
        m1 = re.search(ur'<brands>([^<>]+)</brands>', sub)
        if m1 is not None:
            tmp = m1.group(1).split('/')
            brand_list = []
            for v in tmp:
                if v.strip() != '':
                    brand_list.append(v)
            entry[cm.store_type] = ', '.join(brand_map[key]
                                             for key in brand_list)
        m1 = re.search(ur'<name>([^<>]+)</name>', sub)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1).strip()
        m1 = re.search(ur'<address>([^<>]+)</address>', sub)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'<tel>([^<>]+)</tel>', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).strip()
        m1 = re.search(ur'sll=(-?\d+\.\d+),(-?\d+\.\d+)', sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None:
            entry[cm.province_e] = ret[1]
            gs.field_sense(entry)
        cm.dump(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
            log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 37
0
def _json_store_to_entry(s):
    """Convert one raw store record of the locator JSON into a store entry.

    :param s: dict describing a single store. The keys ``country``,
        ``address``, ``city``, ``email``, ``fax``, ``latitude``,
        ``longitude``, ``phone``, ``postal_code``, ``title``,
        ``operating_hours`` and ``url`` are read; ``None`` values are skipped.
    :return: the entry created by ``common.init_store_entry``, filled in and
        passed through ``gs.field_sense``.
    """
    entry = common.init_store_entry(brand_id, brandname_e, brandname_c)

    if s['country'] is not None:
        country_c = s['country'].strip().upper()
        ret = gs.look_up(country_c, 1)
        if ret is not None:
            entry[common.country_e] = ret['name_e']
            entry[common.country_c] = ret['name_c']
        elif common.is_chinese(country_c):
            entry[common.country_c] = country_c
        else:
            entry[common.country_e] = country_c

    if s['address'] is not None:
        addr = common.reformat_addr(s['address'])
        if common.is_chinese(addr):
            entry[common.addr_c] = addr
        else:
            entry[common.addr_e] = addr

    city = s['city']
    if city is not None:
        city = city.strip().upper()
        ret = gs.look_up(city, 3)
        if ret is not None:
            entry[common.city_c] = ret['name_c']
            entry[common.city_e] = ret['name_e']
        elif common.is_chinese(city):
            entry[common.city_c] = city
        else:
            entry[common.city_e] = city

    entry[common.city_e] = common.extract_city(entry[common.city_e])[0]

    if s['email'] is not None:
        entry[common.email] = s['email']
    if s['fax'] is not None:
        entry[common.fax] = s['fax']
    if s['latitude'] is not None:
        entry[common.lat] = string.atof(s['latitude'])
    if s['longitude'] is not None:
        entry[common.lng] = string.atof(s['longitude'])
    if s['phone'] is not None:
        entry[common.tel] = s['phone']
    if s['postal_code'] is not None:
        entry[common.zip_code] = s['postal_code']

    if s['title'] is not None:
        name = s['title']
        if common.is_chinese(name):
            entry[common.name_c] = name
        else:
            entry[common.name_e] = name

    if s['operating_hours'] is not None:
        entry[common.hours] = s['operating_hours']
    if s['url'] is not None:
        # The record carries a path; prefix it with the module-level host.
        entry[common.url] = host + s['url']

    gs.field_sense(entry)
    return entry


def fetch(level=1, data=None, user='******', passwd=''):
    """Fetch every store of the brand and store them in the ``stores`` table.

    Existing rows of ``brand_id`` are deleted first. The locator is queried
    with a near-global bounding box and followed page by page for as long as
    the response carries a truthy ``more`` marker.

    :param level: not used by this function.
    :param data: not used by this function (the query parameters are built
        locally).
    :param user: database user passed to ``connect_db``.
    :param passwd: database password passed to ``connect_db``.
    :return: list of the store entries that were inserted; ``[]`` when the
        first request fails.
    """
    db = common.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    # try/finally: the connection used to stay open on the early return below
    # and whenever a later request or the JSON parsing raised.
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', brand_id))

        params = {
            's': -89,
            'w': -179,
            'n': 89,
            'e': 179,
            'chinese': 0,
            'repair': 1,
            'store': 1
        }
        try:
            html = common.get_data(url_init, params)
        except Exception:
            print 'Error occured in getting the list of countries: %s' % url_init
            common.dump({
                'level': 1,
                'time': common.format_time(),
                'data': {
                    'data': url_init
                },
                'brand_id': brand_id
            })
            return []

        store_list = []
        store_map = json.loads(html)
        tot = 0
        while True:
            # 'lists' is either a {uid: record} dict or a list of records.
            lists = store_map['lists']
            has_more = False
            if isinstance(lists, dict):
                raw_stores = lists
            else:
                raw_stores = {}
                for item in lists:
                    if 'more' in item:
                        has_more = item['more']
                    else:
                        raw_stores[item['nid']] = item

            for key in raw_stores:
                s = raw_stores[key]
                if 'more' in s:
                    # Pagination marker, not a store.
                    has_more = s['more']
                    continue

                entry = _json_store_to_entry(s)
                print '%s: Found store: %s, %s (%s, %s)' % (
                    brandname_e, entry[common.name_e], entry[common.addr_e],
                    entry[common.country_e], entry[common.continent_e])
                db.insert_record(entry, 'stores')
                store_list.append(entry)

            if not has_more:
                tot += len(lists)
                break

            # One element of 'lists' is the 'more' marker itself.
            tot += len(lists) - 1
            params['offset'] = tot
            store_map = json.loads(common.get_data(url_more, params))

        print 'Found a total of %d stores.' % tot
        return store_list
    finally:
        db.disconnect_db()
Esempio n. 38
0
def fetch_stores(data):
    """Parse the per-province store tables of a single page.

    The page lists provinces as tabs (``#fragment-N`` links); each tab holds a
    table whose rows are either ``city, name, tel, address`` (four cells,
    starting a new city) or ``name, tel, address`` (continuing the current
    city). Every store row is inserted into the ``stores`` table through the
    module-level ``db``.

    :param data: dict with the keys ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: list of store entries; ``[]`` when the page cannot be fetched.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # (fragment id, province name) for every province tab.
    province_list = [(frag.strip(), name.strip().upper())
                     for frag, name in
                     re.findall(ur'<li><a href="#(fragment-\d+)"><span>(.+?)</span></a></li>', html)]

    comment_pat = re.compile(ur'<!--.*?-->', re.S)
    store_list = []

    for frag_id, province_name in province_list:
        start = html.find('<div id="%s">' % frag_id)
        if start == -1:
            continue
        p_sub, start, end = cm.extract_closure(html[start:], ur'<tbody>', ur'</tbody>')
        p_sub = comment_pat.sub('', p_sub)

        # The city applies to all following rows until a row names a new one.
        city_c = ''
        city_e = ''
        while True:
            s_sub, start, end = cm.extract_closure(p_sub, ur'<tr>', ur'</tr>')
            if end == 0:
                break
            p_sub = p_sub[end:]
            # Header row.
            if u'城市' in s_sub and u'店铺名称' in s_sub:
                continue

            term_list = re.findall(ur'<td.*?>(.+?)</td>', s_sub)
            if len(term_list) < 3:
                continue

            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

            if len(term_list) == 4:
                city_c = term_list[0].strip()
                # Forget the previous city's English name: if the look-up below
                # fails it would otherwise be attached to this new city.
                city_e = ''
                ret = gs.look_up(city_c, 3)
                if ret is not None:
                    city_e = ret['name_e']
                    city_c = ret['name_c']
                offset = 1
            else:
                offset = 0

            entry[cm.name_c] = cm.html2plain(term_list[offset + 0]).strip()
            entry[cm.tel] = cm.html2plain(term_list[offset + 1]).strip()
            entry[cm.addr_e] = cm.reformat_addr(term_list[offset + 2]).strip()
            entry[cm.country_e] = 'CHINA'
            entry[cm.continent_e] = 'ASIA'

            p_name_c = province_name
            p_name_e = ''
            ret = gs.look_up(p_name_c, 2)
            if ret is not None:
                p_name_c = ret['name_c']
                p_name_e = ret['name_e']
            cm.update_entry(entry, {cm.province_e: p_name_e, cm.province_c: p_name_c,
                                    cm.city_e: city_e, cm.city_c: city_c})
            entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

            gs.field_sense(entry)
            # Fill province/city from the address only where still unknown.
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)
            print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                              entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                              entry[cm.continent_e])
            store_list.append(entry)
            db.insert_record(entry, 'stores')

    return store_list
Esempio n. 39
0
            return []
    else:
        body = data["body"]

    store_list = []
    city = data["city"]
    if city == "":
        m = re.search(ur'<span id="m_sthead"\s*>(.+?)</span>', body)
        if m is not None:
            city = cm.reformat_addr(m.group(1))
    city = city.replace(u"市", u"").strip()
    for m in re.finditer(ur'<span id="m_stname"[^<>]*>(.+?)</span>', body):
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])
        entry[cm.country_e] = data["country"]
        entry[cm.province_c] = data["province"]
        ret = gs.look_up(data["province"], 2)
        if ret is not None:
            entry[cm.province_e] = ret["name_e"]
        entry[cm.city_c] = city
        ret = gs.look_up(city, 3)
        if ret is not None:
            entry[cm.city_e] = ret["name_e"]

        entry[cm.name_e] = cm.reformat_addr(m.group(1))

        m1 = re.search(ur'<span id="m_stlist"[^<>]*>(.+?)</span>', body[m.end() :])
        if m1 is not None:
            addr_list = cm.reformat_addr(m1.group(1)).split(",")
            tel = cm.extract_tel(addr_list[-1]).strip()
            if tel != "":
                del addr_list[-1]
Esempio n. 40
0
def fetch_stores(data):
    """Query the Nine West store locator and record every store it returns.

    The landing page is fetched first to obtain the session cookies and the
    per-session ``Stores-Find`` URL; the search form is then posted to that
    URL and the result page is parsed. Each store is inserted into the
    ``stores`` table through the module-level ``db``.

    :param data: dict with the keys ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :return: ``[]`` when a request fails or the search URL is not found.
    """
    url = data['url']
    try:
        html, cookie_map = cm.get_data_cookie(url)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    print 'SLEEPING>>>>'
    time.sleep(5)

    m = re.search('http://www.ninewest.com/on/demandware.store/Sites-ninewest-Site/default/Stores-Find/C\d{10}', html)
    if m is None:
        return []
    url = m.group(0)

    # Forward the session cookies except the personalization / token ones.
    cookie_map = dict((key, cookie_map[key]) for key in cookie_map
                      if 'dwpersonalization_' not in key and key != 'sr_token')
    cookie_map['invited_visitor_22225'] = '1'

    form = {
        'dwfrm_storelocator_startaddress': 'kingman',
        'dwfrm_storelocator_maxDistance': 30.00,
        'dwfrm_storelocator_outlet': 'true',
        'dwfrm_storelocator_retail': 'true',
        'dwfrm_storelocator_optical': 'true',
        'dwfrm_storelocator_eyewear': 'true',
        'dwfrm_storelocator_apparel': 'true',
        'dwfrm_storelocator_attire': 'true',
        'dwfrm_storelocator_department': 'true',
        'dwfrm_storelocator_IsMensFootwear': 'true',
        'dwfrm_storelocator_IsRRR': 'true',
        'dwfrm_storelocator_IsRRNY': 'true',
        'dwfrm_storelocator_IsRRS': 'true',
        'dwfrm_storelocator_wholesale': 'true',
        'dwfrm_storelocator_bba': 'true',
        'dwfrm_storelocator_ba': 'true',
        'dwfrm_storelocator_search.x': 0,
        'dwfrm_storelocator_search.y': 0,
        'dwfrm_storelocator_countryCode': 'US',
        'dwfrm_storelocator_postalCode': '67068',
        'dwfrm_storelocator_distanceUnit': 'mi',
        'dwfrm_storelocator_long': -98.117208,
        'dwfrm_storelocator_lat': 37.647131,
    }
    try:
        html = cm.post_data(url, form, cookie=cookie_map)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m1 in re.finditer(ur'<div class="storeColumnOne">', html):
        sub, start, end = cm.extract_closure(html[m1.start():], ur'<div\b', ur'</div>')
        if end == 0:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m2 = re.search(ur'<div class="storename">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.name_e] = m2.group(1).strip()

        addr_list = re.findall(ur'<div class="adddressline">([^<>]+)</div>', sub)
        entry[cm.addr_e] = ', '.join(addr_list)

        m2 = re.search(ur'<div class="citystatezip">([^<>]+)</div>', sub)
        if m2 is not None:
            tmp = cm.reformat_addr(m2.group(1))
            # Expected layout: city, state, zip.
            terms = re.split('[, ]+', tmp)
            if len(terms) < 3:
                entry[cm.addr_e] = tmp
            else:
                ret = gs.look_up(terms[0], 3)
                if ret is not None:
                    entry[cm.city_e] = ret['name_e']
                else:
                    entry[cm.city_e] = terms[0].strip().upper()

                ret = gs.look_up(terms[1], 2)
                if ret is not None:
                    entry[cm.province_e] = ret['name_e']
                else:
                    # Fall back to the raw state token (terms[1]); the city
                    # token (terms[0]) used to be stored here by mistake.
                    entry[cm.province_e] = terms[1].strip().upper()

                if re.match('\s*\d{5,}\s*', terms[2]) is not None:
                    entry[cm.zip_code] = terms[2].strip()

        m2 = re.search(ur'<div class="storephone">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.tel] = m2.group(1)

        cm.update_entry(entry, {'country_e': 'UNITED STATES', 'continent_e': 'NORTH AMERICA'})
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                          entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                          entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
Esempio n. 41
0
def fetch_stores(data):
    url = data['home_url']
    try:
        body = cm.post_data(url, {
            'lz_sf': data['province'],
            'lz_sx': data['city']
        })
    except Exception:
        cm.dump(
            'Error in fetching stores: %s, %s, %s' %
            (url, data['province'], data['city']), 'samsonite_log.txt')
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    start = body.find(u'搜索结果')
    if start == -1:
        cm.dump(
            'Error in fetching stores: %s, %s, %s' %
            (url, data['province'], data['city']), 'samsonite_log.txt')
        return []

    body = body[start + 4:]

    store_list = []
    for m in re.findall(ur'</script>\s*(\S+)\s*</span>', body, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.name_e] = m.strip()
        entry[cm.addr_e] = m.strip()
        entry[cm.city_c] = data['city']
        ret = gs.look_up(data['city'], 3)
        if ret is not None:
            entry[cm.city_e] = cm.extract_city(ret['name_e'])[0]
            if ret['province'] != '':
                entry[cm.province_e] = ret['province']['name_e']
        entry[cm.province_c] = data['province']
        ret = gs.look_up(data['province'], 2)
        if ret is not None:
            entry[cm.province_e] = ret['name_e']
        entry[cm.country_e] = u'CHINA'

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
            'benetton_log.txt', False)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 42
0
    try:
        body = cm.get_data(url, param)
    except Exception, e:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []

    results = []
    for c in json.loads(body)['geoEntityLocaleList']:
        d = data.copy()
        d['country_id'] = string.atoi(c['geoEntity']['id'])
        d['country'] = cm.html2plain(c['geoEntity']['name']).strip()
        results.append(d)

    for item in results:
        if gs.look_up(item['country'].upper(), 1) is None:
            print 'Cannot look up %s' % item['country']
    return results


def fetch_states(data):
    """Request the geo listing for the country identified by ``data``.

    :param data: dict with the keys ``host``, ``geo_url`` and ``country_id``.
    :return: ``[]`` when the request fails.
    """
    # Endpoint is the host plus the geo path; the country id is sent as geo_id.
    url = data['host'] + data['geo_url']
    param = {'lang': 'EN_US', 'geo_id': data['country_id']}

    try:
        body = cm.get_data(url, param)
    except Exception, e:
        # Best effort: log the failed request and report no results.
        cm.dump('Error in fetching states: %s, %s' % (url, param), log_name)
        return []

    results = []
Esempio n. 43
0
    param = {'country': data['country_tag'], 'city': data['city_tag'],
             'adutl': ' 01', 'kids': ' 02', 'undercolor': ' 06', 'togetmap': 'mapdata'}
    try:
        body = cm.get_data(url, param)
    except Exception, e:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return ()
    body = cm.extract_closure(body, ur'\(', ur'\)')[0][1:-1]
    sub = json.loads(body)['data']['xml_dt']
    store_list = []
    for m in re.findall(ur'<marker[^<>]+/\s*>', sub):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.city_e] = cm.extract_city(data['city'])[0]

        ret = gs.look_up(entry[cm.country_e], 1)
        if ret and ret['name_e'] == 'UNITED STATES':
            tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(data['city']).strip(','))
            if len(tmp_list) == 2:
                if re.search('[A-Z]{2}', tmp_list[1]):
                    entry[cm.province_e] = tmp_list[1]

        # m1 = re.search(ur'name\s*=\s*"([^"]+)"', m)
        # entry[cm.name_e] = m1.group(1) if m1 else ''

        m1 = re.search(ur'address\s*=\s*"([^"]+)"', m)
        if m1:
            addr = re.sub(ur'\.textmap\{.*\}', '', cm.reformat_addr(m1.group(1)))
            addr_list = [tmp.strip() for tmp in addr.split(',')]
            tel = cm.extract_tel(addr_list[-1])
            if tel != '':
Esempio n. 44
0
def fetch_stores(data):
    """Fetch the store page and record every store listed in its second <ul>.

    For each ``<li>`` the name, the Google Maps link and the address are
    extracted. The comma-separated address is then read from the end: the
    last term is tried as a country, the next one as a province (optionally
    carrying a postal code) and the next one as a city. Stores are inserted
    through the module-level ``db``.

    :param data: dict with the key ``url``.
    :return: ``[]`` when the request fails or the page has fewer than two
        ``<ul>`` elements.
    """
    url = data['url']
    try:
        html = common.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 1,
            'time': common.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        }
        common.dump(dump_data)
        return []

    # Locate the second <ul>...</ul>.
    start = 0
    for i in xrange(2):
        start = html.find('<ul>', start)
        if start == -1:
            return []
        start += len('<ul>')
    end = html.find('</ul>', start)
    if end == -1:
        # Unterminated list: keep everything instead of silently dropping the
        # last character (which is what a -1 slice bound would do).
        end = len(html)
    html = html[start:end]

    store_list = []
    for item in re.findall(ur'<li>(.+?)</li>', html, re.S):
        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        entry[common.store_type] = 'FASHION'
        m1 = re.findall(ur'<h2>(.+?)</h2>', item)
        if len(m1) > 0:
            entry[common.name_e] = common.reformat_addr(m1[0])

        # Google Maps URL.
        m1 = re.findall(ur'href="(https://maps.google.com/maps[^\s]+?)"', item)
        if len(m1) > 0:
            entry[common.url] = m1[0]

        addr = common.reformat_addr('\n\r'.join(
            re.findall(ur'<p>(.+?)</p>', item)))
        entry[common.addr_e] = addr
        terms = addr.split(',')

        # Stays False when none of the geo look-ups hit.
        hit_flag = False

        # Is the last term a country?
        ret = gs.look_up(terms[-1], 1)
        if ret is not None:
            entry[common.country_e] = ret['name_e']
            terms = terms[:-1]
            hit_flag = True

        # Province and city. 'terms' may be empty by now (an address that
        # consists of a country only), so guard every terms[-1] access.
        zip_cdt = ''
        if terms:
            # Take the whole digit run; the former greedy '.*(\d{5,})' kept
            # only the last five digits of longer postal codes.
            zip_match = re.search(ur'\d{5,}', terms[-1])
            if zip_match is not None:
                zip_cdt = zip_match.group(0)
            # Look the province up without the postal code attached; this
            # cleaned term used to be computed and then ignored.
            province_term = re.sub(ur'\d{5,}', '', terms[-1]).strip().upper()
            ret = gs.look_up(province_term, 2)
            if ret is not None:
                entry[common.province_e] = ret['name_e']
                entry[common.zip_code] = zip_cdt
                terms = terms[:-1]
                hit_flag = True

        if terms:
            ret = gs.look_up(terms[-1], 3)
            if ret is not None:
                entry[common.city_e] = ret['name_e']
                entry[common.zip_code] = zip_cdt
                hit_flag = True

        if not hit_flag:
            # Nothing matched: log the address for manual inspection.
            common.write_log('Failed in geosensing: %s' % addr)

        gs.field_sense(entry)

        print '%s Found store: %s, %s (%s, %s)' % (
            brandname_e, entry[common.name_e], entry[common.addr_e],
            entry[common.country_e], entry[common.continent_e])
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 45
0
            cm.dump('Error in parsing %s' % m.group(1), log_name)
            continue
        sub1 = cm.extract_closure(sub[start:], ur'<div\b', ur'</div>')[0]

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.store_type] = store_type

        m1 = re.search(ur'<p class="store-item-name">(.+?)</p>', sub1, re.S)
        if m1 is not None:
            entry[cm.name_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'<p class="store-item-adress">(.+?)</p>', sub1, re.S)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

        entry[cm.tel] = cm.extract_tel(sub1)
        ret = gs.look_up(data['country_code'], 1)
        if ret is not None:
            entry[cm.country_e] = ret['name_e']
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None:
            entry[cm.province_e] = ret[1]
        if ret[2] is not None:
            entry[cm.city_e] = ret[2]
        else:
            entry[cm.city_e] = data['city'].strip().upper()

        if entry[cm.name_e] in latlng_map:
            tmp = latlng_map[entry[cm.name_e]]
            entry[cm.lat] = tmp['lat']
            entry[cm.lng] = tmp['lng']
Esempio n. 46
0
    def register_city(geocoded_info):
        """Pick a geocoding result for the current city and register it.

        Relies on names from the enclosing scope that are not visible here:
        ``country_e``, ``city_e``, ``sig``, ``city_std`` and ``log_name``.

        :param geocoded_info: iterable of geocoding results; each item is read
            through its ``administrative_info`` and ``formatted_address`` keys.
        :return: ``False`` when no result is usable, ``True`` after the
            mapping has been written to ``city_std``.
        """
        candidate_geo = None
        for geo_info in geocoded_info:
            admin_info = geo_info['administrative_info']
            if 'country' not in admin_info:
                common.dump(u'Country info does not exist: %s' % admin_info)
                continue

            # Take the most specific administrative level available as city.
            if 'locality' in admin_info:
                city = admin_info['locality']
            elif 'sublocality' in admin_info:
                city = admin_info['sublocality']
            elif 'administrative_area_level_3' in admin_info:
                city = admin_info['administrative_area_level_3']
            elif 'administrative_area_level_2' in admin_info:
                city = admin_info['administrative_area_level_2']
            else:
                common.dump(u'City info does not exist: %s' % admin_info)
                continue

            tmp_geo = {'city_e': city, 'country_e': admin_info['country']}
            if 'administrative_area_level_1' in admin_info:
                tmp_geo['region_e'] = admin_info['administrative_area_level_1']
            else:
                tmp_geo['region_e'] = ''
            tmp_geo['formatted_address'] = geo_info['formatted_address']

            # The first well-formed result is kept as a fallback even if it
            # fails the consistency check below.
            if not candidate_geo:
                candidate_geo = tmp_geo
                # Consistency check: country or city info must match.
            ret1 = gs.look_up(country_e, 1)
            ret2 = gs.look_up(admin_info['country'], 1)
            if (ret1['name_e'] if ret1 else country_e) != (
                    ret2['name_e'] if ret2 else admin_info['country']):
                common.dump(u'Countries does not match.', log_name)
                # NOTE(review): the cities are looked up with level 1, the
                # same level used for countries above -- confirm intended.
                ret3 = gs.look_up(city_e, 1)
                ret4 = gs.look_up(city, 1)
                if (ret3['name_e'] if ret3 else city_e) != (ret4['name_e']
                                                            if ret4 else city):
                    common.dump(u'Cities does not match.', log_name)
                    # Rejected only when both country and city differ.
                    continue

            # Reaching this point means geo_info passed the checks above and
            # can be used.
            candidate_geo = tmp_geo
            break

        # candidate_geo is the geographic info to register.
        if not candidate_geo:
            return False

        # Register the standardized city info.
        std_info = candidate_geo

        # Fetch the Chinese-language info.
        std_info['country_c'] = ''
        std_info['region_c'] = ''
        std_info['city_c'] = ''
        geocoded_info_zh = gs.geocode(addr=candidate_geo['formatted_address'],
                                      lang='zh')
        if geocoded_info_zh:
            admin_info_zh = geocoded_info_zh[0]['administrative_info']
            if 'country' in admin_info_zh:
                std_info['country_c'] = admin_info_zh['country']
            if 'locality' in admin_info_zh:
                std_info['city_c'] = admin_info_zh['locality']
            elif 'sublocality' in admin_info_zh:
                std_info['city_c'] = admin_info_zh['sublocality']
            elif 'administrative_area_level_3' in admin_info_zh:
                std_info['city_c'] = admin_info_zh[
                    'administrative_area_level_3']
            elif 'administrative_area_level_2' in admin_info_zh:
                std_info['city_c'] = admin_info_zh[
                    'administrative_area_level_2']
            if 'administrative_area_level_1' in admin_info_zh:
                std_info['region_c'] = admin_info_zh[
                    'administrative_area_level_1']

        # Standard signature: city|region|country.
        std_sig = u'|'.join(
            (std_info['city_e'], std_info['region_e'], std_info['country_e']))
        city_std[sig] = {'std_sig': std_sig}
        # NOTE(review): this tests the literal key 'std_sig', not the value of
        # std_sig -- presumably `std_sig not in city_std` was meant; confirm.
        # NOTE(review): geo_info is whatever the loop above left behind; when
        # the fallback candidate is used it is the last item iterated, not
        # necessarily the one candidate_geo was built from.
        if 'std_sig' not in city_std:
            city_std[std_sig] = {'std_info': std_info, 'geo_info': geo_info}
        common.dump(u'%s => %s' % (sig, std_sig))
        return True
Esempio n. 47
0
def get_frag_stores(data):
    """Parse one result page of the fragrance store locator.

    Every ``<li>`` of the content section becomes a store entry of type
    ``FRAGRANCE`` that is inserted through the module-level ``db``.

    :param data: dict with the keys ``url``, ``country`` and ``page``.
    :return: ``([], False)`` when the request fails or the content section
        cannot be located.
    """
    query = {'country': data['country'], 'city_postal': '', 'page': data['page']}
    try:
        html = common.get_data(data['url'], query)
    except Exception:
        print 'Error occured: %s' % url_fragrance
        common.dump({'level': 1, 'time': common.format_time(), 'data': {'url': url_fragrance},
                     'brand_id': brand_id})
        return [], False

    print 'PARSING PAGE: %d' % data['page']
    section_pos = html.find('<section id="content" class="content">')
    if section_pos == -1:
        return [], False
    html, start, end = common.extract_closure(html[section_pos:], ur'<section\b', ur'</section>')
    if end == 0:
        return [], False

    # Total number of pages: the last numbered link of the pagination block.
    tot_page = 0
    pager_pos = html.find('<div class="pagination">')
    if pager_pos != -1:
        pagination, start, end = common.extract_closure(html[pager_pos:], ur'<div\b', ur'</div>')
        page_numbers = re.findall(ur'<a href=".*?" class="page">(\d+)</a>', pagination)
        if page_numbers:
            tot_page = string.atoi(page_numbers[-1])

    # Collect the stores.
    store_list = []
    for item in re.findall(ur'<li>(.*?)</li>', html, re.S):
        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        entry[common.store_type] = 'FRAGRANCE'

        titles = re.findall(ur'<h2>(.+?)</h2>', item)
        if titles:
            entry[common.name_e] = common.html2plain(titles[0].strip())

        links = re.findall(ur'href="(.+?)"', item)
        if links:
            entry[common.url] = links[0]

        paragraphs = re.findall(ur'<p>(.+?)</p>', item)
        addr = common.reformat_addr(','.join(paragraphs))
        entry[common.addr_e] = addr
        terms = addr.split(', ')

        # The last term is tried as the country.
        ret = gs.look_up(terms[-1], 1)
        if ret is not None:
            entry[common.country_e] = ret['name_e']

        # The term before it is read as "<postal code> <city>".
        zip_city = None
        if len(terms) >= 2:
            zip_city = re.match(ur'.*?(\d+)\s+(.*)', terms[-2])
        if zip_city is not None:
            city_name = zip_city.group(2).strip().upper()
            ret = gs.look_up(city_name, 3)
            if ret is not None:
                entry[common.city_e] = ret['name_e']
            elif len(re.findall('(\S+)', city_name)) == 1 and \
                    not re.findall('(\d+)', city_name):
                # Unknown city: accept it only if it is one word without digits.
                entry[common.city_e] = city_name
                entry[common.zip_code] = zip_city.group(1).strip()

        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            brandname_e, brand_id, entry[common.name_e], entry[common.addr_e], entry[common.country_e],
            entry[common.continent_e])
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 48
0
def fetch(level=1, data=None, user='******', passwd=''):
    """Download the Michael Kors store list and store it in the database.

    Existing rows of the brand are deleted first; the JSON array returned by
    ``data['url']`` is then converted record by record into store entries.

    :param level: not used by this function.
    :param data: dict with the keys ``url``, ``brand_id``, ``brandname_e`` and
        ``brandname_c``; defaults to the Michael Kors settings when ``None``.
    :param user: database user passed to ``connect_db``.
    :param passwd: database password passed to ``connect_db``.
    :return: list of the inserted store entries; ``[]`` when the download or
        the JSON parsing fails.
    """
    global db

    # Walk from the root node, where level == 1.
    if data is None:
        data = {
            'url': 'http://cms.destinationkors.com/store/get',
            'brand_id': 10259,
            'brandname_e': u'Michael Kors',
            'brandname_c': u'迈克.柯尔'
        }

    # Indexed by the record's 1-based 'store_type'.
    type_desc = ['Collection Boutique', 'Lifestyle', 'Outlet']

    db = cm.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    # try/finally: the connection used to stay open on both early returns
    # below and whenever processing a record raised.
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' %
                   ('stores', data['brand_id']))

        url = data['url']
        try:
            html = cm.get_data(url).decode('unicode_escape')
            # The JSON array starts at the first '['.
            start = html.find('[')
            if start == -1:
                return []
            js = json.loads(html[start:])
        except Exception:
            print 'Error occured: %s' % url
            dump_data = {
                'level': 0,
                'time': cm.format_time(),
                'data': {
                    'url': url
                },
                'brand_id': data['brand_id']
            }
            cm.dump(dump_data)
            return []

        store_list = []
        for s in js:
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                        data['brandname_c'])
            entry[cm.store_type] = type_desc[string.atoi(s['store_type']) - 1]

            name = s['name'].strip()
            name2 = s['name2'].strip()
            if name2 != '':
                name += ', ' + name2
            entry[cm.name_e] = name

            # address1..address3, skipping the empty ones.
            addr = [part for part in
                    (s['address%d' % i].strip() for i in xrange(1, 4))
                    if part != '']
            entry[cm.addr_e] = ', '.join(addr)
            entry[cm.city_e] = cm.extract_city(s['city'])[0]

            country = s['country']
            ret = gs.look_up(country, 1)
            if ret is not None:
                country = ret['name_e']
            entry[cm.country_e] = country

            state = s['state'].strip().upper()
            if country == 'UNITED STATES' and state != '':
                # US states are stored under their looked-up name only.
                ret = gs.look_up(state, 2)
                if ret is not None:
                    entry[cm.province_e] = ret['name_e']
            else:
                entry[cm.province_e] = state

            entry[cm.zip_code] = s['zip']
            entry[cm.tel] = s['phone']
            entry[cm.hours] = s['hours']
            entry[cm.lat] = string.atof(s['latitude'])
            entry[cm.lng] = string.atof(s['longitude'])
            gs.field_sense(entry)

            print '(%s / %d) Found store: %s, %s (%s, %s)' % (
                data['brandname_e'], data['brand_id'], entry[cm.name_e],
                entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
            store_list.append(entry)
            db.insert_record(entry, 'stores')

        return store_list
    finally:
        db.disconnect_db()
Esempio n. 49
0
 for m1 in re.findall(
         ur'<span itemprop="addressLocality">(.*?)</span>', m):
     if len(m1.strip()) > 0:
         city = cm.extract_city(m1)[0]
     break
 for m1 in re.findall(
         ur'<span itemprop="addressCountry">(.*?)</span>', m):
     if len(m1.strip()) > 0:
         country = m1
     break
 entry[cm.zip_code] = zip_code
 # 没有上述标签的情况
 if street_addr == '':
     tmp = cm.reformat_addr(m)
     terms = tmp.split(',')
     ret = gs.look_up(terms[-1], 1)
     if ret is not None:
         # t2 = cm.geo_translate(terms[-1])
         # if len(t2) != 0:
         # 这是一个国家
         # 把最后的国家项分离出来
         street_addr = ', '.join(terms[:-1])
         entry[cm.addr_e] = cm.reformat_addr(street_addr)
         entry[cm.country_c] = ret['name_c']
         entry[cm.country_e] = ret['name_e']
         entry[cm.continent_c] = ret['continent']['name_c']
         entry[cm.continent_e] = ret['continent']['name_e']
     else:
         if cm.is_chinese(tmp):
             entry[cm.addr_c] = tmp
         else:
Esempio n. 50
0
def get_frag_stores(data):
    """Fetch one page of the fragrance store locator and record its stores.

    Reads ``data['url']``, ``data['country']`` and ``data['page']``. Every
    ``<li>`` found inside the page's content section becomes a store entry
    with store type 'FRAGRANCE'; each entry is inserted into the 'stores'
    table and appended to ``store_list``.

    Every early exit (request failure, missing or unclosed content section)
    returns the tuple ``([], False)``.

    NOTE(review): ``url_fragrance``, ``brand_id``, ``brandname_e``,
    ``brandname_c``, ``gs`` and ``db`` are not defined in this block;
    presumably module-level names -- confirm.
    NOTE(review): the block as visible here ends after the store loop with no
    final ``return``, and ``tot_page`` is computed but never read; the tail of
    the function looks cut off -- confirm against the full source.
    """
    try:
        html = common.get_data(data['url'], {
            'country': data['country'],
            'city_postal': '',
            'page': data['page']
        })
    except Exception:
        # The request failed: report it, dump a record of the failure and
        # hand back an empty result.
        print 'Error occured: %s' % url_fragrance
        dump_data = {
            'level': 1,
            'time': common.format_time(),
            'data': {
                'url': url_fragrance
            },
            'brand_id': brand_id
        }
        common.dump(dump_data)
        return [], False

    print 'PARSING PAGE: %d' % data['page']
    # Narrow the document down to the content <section>.
    start = html.find('<section id="content" class="content">')
    if start == -1:
        return [], False
    html, start, end = common.extract_closure(html[start:], ur'<section\b',
                                              ur'</section>')
    if end == 0:
        # extract_closure reported end == 0; treated as "section not found".
        return [], False

    # Find the total number of pages: the last numbered link in the
    # pagination block, or 0 when there is no pagination block.
    tot_page = 0
    start = html.find('<div class="pagination">')
    if start != -1:
        pagination, start, end = common.extract_closure(
            html[start:], ur'<div\b', ur'</div>')
        m = re.findall(ur'<a href=".*?" class="page">(\d+)</a>', pagination)
        if len(m) > 0:
            tot_page = string.atoi(m[-1])

    # Start looking for stores: one <li> element per store.
    store_list = []
    for m in re.findall(ur'<li>(.*?)</li>', html, re.S):
        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        entry[common.store_type] = 'FRAGRANCE'
        # Store name: the first <h2> inside the item.
        m1 = re.findall(ur'<h2>(.+?)</h2>', m)
        if len(m1) > 0:
            entry[common.name_e] = common.html2plain(m1[0].strip())

        # Store URL: the first href inside the item.
        m1 = re.findall(ur'href="(.+?)"', m)
        if len(m1) > 0:
            entry[common.url] = m1[0]

        # Address: all <p> contents joined with commas, then reformatted.
        addr = common.reformat_addr(','.join(re.findall(ur'<p>(.+?)</p>', m)))
        entry[common.addr_e] = addr
        terms = addr.split(', ')
        # The last address term is tried as a country name.
        ret = gs.look_up(terms[-1], 1)
        if ret is not None:
            entry[common.country_e] = ret['name_e']

        if len(terms) >= 2:
            # The second-to-last term is matched as "<digits> <text>":
            # group(1) is used as the postal code, group(2) as the city.
            m1 = re.match(ur'.*?(\d+)\s+(.*)', terms[-2])
            if m1 is not None:
                ret = gs.look_up(m1.group(2).strip().upper(), 3)
                if ret is not None:
                    entry[common.city_e] = ret['name_e']
                else:
                    # Unknown city: accept the text only when it is a single
                    # whitespace-free token that contains no digits.
                    # NOTE(review): zip_code is set only on this fallback
                    # path, not when the city lookup succeeds -- confirm
                    # that this is intended.
                    if len(re.findall('(\S+)', m1.group(2).strip().upper()))==1 and \
                                    len(re.findall('(\d+)', m1.group(2).strip().upper()))==0:
                        entry[common.city_e] = m1.group(2).strip().upper()
                        entry[common.zip_code] = m1.group(1).strip()

        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            brandname_e, brand_id, entry[common.name_e], entry[common.addr_e],
            entry[common.country_e], entry[common.continent_e])
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Esempio n. 51
0
def fetch_stores(data):
    """Scrape the Nine West store locator and record every store it returns.

    The landing page at ``data['url']`` is fetched together with its cookies,
    the store-search endpoint is read from that page, and a fixed search
    (start address 'kingman', postal code 67068, 30 'mi' radius, country US)
    is posted to it. Each ``<div class="storeColumnOne">`` block in the
    response becomes one store entry, which is inserted into the 'stores'
    table.

    Args:
        data: dict providing 'url', 'brand_id', 'brandname_e' and
            'brandname_c'.

    Returns:
        The list of store entries found. An empty list is returned when
        either request fails or the search endpoint cannot be located in the
        landing page.
    """
    url = data['url']
    try:
        html, cookie_map = cm.get_data_cookie(url)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    print 'SLEEPING>>>>'
    time.sleep(5)

    # The search endpoint (ending in 'C' plus ten digits) is read from the
    # landing page itself.
    m = re.search(
        'http://www.ninewest.com/on/demandware.store/Sites-ninewest-Site/default/Stores-Find/C\d{10}',
        html)
    if m is None:
        return []
    url = m.group(0)

    # Forward the received cookies, minus the personalization / sr_token
    # ones, plus the 'invited_visitor_22225' flag.
    cookie_map_new = {}
    for key in cookie_map:
        if 'dwpersonalization_' in key or key == 'sr_token':
            continue
        cookie_map_new[key] = cookie_map[key]
    cookie_map_new['invited_visitor_22225'] = '1'
    cookie_map = cookie_map_new

    form = {
        'dwfrm_storelocator_startaddress': 'kingman',
        'dwfrm_storelocator_maxDistance': 30.00,
        'dwfrm_storelocator_outlet': 'true',
        'dwfrm_storelocator_retail': 'true',
        'dwfrm_storelocator_optical': 'true',
        'dwfrm_storelocator_eyewear': 'true',
        'dwfrm_storelocator_apparel': 'true',
        'dwfrm_storelocator_attire': 'true',
        'dwfrm_storelocator_department': 'true',
        'dwfrm_storelocator_IsMensFootwear': 'true',
        'dwfrm_storelocator_IsRRR': 'true',
        'dwfrm_storelocator_IsRRNY': 'true',
        'dwfrm_storelocator_IsRRS': 'true',
        'dwfrm_storelocator_wholesale': 'true',
        'dwfrm_storelocator_bba': 'true',
        'dwfrm_storelocator_ba': 'true',
        'dwfrm_storelocator_search.x': 0,
        'dwfrm_storelocator_search.y': 0,
        'dwfrm_storelocator_countryCode': 'US',
        'dwfrm_storelocator_postalCode': '67068',
        'dwfrm_storelocator_distanceUnit': 'mi',
        'dwfrm_storelocator_long': -98.117208,
        'dwfrm_storelocator_lat': 37.647131,
    }
    try:
        html = cm.post_data(url, form, cookie=cookie_map)
    except Exception:
        print 'Error occured in getting country list: %s' % url
        dump_data = {
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    store_list = []
    for m1 in re.finditer(ur'<div class="storeColumnOne">', html):
        sub, start, end = cm.extract_closure(html[m1.start():], ur'<div\b',
                                             ur'</div>')
        if end == 0:
            # No closed <div> could be extracted for this store; skip it.
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        m2 = re.search(ur'<div class="storename">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.name_e] = m2.group(1).strip()

        # 'adddressline' (three d's) is the class name the pattern looks for.
        addr_list = re.findall(
            ur'<div class="adddressline">([^<>]+)</div>', sub)
        entry[cm.addr_e] = ', '.join(addr_list)

        m2 = re.search(ur'<div class="citystatezip">([^<>]+)</div>', sub)
        if m2 is not None:
            tmp = cm.reformat_addr(m2.group(1))
            # Tokens are used positionally: city, state, zip.
            # NOTE(review): a city name containing spaces would be split
            # across several tokens here -- confirm the page format.
            terms = re.split('[, ]+', tmp)
            if len(terms) < 3:
                # Too few tokens to interpret: keep the raw text as the
                # address (this replaces the joined address lines).
                entry[cm.addr_e] = tmp
            else:
                ret = gs.look_up(terms[0], 3)
                if ret is not None:
                    entry[cm.city_e] = ret['name_e']
                else:
                    entry[cm.city_e] = terms[0].strip().upper()

                ret = gs.look_up(terms[1], 2)
                if ret is not None:
                    entry[cm.province_e] = ret['name_e']
                else:
                    # Fall back to the raw state token. The previous code
                    # used terms[0] here, i.e. the city.
                    entry[cm.province_e] = terms[1].strip().upper()

                if re.match('\s*\d{5,}\s*', terms[2]) is not None:
                    entry[cm.zip_code] = terms[2].strip()

        m2 = re.search(ur'<div class="storephone">([^<>]+)</div>', sub)
        if m2 is not None:
            entry[cm.tel] = m2.group(1)

        cm.update_entry(entry, {
            'country_e': 'UNITED STATES',
            'continent_e': 'NORTH AMERICA'
        })
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # Failure paths return a list, so the success path must as well rather
    # than falling off the end and returning None.
    return store_list
Esempio n. 52
0
def fetch(level=1, data=None, user='******', passwd=''):
    """Download the Michael Kors store feed and replace the brand's stores.

    Opens a ``cm.StoresDb`` connection (bound to the module-level ``db``),
    deletes the brand's existing rows from 'stores', downloads the JSON store
    list from ``data['url']`` and inserts one record per store. The delete
    runs before the download, so a failed download leaves the brand with no
    rows.

    The database connection is closed on every exit path, including the
    early returns and any exception raised while processing a record.

    Args:
        level: not used by this function's body.
        data: dict with 'url', 'brand_id', 'brandname_e' and 'brandname_c';
            defaults to the Michael Kors settings when None.
        user: database user passed to ``connect_db``.
        passwd: database password passed to ``connect_db``.

    Returns:
        The list of store entries inserted, or an empty list when the feed
        cannot be downloaded or parsed.
    """
    # Walk from the root node, where level == 1.
    if data is None:
        data = {'url': 'http://cms.destinationkors.com/store/get',
                'brand_id': 10259, 'brandname_e': u'Michael Kors', 'brandname_c': u'迈克.柯尔'}

    # The feed's 'store_type' is a 1-based index into this list.
    type_desc = ['Collection Boutique', 'Lifestyle', 'Outlet']

    global db
    db = cm.StoresDb()
    db.connect_db(user=user, passwd=passwd)
    try:
        db.execute(u'DELETE FROM %s WHERE brand_id=%d' % ('stores', data['brand_id']))

        url = data['url']
        try:
            html = cm.get_data(url).decode('unicode_escape')
            # The JSON array starts at the first '['; anything before it is
            # discarded.
            start = html.find('[')
            if start == -1:
                return []
            js = json.loads(html[start:])
        except Exception:
            print 'Error occured: %s' % url
            dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
            cm.dump(dump_data)
            return []

        store_list = []
        for s in js:
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.store_type] = type_desc[string.atoi(s['store_type']) - 1]

            name = s['name'].strip()
            name2 = s['name2'].strip()
            if name2 != '':
                name += ', ' + name2
            entry[cm.name_e] = name

            # Join the non-empty address1..address3 fields.
            addr = []
            for i in (1, 2, 3):
                tmp = s['address%d' % i].strip()
                if tmp != '':
                    addr.append(tmp)
            entry[cm.addr_e] = ', '.join(addr)
            entry[cm.city_e] = cm.extract_city(s['city'])[0]

            # Normalise the country name when the lookup recognises it.
            country = s['country']
            ret = gs.look_up(country, 1)
            if ret is not None:
                country = ret['name_e']
            entry[cm.country_e] = country

            # US states go through the lookup (and are left unset when it
            # fails); any other state value is stored as given.
            state = s['state'].strip().upper()
            if country == 'UNITED STATES' and state != '':
                ret = gs.look_up(state, 2)
                if ret is not None:
                    entry[cm.province_e] = ret['name_e']
            else:
                entry[cm.province_e] = state

            entry[cm.zip_code] = s['zip']
            entry[cm.tel] = s['phone']
            entry[cm.hours] = s['hours']
            entry[cm.lat] = string.atof(s['latitude'])
            entry[cm.lng] = string.atof(s['longitude'])
            gs.field_sense(entry)

            print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                              entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                              entry[cm.continent_e])
            store_list.append(entry)
            db.insert_record(entry, 'stores')

        return store_list
    finally:
        # Previously the connection was closed only on the success path.
        db.disconnect_db()
Esempio n. 53
0
def fetch_stores(data):
    """Scrape a page of China stores laid out as one table per province tab.

    The page at ``data['url']`` lists provinces as
    ``<li><a href="#fragment-N"><span>name</span></a></li>`` links; each
    links to a ``<div id="fragment-N">`` whose ``<tbody>`` holds the store
    rows. A row with exactly four cells starts with a city cell, and that
    city is carried over to the following rows until another four-cell row
    appears. The remaining three cells are read as name, telephone and
    address.

    Each store is inserted into the 'stores' table through the module-level
    ``db``.

    Args:
        data: dict providing 'url', 'brand_id', 'brandname_e' and
            'brandname_c'.

    Returns:
        The list of store entries found, or an empty list when the page
        cannot be downloaded.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # One item per province tab: its display name and the id of its <div>.
    province_list = [{
        cm.province_c: m[1].strip().upper(),
        cm.url: m[0].strip()
    } for m in re.findall(
        ur'<li><a href="#(fragment-\d+)"><span>(.+?)</span></a></li>', html)]

    comment_pat = re.compile(ur'<!--.*?-->', re.S)
    store_list = []

    for p in province_list:
        start = html.find('<div id="%s">' % p[cm.url])
        if start == -1:
            continue
        p_sub, start, end = cm.extract_closure(html[start:], ur'<tbody>',
                                               ur'</tbody>')
        # Drop HTML comments so their contents are not parsed as rows.
        p_sub = comment_pat.sub('', p_sub)

        # Current city; carried from a four-cell row to the rows after it.
        city_c = ''
        city_e = ''
        while True:
            # Consume the table one <tr> at a time.
            s_sub, start, end = cm.extract_closure(p_sub, ur'<tr>', ur'</tr>')
            if end == 0:
                break
            p_sub = p_sub[end:]
            # Skip the header row (it contains the column titles).
            if u'城市' in s_sub and u'店铺名称' in s_sub:
                continue

            term_list = re.findall(ur'<td.*?>(.+?)</td>', s_sub)
            if len(term_list) < 3:
                continue

            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                        data['brandname_c'])

            if len(term_list) == 4:
                city_c = term_list[0].strip()
                # A new city starts here. Clear the English name so that an
                # unrecognised city does not inherit the previous city's
                # name (and so the addr_sense fallback below can fill it).
                city_e = ''
                ret = gs.look_up(city_c, 3)
                if ret is not None:
                    city_e = ret['name_e']
                    city_c = ret['name_c']
                offset = 1
            else:
                offset = 0

            entry[cm.name_c] = cm.html2plain(term_list[offset + 0]).strip()
            entry[cm.tel] = cm.html2plain(term_list[offset + 1]).strip()
            entry[cm.addr_e] = cm.reformat_addr(term_list[offset + 2]).strip()
            entry[cm.country_e] = 'CHINA'
            entry[cm.continent_e] = 'ASIA'

            # Normalise the province name when the lookup recognises it.
            p_name_c = p[cm.province_c]
            p_name_e = ''
            ret = gs.look_up(p_name_c, 2)
            if ret is not None:
                p_name_c = ret['name_c']
                p_name_e = ret['name_e']
            cm.update_entry(
                entry, {
                    cm.province_e: p_name_e,
                    cm.province_c: p_name_c,
                    cm.city_e: city_e,
                    cm.city_c: city_c
                })
            entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

            gs.field_sense(entry)
            # Fill a still-empty province/city from the address text.
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)
            print '(%s / %d) Found store: %s, %s (%s, %s)' % (
                data['brandname_e'], data['brand_id'], entry[cm.name_e],
                entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
            store_list.append(entry)
            db.insert_record(entry, 'stores')

    return store_list
Esempio n. 54
0
            return []
    else:
        body = data['body']

    store_list = []
    city = data['city']
    if city == '':
        m = re.search(ur'<span id="m_sthead"\s*>(.+?)</span>', body)
        if m is not None:
            city = cm.reformat_addr(m.group(1))
    city = city.replace(u'市', u'').strip()
    for m in re.finditer(ur'<span id="m_stname"[^<>]*>(.+?)</span>', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.province_c] = data['province']
        ret = gs.look_up(data['province'], 2)
        if ret is not None:
            entry[cm.province_e] = ret['name_e']
        entry[cm.city_c] = city
        ret = gs.look_up(city, 3)
        if ret is not None:
            entry[cm.city_e] = ret['name_e']

        entry[cm.name_e] = cm.reformat_addr(m.group(1))

        m1 = re.search(ur'<span id="m_stlist"[^<>]*>(.+?)</span>', body[m.end():])
        if m1 is not None:
            addr_list = cm.reformat_addr(m1.group(1)).split(',')
            tel = cm.extract_tel(addr_list[-1]).strip()
            if tel != '':
                del addr_list[-1]