Ejemplo n.º 1
0
    def get_detailed_store(html, store_cat):
        store_list = []
        start = 0
        while True:
            sub_html, start, end = common.extract_closure(html, ur"<li\b", ur"</li>")
            if end == 0:
                break

            # 得到单个门店的页面代码
            html = html[end:]
            entry = common.init_store_entry(brand_id, brandname_e, brandname_c)

            m = re.findall(ur'<div class="store-title -h3a">(.+?)</div>', sub_html)
            if len(m) > 0:
                entry[common.name_e] = common.reformat_addr(m[0])
            m = re.findall(ur'<div class="store-address">(.+?)</div>', sub_html, re.S)
            if len(m) > 0:
                addr = common.reformat_addr(m[0])
                # 最后一行是否为电话号码?
                terms = addr.split(", ")
                tel = common.extract_tel(terms[-1])
                if tel != "":
                    addr = ", ".join(terms[:-1])
                    entry[common.tel] = tel
                entry[common.addr_e] = addr

            # 获得门店类型
            # store_type = [store_cat]
            type_html, type_start, type_end = common.extract_closure(sub_html, ur'<ul class="service-list">', ur"</ul>")
            if type_end != 0:
                store_type = [m for m in re.findall(ur'<li class="service-item">(.+?)</li>', type_html)]
                store_type.insert(0, store_cat)
                entry[common.store_type] = ", ".join(store_type)
            else:
                entry[common.store_type] = store_cat

            # 获得经纬度
            m = re.findall(ur'data-latitude="(-?\d+\.\d+)"', sub_html)
            if len(m) > 0:
                entry[common.lat] = string.atof(m[0])
            m = re.findall(ur'data-longitude="(-?\d+\.\d+)"', sub_html)
            if len(m) > 0:
                entry[common.lng] = string.atof(m[0])

            entry[common.city_e] = common.extract_city(data[common.city_e])[0]
            entry[common.country_e] = common.reformat_addr(data[common.country_e]).strip().upper()
            gs.field_sense(entry)

            print "%s: Found store: %s, %s (%s, %s, %s)" % (
                brandname_e,
                entry[common.name_e],
                entry[common.addr_e],
                entry[common.city_e],
                entry[common.country_e],
                entry[common.continent_e],
            )
            db.insert_record(entry, "stores")
            store_list.append(entry)

        return store_list
Ejemplo n.º 2
0
def fetch_cities(data):
    url = data['home_url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, 'canali_log.txt')
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = body.find(u'<nav class="countrySelector">')
    if start == -1:
        cm.dump('Error occured in fetching country list: %s' % url, 'canali_log.txt')
    body = cm.extract_closure(body[start:], ur'<nav\b', ur'</nav>')[0]

    results = []
    for m in re.finditer(ur'<li><a href=".+?">(.+?)</a>', body):
        country = m.group(1).strip().upper()
        sub = cm.extract_closure(body[m.end():], ur'<ul\b', ur'</ul>')[0]
        for m1 in re.findall(ur'<li><a class=".+?" href="(.+?)">(.+?)</a></li>', sub):
            d = data.copy()
            d['country'] = country
            d['url'] = data['host'] + m1[0]
            d['city'] = m1[1].strip().upper()
            results.append(d)
Ejemplo n.º 3
0
def fetch_stores(data):
    param = {'action': 'getStoresFromAjax', 'country': data['country_code'],
             'region': data['city'], 'collection': ''}
    url = data['url']
    try:
        body = cm.post_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []

    store_list = []
    for m1 in re.finditer(ur'<div class="shop-type-container">', body):
        sub = cm.extract_closure(body[m1.start():], ur'<div\b', ur'</div>')[0]
        store_class = ''
        m2 = re.search(ur'<div class="shop-type-title">(.+?)</div>', sub, re.S)
        if m2 is not None:
            store_class = cm.reformat_addr(m2.group(1))

        for m2 in re.finditer(ur'<div class="shop"', sub):
            store_sub = cm.extract_closure(sub[m2.start():], ur'<div\b', ur'</div>')[0]
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.store_class] = store_class
            entry[cm.country_e] = data['country_code']
            entry[cm.city_e] = cm.extract_city(data['city'])[0]

            m3 = re.search(ur'loadStore\((\d+)\s*,\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)\)', store_sub)
            if m3 is not None:
                data['store_id'] = string.atoi(m3.group(1))
                entry[cm.lat] = string.atof(m3.group(2))
                entry[cm.lng] = string.atof(m3.group(3))
                entry[cm.store_type] = ', '.join(get_detail(data))

            m3 = re.search(ur'<div class="shop-name shop-details shop-main-name">([^<>]+)</div>', store_sub)
            if m3 is not None:
                entry[cm.name_e] = m3.group(1).strip()
            addr_list = []
            m3 = re.search(ur'<div class="shop-street shop-details">([^<>]+)</div>', store_sub)
            if m3 is not None:
                addr_list.append(cm.reformat_addr(m3.group(1)))
            m3 = re.search(ur'<div class="shop-city shop-details">([^<>]+)</div>', store_sub)
            if m3 is not None:
                tmp = cm.reformat_addr(m3.group(1))
                m3 = re.search(ur'(\d{4,})', tmp)
                if m3 is not None:
                    entry[cm.zip_code] = m3.group(1).strip()
                addr_list.append(tmp)
            entry[cm.addr_e] = ', '.join(addr_list)

            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            gs.field_sense(entry)
            cm.dump('(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                    entry[cm.name_e], entry[cm.addr_e],
                                                                    entry[cm.city_e],
                                                                    entry[cm.country_e],
                                                                    entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)
Ejemplo n.º 4
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m1 in re.finditer(ur'<lignecountry\s+titre\s*=\s*"([^"]+)"', body):
        country = m1.group(1).strip().upper()
        if country == 'U.S.A.':
            country = 'US'
        sub_country = cm.extract_closure(body[m1.start():], ur'<lignecountry\b', ur'</lignecountry>')[0]
        for m2 in re.finditer(ur'<lignecity\s+titre\s*=\s*"([^"]+)"', sub_country):
            city = m2.group(1).strip().upper()
            sub_city = cm.extract_closure(sub_country[m2.start():], ur'<lignecity\b', ur'</lignecity>')[0]
            m3 = re.search(ur'<!\[CDATA\[(.+?)\]\]>', sub_city, re.S)
            if m3 is None:
                continue
            sub_city = m3.group(1)
            store_subs = re.split(ur'<\s*h2\s*>\s*LANVIN BOUTIQUE\s*<\s*/h2\s*>', sub_city)
            for s in store_subs:
                if s.strip() == '':
                    continue
                m4 = re.search(ur'<p>(.+?)</p>', s, re.S)
                if m4 is None:
                    continue
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                entry[cm.country_e] = country
                entry[cm.city_e] = city
                s = m4.group(1)
                m4 = re.search(ur'(.+?)\n\s*\n', s, re.S)
                if m4 is not None:
                    entry[cm.addr_e] = cm.reformat_addr(m4.group(1))
                m4 = re.search(ur'Phone:(.+?)\n\s*\n', s, re.S)
                if m4 is not None:
                    entry[cm.tel] = cm.reformat_addr(m4.group(1).strip())
                m4 = re.search(ur'Boutique Hours:(.+?)\n\s*\n', s, re.S)
                if m4 is not None:
                    entry[cm.hours] = cm.reformat_addr(m4.group(1).strip())
                m4 = re.search(ur'Products available:(.+?)\n\s*\n', s, re.S)
                if m4 is not None:
                    entry[cm.store_type] = m4.group(1).strip()
                m4 = re.search(ur'Email:\s*<a href="mailto:([^"]+)">', s)
                if m4 is not None:
                    entry[cm.email] = m4.group(1).strip()
                gs.field_sense(entry)
                ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                gs.field_sense(entry)
                cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                    entry[cm.name_e], entry[cm.addr_e],
                                                                    entry[cm.country_e],
                                                                    entry[cm.continent_e]), log_name)
                db.insert_record(entry, 'stores')
                store_list.append(entry)
Ejemplo n.º 5
0
def fetch_store_details(data):
    url = data['host'] + data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    start = body.find(ur'<h3>available in store</h3>')
    if start != -1:
        type_sub = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0]
        entry[cm.store_type] = ', '.join(
            cm.html2plain(tmp).strip() for tmp in re.findall(ur'<li[^<>]*>(.+?)</li>', type_sub, re.S))

    start = body.find(ur"<div class='gmap_info_box'")
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    raw = json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['table']
    entry[cm.name_e] = cm.html2plain(raw['name'])
    entry[cm.city_e] = data['city'].strip().upper()
    entry[cm.country_e] = data['country'].strip().upper()
    # entry[cm.store_type] = data['store_type']
    entry[cm.addr_e] = cm.reformat_addr(raw['address'])
    m = re.search(re.compile(ur'phone:(.*?)fax:(.*?)', re.I | re.S), raw['phone'])
    if m is not None:
        entry[cm.tel] = m.group(1).strip()
        entry[cm.fax] = m.group(2).strip()
    else:
        m = re.search(re.compile(ur'phone:(.*?)', re.I | re.S), raw['phone'])
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(re.compile(ur'fax:(.*?)', re.I | re.S), raw['phone'])
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
    entry[cm.hours] = raw['hours']
    if raw['lat'] is not None and raw['lat'] != '':
        entry[cm.lat] = string.atof(raw['lat'])
    if raw['lng'] is not None and raw['lng'] != '':
        entry[cm.lat] = string.atof(raw['lng'])
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None:
        entry[cm.province_e] = ret[1]
        gs.field_sense(entry)

    cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e]), log_name)
    db.insert_record(entry, 'stores')
    return [entry]
Ejemplo n.º 6
0
def parse_store(data, body=None):
    """Parse stores from a Drupal "getlocations" page.

    :param data: crawl context; data['url'] is fetched when *body* is None.
    :param body: optional pre-fetched page HTML.

    NOTE(review): as shown, the function only builds latlng_map (address ->
    lat/lng) and then ends without a return, so it yields None; it looks
    truncated -- confirm against the full source before relying on it.
    """
    if body is None:
        url = data['url']
        try:
            body = cm.post_data(url)
        except Exception:
            cm.dump('Error in fetching stores: %s' % url, log_name)
            return []

    # The coordinates live in the inline Drupal.settings JSON blob.
    start = body.find(ur'jQuery.extend(Drupal.settings,')
    latlng_map = {}
    if start != -1:
        # Each latlons item carries [lat, lng, ..., address-html]; the map is
        # keyed by the reformatted address text.
        for item in json.loads(cm.extract_closure(body[start:], ur'\{', ur'\}')[0])['getlocations']['key_1']['latlons']:
            latlng_map[cm.reformat_addr(item[3])] = {'lat': string.atof(item[0]), 'lng': string.atof(item[1])}
Ejemplo n.º 7
0
def fetch_cities(data):
    sql = "SELECT CityUP FROM %s WHERE Country='%s' ORDER BY CityUP ASC" % (tableid, data['country_code'])
    url = (u'%s?sql=%s&key=%s' % (data['data_url'], sql, queryUrlTail)).replace(u' ', u'%20')
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching cities: %s' % url, log_name)
        return []

    results = []
    for c in set([tmp[0] for tmp in json.loads(cm.extract_closure(body, ur'\{', ur'\}')[0])['rows']]):
        d = data.copy()
        d['city'] = c
        results.append(d)
Ejemplo n.º 8
0
def fetch_store_details(data):
    url = '%s/%d' % (data['url'], data['store_id'])
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    m = re.search(ur'<h1 class="with-back-option">\s*([^<>]+)\s*[<>]', body)
    if m is not None:
        entry[cm.name_e] = m.group(1).strip()

    start = body.find(ur'<div class="store-details">')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
        addr = cm.extract_closure(sub, ur'<p\b', ur'</p>')[0]
        m = re.search(ur'<span class="locality">([^<>]+?)</span>', addr)
        if m is not None:
            entry[cm.city_e] = m.group(1).split(',')[0].strip().upper()
        m = re.search(ur'<span class="postal-code">([^<>]+?)</span>', addr)
        if m is not None:
            entry[cm.zip_code] = m.group(1).strip()
        m = re.search(ur'<span class="country-name">([^<>]+?)</span>', addr)
        if m is not None:
            entry[cm.country_e] = m.group(1).strip().upper()
        entry[cm.addr_e] = cm.reformat_addr(addr)

    start = body.find(ur'<div class="contact">')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
        m = re.search(ur'<span class="tel">(.+?)</span>', sub)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'<span class="fax">(.+?)</span>', sub)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()
        m = re.search(ur'<a href="mailto:([^"]+)">Email</a>', sub)
        if m is not None:
            entry[cm.email] = m.group(1).strip()

    start = body.find(ur'<h3>Opening hours</h3>')
    if start != -1:
        tmp = []
        sub = cm.extract_closure(body[start:], ur'<table>', ur'</table>')[0]
        for m in re.findall(ur'<t[hd][^<>]*>([^<>]+)</t[hd]>', sub):
            tmp.append(m)
        entry[cm.hours] = ' '.join(tmp)
Ejemplo n.º 9
0
def fetch_states(data):
    global national_added

    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching states: %s' % url, log_name)
        return []

    national_added = False

    m = re.search(ur'Choose a (state|region|province)', body)
    if m is None:
        d = data.copy()
        d['state'] = ''
        return [d]

    body = cm.extract_closure(body[m.start():], ur'<ul>', ur'</ul>')[0]
    results = []
    for m in re.findall(ur'<a href="([^"]+)">([^<>]+)</a>', body):
        d = data.copy()
        d['url'] = data['host'] + m[0]
        d['state'] = cm.html2plain(m[1]).strip().upper()
        results.append(d)
Ejemplo n.º 10
0
def fetch_cities(data):
    """Walk the expanded <li> city blocks in data['html'] and crawl each city.

    :param data: crawl context with 'html' (country-level list markup) and
        'country_e'; per-city copies gain 'city_e' / 'city_c' / 'province_e'.
    :return: flat list of store entries from fetch_stores over every city.
    """
    html = data['html']

    store_list = []
    while True:
        m = re.search(ur'<li class="expanded"><a href=".*?">(.+?)</a><br\s*?/>', html)
        if m is None:
            break
        html = html[m.start():]

        # Cut out the whole <li>...</li> block and advance past it.
        sub, start, end = cm.extract_closure(html, ur'<li\b', '</li>')
        html = html[end:]

        d = data.copy()
        # Inner markup of the city block, without the heading link and the
        # trailing </li>.
        d['html'] = sub[len(m.group(0)):-len('</li>')]
        terms = m.group(1).strip().upper().split(' ')
        # A trailing Chinese term is taken as the Chinese city name.
        if len(terms) > 1 and cm.is_chinese(terms[-1]):
            d['city_c'] = terms[-1].strip()
            terms = terms[:-1]
        d['city_e'] = ' '.join(terms)
        if d['country_e'] == 'USA':
            # US entries look like "NY - NEW YORK": split state and city.
            m1 = re.search(ur'([A-Z]{2})\s*-\s*(.+)', d['city_e'])
            if m1:
                d['city_e'] = m1.group(2).strip()
                d['province_e'] = m1.group(1).strip()
        print 'Processing %s' % d['city_e']
        store_list.extend(fetch_stores(d))

    return store_list
Ejemplo n.º 11
0
    def fetch_color(cls, response, spider=None):
        """Extract the colour options of the product page in *response*.

        For the 'cn' region the colours come from the colour <select>'s
        option links; otherwise the currently selected option matching a key
        of the inline "var productURLs" JSON is used.
        """
        sel = Selector(response)

        meta = response.meta
        if 'userdata' in meta:
            region = meta['userdata']['region']
        else:
            region = meta['region']

        colors = []
        if region == 'cn':
            try:
                option_texts = sel.xpath('//select[@class="select-color"]/option//a[@href]/text()').extract()
                if option_texts:
                    colors = [cls.reformat(text) for text in option_texts]
            except (TypeError, IndexError):
                pass
        else:
            try:
                idx = response.body.find('var productURLs')
                raw = cm.extract_closure(response.body[idx:], '\{', '\}')[0]
                data = json.loads(raw.replace("'", '"'))
                for color_key in data:
                    nodes = sel.xpath(str.format('//select/option[@value="{0}"]', color_key))
                    if not nodes:
                        continue
                    node = nodes[0]
                    # Only the currently selected colour counts.
                    if node.xpath('@selected'):
                        texts = node.xpath('text()').extract()
                        if texts:
                            colors = [cls.reformat(texts[0])]
            except ValueError:
                pass

        return colors
Ejemplo n.º 12
0
def fetch_countries(data):
    url = data['url']
    try:
        body, cookie = cm.get_data_cookie(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []

    m = re.search(ur'name="form_build_id" value="(.+?)"', body)
    if m is None:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
    data['form_build_id'] = m.group(1)
    if cookie is None:
        data['cookie'] = ''
    else:
        data['cookie'] = cookie

    start = body.find(ur'<select id="edit-countries"')
    if start == -1:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<select\b', ur'</select>')[0]

    results = []
    for m in re.findall(ur'<option.+?value="([A-Z]{3})".*?>(.+?)</option>', body):
        d = data.copy()
        d['country_code'] = m[0]
        d['country'] = m[1].strip()
        print 'Country: %s, %s' % (d['country_code'], d['country'])
        results.append(d)
Ejemplo n.º 13
0
def fetch_stores(data):
    body = data['body']
    start = body.find(u'<ul class="storelist storelist_%s' % data['code'])
    if start == -1:
        cm.dump('Error in finding stores for %s' % data['code'])
        return []
    body = cm.extract_closure(body[start:], ur'<ul\b', ur'</ul>')[0]

    store_list = []
    for m in re.findall(ur'<li class="sitem">(.+?)</li>', body, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m1 = re.search(ur'<h3>(.+?)</h3>', m)
        if m1 is not None:
            entry[cm.name_c] = m1.group(1).strip()
        m1 = re.search(ur'<div class="addr">(.+?)</div>', m)
        if m1 is not None:
            entry[cm.addr_e] = m1.group(1).replace(u'地址:', '').replace(u'地址:', '').strip()
        m1 = re.search(ur'<div class="tel">(.+?)</div>', m)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).replace(u'电话:', '').replace(u'电话:', '').strip()
        entry[cm.city_c] = data['city']
        ret = gs.look_up(data['city'], 3)
        if ret is not None:
            entry[cm.city_e] = ret['name_e']
            entry[cm.city_c] = ret['name_c']
            if ret['province'] != '':
                entry[cm.province_e] = ret['province']['name_e']
        entry[cm.country_e] = u'CHINA'
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), 'canali_log.txt')
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Ejemplo n.º 14
0
def get_frag_countries(url):
    # Fetch the country codes.
    """Return the country names and ids from the fragrance country selector.

    :rtype : [{'id':**, 'country':**}, ...]
    :param url: page containing <select name="country" id="id_country">.
    :return: list of {'id', 'country'} dicts.

    NOTE(review): the failure paths return a ([], False) tuple while the
    success path returns a plain list -- callers unpacking two values will
    break on success; confirm the intended contract.
    """
    try:
        html = common.get_data(url)
    except Exception:
        # NOTE(review): url_fragrance and brand_id come from module scope,
        # and the error message reports url_fragrance rather than the url
        # parameter actually fetched -- confirm this is intentional.
        print 'Error occured: %s' % url_fragrance
        dump_data = {'level': 1, 'time': common.format_time(), 'data': {'url': url_fragrance},
                     'brand_id': brand_id}
        common.dump(dump_data)
        return [], False

    start = html.find('<select name="country" id="id_country">')
    if start == -1:
        return [], False
    sub, s, e = common.extract_closure(html[start:], ur'<select\b', ur'</select>')
    if e == 0:
        return [], False
    return [{'id': string.atoi(m[0]), 'country': m[1].strip().upper()}
            for m in re.findall(ur'<option value="(\d+)".*?>(.+?)</option>', sub)]
Ejemplo n.º 15
0
def fetch_countries(data):
    """Fetch the country chooser page and build one crawl context per country.

    Follows "Object moved" redirect pages recursively; country names are
    normalised through the gazetteer when possible.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # Handle server-side redirect pages.
    redirect = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if redirect is not None:
        data['url'] = data['host'] + redirect.group(1)
        return fetch_countries(data)

    anchor = re.search('<span class="country">Choose a country</span>', html)
    if anchor is None:
        return []
    sub, start, end = cm.extract_closure(html[anchor.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for href, label in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        d = data.copy()
        name = cm.html2plain(label).strip().upper()
        ret = gs.look_up(name, 1)
        if ret is not None:
            name = ret['name_e']
        d['country_e'] = name
        d['province_e'] = ''
        d['url'] = data['host'] + href
        country_list.append(d)
    return country_list
Ejemplo n.º 16
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<item id="\d+">', body):
        sub = cm.extract_closure(body[m.start():], ur'<item\b', ur'</item>')[0]
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m1 = re.search(ur'<country>([^<>]+)</country>', sub)
        if m1 is not None:
            tmp = m1.group(1).split('/')
            for v in tmp:
                ret = gs.look_up(v.strip().upper(), 1)
                if ret is not None:
                    entry[cm.country_e] = ret['name_e']
                    break
        m1 = re.search(ur'<city>([^<>]+)</city>', sub)
        if m1 is not None:
            val = cm.reformat_addr(m1.group(1))
            if entry[cm.country_e] == 'UNITED STATES':
                tmp_list = tuple(tmp.strip() for tmp in cm.reformat_addr(val).strip(','))
                if len(tmp_list) == 2:
                    if re.search('[A-Z]{2}', tmp_list[1]):
                        entry[cm.province_e] = tmp_list[1]
            entry[cm.city_e] = cm.extract_city(m1.group(1))[0]
        m1 = re.search(ur'<brands>([^<>]+)</brands>', sub)
        if m1 is not None:
            tmp = m1.group(1).split('/')
            brand_list = []
            for v in tmp:
                if v.strip() != '':
                    brand_list.append(v)
            entry[cm.store_type] = ', '.join(brand_map[key] for key in brand_list)
        m1 = re.search(ur'<name>([^<>]+)</name>', sub)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1).strip()
        m1 = re.search(ur'<address>([^<>]+)</address>', sub)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'<tel>([^<>]+)</tel>', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).strip()
        m1 = re.search(ur'sll=(-?\d+\.\d+),(-?\d+\.\d+)', sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))
        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None:
            entry[cm.province_e] = ret[1]
            gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Ejemplo n.º 17
0
def fetch_countries(data):
    """Return one crawl context per country from the country chooser page.

    "Object moved" redirect pages are followed by recursing with the new
    URL; gazetteer hits replace the scraped country name.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # The server may answer with an "Object moved" page: follow it.
    moved = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if moved is not None:
        data['url'] = data['host'] + moved.group(1)
        return fetch_countries(data)

    marker = re.search('<span class="country">Choose a country</span>', html)
    if marker is None:
        return []
    sub, start, end = cm.extract_closure(html[marker.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    country_list = []
    for link in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        d = data.copy()
        country_e = cm.html2plain(link[1]).strip().upper()
        hit = gs.look_up(country_e, 1)
        if hit is not None:
            country_e = hit['name_e']
        d['country_e'] = country_e
        d['province_e'] = ''
        d['url'] = data['host'] + link[0]
        country_list.append(d)
    return country_list
Ejemplo n.º 18
0
    def fetch_name(cls, response, spider=None):
        """Extract the product name from the page in *response*.

        CN pages carry the name in a hidden SKU input; other regions embed
        it in the inline "var productJSONObject" JSON.
        """
        sel = Selector(response)

        meta = response.meta
        if 'userdata' in meta:
            region = meta['userdata']['region']
        else:
            region = meta['region']

        name = None
        if region == 'cn':
            try:
                nodes = sel.xpath('//div[@id="hidden_sku_value"]/input[@id="title" and @value]')
                if nodes:
                    name = unicodify(nodes[0]._root.attrib['value'])
            except (TypeError, IndexError):
                pass
        else:
            try:
                # Product info lives in var productJSONObject.
                mt = re.search(r'var\s+productJSONObject\s*=', response.body)
                if mt:
                    raw = cm.extract_closure(response.body[mt.end():], "{", "}")[0]
                    data = json.loads(raw.replace(r'\"', '"').replace(r"\'", "'"))
                    if 'productName' in data:
                        name = cls.reformat(data['productName'])
            except (TypeError, IndexError, ValueError):
                pass

        return name
Ejemplo n.º 19
0
def get_store_list(data):
    """Return the store list, where each store carries its country.

    :rtype : [{'name':'store name', 'url':'http://...', 'city':'NEW YORK', 'country':'AUSTRALIA'}, ...]
    :param data: crawl context with 'url'.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<ul>\s+?<h3 class="country-name">(.+?)</h3>', html, re.S):
        sub, start, end = cm.extract_closure(html[m.start():], ur'<ul>', ur'</ul>')
        if end == 0:
            continue
        # Split the <ul> at each country heading.
        splits = [[m1.start(), m1.group(1)] for m1 in re.finditer(ur'<h3 class="country-name">(.+?)</h3>', sub)]
        splits.append([-1, ''])
        for i in xrange(len(splits) - 1):
            # Search within a single country's segment.
            sub1 = sub[splits[i][0]:splits[i + 1][0]]
            country = splits[i][1].upper()
            for m1 in re.findall(ur'<li>\s*?<a href="(http://us.christianlouboutin.com/us_en/storelocator/\S+?)">'
                                 ur'(.+?)</a>,(.+?)</li>', sub1):
                store_list.append({'name': m1[1].strip(), 'url': m1[0], 'city': m1[2].strip().upper(),
                                   'country': country})
    # Fix: the original fell off the end and returned None.
    return store_list
Ejemplo n.º 20
0
def fetch_countries(data):
    """Parse the country <select> and return one crawl context per country.

    :return: list of data copies with 'country_code' plus gazetteer-resolved
        country/continent names ([] on failure).
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = html.find('<select name="country" id="inp-country"')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<select\b', ur'</select>')
    if end == 0:
        return []
    country_list = []
    for m in re.findall(ur'<option value="([A-Z]{2})">(.*?)</option>', sub):
        d = data.copy()
        d['country_code'] = m[0]
        d[cm.country_c] = m[1].strip()
        for key in [cm.country_e, cm.continent_e, cm.continent_c]:
            d[key] = ''
        # Resolve names and continent through the gazetteer when possible.
        ret = gs.look_up(d['country_code'], 1)
        if ret is not None:
            d[cm.country_e] = ret['name_e']
            d[cm.country_c] = ret['name_c']
            d[cm.continent_c] = ret['continent']['name_c']
            d[cm.continent_e] = ret['continent']['name_e']

        country_list.append(d)
    # Fix: the original fell off the end and returned None.
    return country_list
Ejemplo n.º 21
0
    def fetch_color(cls, response, spider=None):
        """Extract the swatch colours from the product page in *response*.

        :return: list of lower-cased colour names; None when the inline
            product JSON exists but cannot be parsed.
        """
        # NOTE(review): sel is never used in this method -- kept in case
        # Selector construction is relied on elsewhere; confirm and remove.
        sel = Selector(response)

        region = None
        if 'userdata' in response.meta:
            region = response.meta['userdata']['region']
        else:
            region = response.meta['region']

        colors = []
        if region != 'cn':
            try:
                # Product info lives in the inline "var productJSONObject" JSON.
                mt = re.search(r'var\s+productJSONObject\s*=', response.body)
                if mt:
                    data = json.loads(cm.extract_closure(response.body[mt.end():], "{", "}")[0].replace(r'\"',
                                                                                                        '"').replace(r"\'", "'"))
                    colors = [cls.reformat(swatch['color']).lower() for swatch in data['swatchGroup']['swatches']
                              if 'color' in swatch]
            except (KeyError, ValueError, TypeError, IndexError):
                # None marks "parse failed", distinct from "no colours found".
                # (Fix: dropped the redundant `pass` after this assignment.)
                colors = None
        else:
            # TODO: no code found in the original spider that parses the
            # colour of CN products.
            pass

        return colors
Ejemplo n.º 22
0
def fetch_store_details(data):
    """POST the store-search form and store the parsed result in 'stores'.

    data must carry: url, country_code, city, brand_id, brandname_e, brandname_c.
    Returns a one-element list with the parsed entry, or [] on failure.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {'cCode': data['country_code'], 'city': data['city'], 'postsearch': 1})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = body.find('<div class="store_locator')
    if start == -1:
        print 'Failed processing %s' % url
        return []
    sub, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S)
    if m is not None:
        addr_list = cm.reformat_addr(m.group(1)).split(', ')
        # The last address line may actually be a phone number.
        ret = cm.extract_tel(addr_list[-1])
        if ret != '':
            entry[cm.tel] = ret
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)

    # BUG FIX: the original read m.end() unconditionally and crashed with
    # AttributeError whenever the Address paragraph was missing. Fall back to
    # scanning the whole extracted block in that case.
    addr_text = sub[m.end():] if m is not None else sub
    m = re.search(ur'<div class="title locator">', addr_text)
    if m is not None:
        tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

    # Latitude / longitude from the embedded Google Maps call.
    m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)', body, re.S)
    if m is not None:
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))

    entry[cm.country_e] = data['country_code']
    entry[cm.city_e] = data['city']
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                      entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                      entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
Ejemplo n.º 23
0
def get_store_details(data):
    """Fetch one store page, parse it, and save the entry in 'stores'.

    data must carry: url, name, country, city.
    NOTE(review): brand_id / brandname_e / brandname_c are read from module
    globals that are not defined in this chunk -- confirm they exist at module
    level before calling.
    Returns the entry dict on success, [] on failure (mixed return types).
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    entry[cm.name_e] = data['name']
    entry[cm.url] = data['url']
    # The address lives in the breadcrumbs list at the top of the page.
    start = html.find(ur'<div class="storelocator-breadcrumbs">')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>')
    if end == 0:
        return []
    # The last <li>...</li> holds the address line.
    m = re.findall(ur'<li>(.+?)</li>', sub, re.S)
    if len(m) > 0:
        entry[cm.addr_e] = cm.reformat_addr(m[-1])
    # Latitude / longitude from the embedded Google Maps call.
    m = re.findall(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html)
    if len(m) > 0:
        cm.update_entry(entry, {cm.lat: string.atof(m[0][0]), cm.lng: string.atof(m[0][1])})

    m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S)
    if m is not None:
        contact_sub = m.group(1)
        pat_tel = re.compile(ur'<p class="phone">(.+?)</p>')
        m1 = re.search(pat_tel, contact_sub)
        if m1:
            entry[cm.tel] = cm.extract_tel(m1.group(1))
            # Strip the phone paragraph so only the opening hours remain.
            contact_sub = re.sub(pat_tel, '', contact_sub)
        hours_list=[tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',')]
        if 'opening hours' in hours_list[0].lower():
            del hours_list[0]
        entry[cm.hours] = ', '.join(hours_list)

    # Geo: fill country/city, then let gs refine province/city from the address.
    country = data['country']
    city = data['city']
    cm.update_entry(entry, {cm.country_e: country, cm.city_e: city})
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
        entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return entry
Ejemplo n.º 24
0
    def parse_product_details(self, response):
        """Parse a product detail page into a ProductItem.

        Fills metadata (name/model/price/description/details/color) via the
        fetch_* helpers and pulls image URLs from the inline jsoninit_item
        blob; bails out early when no model code can be found.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)

        metadata['url'] = response.url

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            # Without a model code the item cannot be identified -- skip it.
            return

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        # Image URLs come from the AVAILABLEZOOM section of the inline
        # "var jsoninit_item" JSON blob.
        image_urls = []
        mt = re.search(r'var\s+jsoninit_item', response.body)
        if mt:
            idx = response.body[mt.regs[0][1]:].find('AVAILABLEZOOM')
            if idx != -1:
                idx += mt.regs[0][1]
                tmp = json.loads(
                    cm.extract_closure(response.body[idx:], '{',
                                       '}')[0].replace("'", '"'))
                for c in tmp:
                    model = metadata['model']
                    if re.search(c + '$', model, flags=re.I):
                        # Found the zoomed (full-size) images for this code.
                        image_urls = [
                            str.format('http://cdn.yoox.biz/{0}/{1}_{2}.jpg',
                                       model[:2], model, val) for val in tmp[c]
                        ]

        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        return item
Ejemplo n.º 25
0
def fetch_uk(body, data):
    """Parse the 'fableft' store listing for one country and save every store.

    data must carry: name (country), brand_id, brandname_e, brandname_c.
    Returns the list of store entries (BUG FIX: store_list was built but never
    returned, so callers always got None). Also skips malformed blocks that
    previously raised IndexError on short <p> lists.
    """
    start = body.find(u'<div class="fableft">')
    if start == -1:
        print "Error in finding %s stores" % data["name"]
        return []
    body, start, end = cm.extract_closure(body[start:], ur"<div\b", ur"</div>")
    if end == 0:
        print "Error in finding %s stores" % data["name"]
        return []

    store_list = []
    for m in re.findall(ur"<div>\s*(.+?)\s*</div>", body, re.S):
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])
        entry[cm.country_e] = data["name"]

        addr_list = re.findall(ur"<p>\s*(.+?)\s*</p>", m)
        if not addr_list:
            # Malformed block with no <p> lines -- nothing to parse.
            continue
        # The last address line may actually be a phone number.
        tel = cm.extract_tel(addr_list[-1])
        if tel != "":
            entry[cm.tel] = tel
            del addr_list[-1]

        if data["name"] == "AUSTRALIA":
            country, province, city = gs.addr_sense(", ".join(addr_list), data["name"])
            if city is not None:
                entry[cm.city_e] = city
            if province is not None:
                entry[cm.province_e] = province
        elif len(addr_list) >= 2:
            # Second-to-last line is the city, the last line the postcode.
            city = addr_list[-2].strip().upper()
            entry[cm.city_e] = city
            ret = gs.look_up(city, 3)
            if ret is not None and ret["country"]["name_e"] == gs.look_up("UK", 1)["name_e"]:
                entry[cm.city_e] = ret["name_e"]
            entry[cm.zip_code] = addr_list[-1].strip().upper()
        entry[cm.addr_e] = ", ".join(addr_list)
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == "":
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == "":
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print "(%s / %d) Found store: %s, %s (%s, %s, %s)" % (
            data["brandname_e"],
            data["brand_id"],
            entry[cm.name_e],
            entry[cm.addr_e],
            entry[cm.city_e],
            entry[cm.country_e],
            entry[cm.continent_e],
        )

        db.insert_record(entry, "stores")
        store_list.append(entry)

    # BUG FIX: hand the collected entries back to the caller.
    return store_list
Ejemplo n.º 26
0
def fetch_stores(data):
    url = data['url']
    param = {'country_id': data['country_code'], 'city': '', 'label_id': '', 'lang': 'en'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []

    start = body.find(ur'<stores>')
    if start == -1:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<stores>', ur'</stores>')[0]

    store_list=[]
    for m in re.findall(ur'<store\b[^<>]+>(.+?)</store>', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country_code']
        m1 = re.search(ur'<name>(.+?)</name>', m)
        if m1 is not None:
            entry[cm.name_e] = cm.reformat_addr(m1.group(1).strip())
        m1 = re.search(ur'<address>(.+?)</address>', m)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1).strip())
        m1 = re.search(ur'<city>(.+)</city>', m)
        if m1 is not None:
            entry[cm.city_e] = cm.extract_city(m1.group(1))[0]
        m1 = re.search(ur'<zip>(.+?)</zip>', m)
        if m1 is not None:
            entry[cm.zip_code] = m1.group(1).strip()
        m1 = re.search(ur'<tel>(.+?)</tel>', m)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).strip()
        m1 = re.search(ur'<fax>(.+?)</fax>', m)
        if m1 is not None:
            entry[cm.fax] = m1.group(1).strip()
        m1 = re.search(ur'<email>(.+?)</email>', m)
        if m1 is not None:
            entry[cm.email] = m1.group(1).strip()
        m1 = re.search(ur'<link>(.+?)</link>', m)
        if m1 is not None:
            entry[cm.url] = m1.group(1).strip()

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
Ejemplo n.º 27
0
def fetch_store_list(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []

    start = body.find(ur"'country_select'")
    if start == -1:
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
    country_raw = json.loads(cm.extract_closure(body[start:], ur'\[', ur'\]')[0])
    country_map = {}
    for c in country_raw:
        country_map[string.atoi(c['id'])] = c['name']

    start = body.find(ur'loadQuickSearch')
    if start == -1:
        cm.dump('Error in fetching store list: %s' % url, log_name)
        return []
    raw = json.loads(cm.extract_closure(body[start:], ur'\[', ur'\]')[0])

    city_map = {}
    results = []
    for item in raw:
        if item['type'] == 'city':
            country = country_map[string.atoi(item['parent_id'])]
            city_map[string.atoi(item['id'])] = {'name': item['name'], 'country': country}

    for item in raw:
        if item['type'] == 'store':
            d = data.copy()
            d['name'] = item['name']
            d['city'] = dict(city_map[string.atoi(item['parent_id'])])
            d['url'] = data['url'] + item['store_url_alias']
            d['id'] = string.atoi(item['id'])
            results.append(d)
        elif item['type'] == 'city':
            continue

    return results
Ejemplo n.º 28
0
def fetch_cities(data):
    """Build country/city id maps from the store-locator country menu.

    NOTE(review): this chunk ends right after the maps are filled; the code
    that turns them into a return value appears to be truncated here.
    """
    # url = data['post_url']
    # try:
    #     action=yoox_storelocator_change_country&country_id=3125&dataType=JSON
    #     js = json.loads(cm.post_data(url, {'action': 'yoox_storelocator_change_country',
    #                                        'country_id': ,
    #                                        'retail_type': data['retail_type']}).decode('unicode_escape'))
    # except Exception:
    #     print 'Error occured in getting country list: %s' % url
    #     dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
    #     cm.dump(dump_data)
    #     return []

    url = data['home_url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # Build the country and city lookup tables.
    country_map = {}
    city_map = {}

    start = html.find('<div id="storelocator-box-select-country"')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<div\b', ur'</div>')
    # depth-1 anchors are countries; the <ul> right after each lists its cities.
    for m1 in re.finditer(ur'<a href=".+?" class="depth-1" data-store-id="(\d+)">(.+?)</a>', sub):
        country_id = string.atoi(m1.group(1))
        country_e = m1.group(2).strip().upper()
        country_map[country_id] = country_e

        city_sub, s1, e1 = cm.extract_closure(sub[m1.end():], ur'<ul\b', ur'</ul>')
        for m2 in re.findall(ur'<li class=".+?"><a href=".+?" class="depth-2" data-store-id="(\d+)">(.+?)</a></li>',
                             city_sub):
            city_id = string.atoi(m2[0])
            city_e = m2[1].strip().upper()
            city_map[city_id] = {'city_e': city_e, 'parent': country_id}
Ejemplo n.º 29
0
    def fetch_price(cls, response, spider=None):
        """Return price information for a product page.

        Non-China regions: parse the inline productJSONObject blob and return
        a dict with optional 'price' / 'price_discount' keys.
        China ('cn'): POST the SKU code to the price service and return the
        scrapy Request instead (mixed return types: dict or Request).
        """
        sel = Selector(response)
        ret = {}

        region = None
        if 'userdata' in response.meta:
            region = response.meta['userdata']['region']
        else:
            region = response.meta['region']

        old_price = None
        new_price = None
        if region != 'cn':
            # Product info lives in the inline "var productJSONObject" blob.
            mt = re.search(r'var\s+productJSONObject\s*=', response.body)
            if mt:
                try:
                    data = json.loads(cm.extract_closure(response.body[mt.end():], "{", "}")[0].replace(r'\"',
                                                                                                        '"').replace(r"\'",
                                                                                                                 "'"))
                except(TypeError, IndexError, ValueError):
                    return ret
                # Price info: listPrice maps to 'price', unitPrice to
                # 'price_discount'; only the first swatch with a listPrice wins.
                try:
                    for item in data['swatchGroup']['swatches']:
                        if 'listPrice' in item:
                            old_price = cls.reformat(item['listPrice'])
                            if 'unitPrice' in item:
                                new_price = cls.reformat(item['unitPrice'])
                            break
                except KeyError:
                    pass
        else:
            tmp = sel.xpath('//div[@id="hidden_sku_value"]/input[@id="skuCode" and @value]')
            sku_code = None
            if tmp:
                sku_code = tmp[0]._root.attrib['value']
            if sku_code:
                # Ask the price service for this SKU asynchronously.
                return Request(url=cls.spider_data['price_url'][region], method='POST', dont_filter=True,
                               body=str.format('skuCode={0}', sku_code), callback=cls.fetch_price_request,
                               errback=spider.onerror,
                               headers={'Content-Type': 'application/x-www-form-urlencoded',
                                        'Accept-Encoding': 'gzip,deflate,sdch',
                                        'X-Requested-With': 'XMLHttpRequest', 'Accept': '*/*'},
                               meta=response.meta)

        if old_price:
            ret['price'] = old_price
        if new_price:
            ret['price_discount'] = new_price

        return ret
Ejemplo n.º 30
0
def fetch_store_list(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store lists: %s' % url, log_name)
        return []

    start = body.find(ur"<div class='store-country'>")
    if start == -1:
        cm.dump('Error in fetching store lists: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    start_stores = body.find(ur'<h3><a href="/store-locator/index">Stores</a></h3>')
    start_outlets = body.find(ur"<h3 class='outlets'>")
    store_sub = body[start_stores:start_outlets]
    outlet_sub = body[start_outlets:]

    results = []
    for m1 in re.finditer(ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="country">([^<>]+)</a>', store_sub):
        country_id = string.atoi(m1.group(1))
        country = m1.group(2).strip()
        sub1 = cm.extract_closure(store_sub[m1.end():], ur'<ul>', ur'</ul>')[0]
        for m2 in re.finditer(ur'<a [^<>]*data-id="([^"]+)"[^<>]*data-type="city">([^<>]+)</a>', sub1):
            city_id = string.atoi(m2.group(1))
            city = m2.group(2).strip()
            sub2 = cm.extract_closure(sub1[m2.end():], ur'<ul>', ur'</ul>')[0]
            for m3 in re.finditer(ur'<a href="([^"]+)"[^<>]*data-id="([^"]+)"[^<>]*data-type="store">([^<>]+)</a>',
                                  sub2):
                d = data.copy()
                d['country_id'] = country_id
                d['country'] = country
                d['city_id'] = city_id
                d['city'] = city
                d['url'] = m3.group(1).strip()
                d['store_id'] = string.atoi(m3.group(2))
                d['store'] = cm.html2plain(m3.group(3).strip())
                # d['store_type'] = 'store'
                results.append(d)
Ejemplo n.º 31
0
def get_subcat(html, pat):
    """
    比如,dunhill需要单独拿出来
    :param html:
    :param pat:
    """
    it = re.finditer(pat, html)
    try:
        m = it.next()
        sub_html, start, end = common.extract_closure(html[m.start() :], ur"<ul\b", ur"</ul>")
        return sub_html
    except StopIteration:
        return ""
Ejemplo n.º 32
0
def fetch_countries(data):
    """Collect per-country work items from the shop-locator filter block.

    Returns the list of country dicts (BUG FIX: country_list was built but
    never returned, so callers always got None).
    NOTE(review): only SINGAPORE entries are appended -- this filter looks
    deliberate (limited run); confirm before widening it.
    """
    url = data['home_url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = html.find(u'<div id="block-ps-shop-locator-shop-locator-filters"')
    if start == -1:
        return []
    html, start, end = cm.extract_closure(html[start:], ur'<div\b', ur'</div>')
    if end == 0:
        return []

    country_list = []

    # Each <h3> is a continent header followed by a <ul> of country links.
    for m in re.finditer(ur'<h3>(.+?)</h3>', html):
        continent_e = m.group(1).strip().upper()
        if continent_e == u'UK' and False:
            # Deliberately disabled ("and False") special case for the UK page.
            d = data.copy()
            d[cm.continent_e] = u'EUROPE'
            d[cm.country_e] = u'UNITED KINGDOM'
            d[cm.url] = data['host'] + '/uk-en/shop-locator/gb/all'
            country_list.append(d)
        else:
            sub, start, end = cm.extract_closure(html[m.end():], ur'<ul\b', ur'</ul>')
            if end == 0:
                continue
            # e.g. <a href="/uk-en/shop-locator/fr/all">France</a>
            for m1 in re.findall(ur'<a href="(.+?)">(.+?)</a>', sub):
                d = data.copy()
                d[cm.continent_e] = continent_e
                d[cm.country_e] = m1[1].strip().upper()
                d[cm.url] = data['host'] + m1[0]
                if d[cm.country_e] == 'SINGAPORE':
                    country_list.append(d)

    # BUG FIX: hand the collected work items back to the caller.
    return country_list
Ejemplo n.º 33
0
def fetch_world(body, data):
    """Locate the per-country <h2> section offsets in the world store page.

    NOTE(review): the chunk ends right after idx_list is built; the code that
    consumes these offsets appears to be truncated here.
    """
    start = body.find(u'<div class="fableft">')
    if start == -1:
        print "Error in finding %s stores" % data["name"]
        return []
    body, start, end = cm.extract_closure(body[start:], ur"<div\b", ur"</div>")
    if end == 0:
        print "Error in finding %s stores" % data["name"]
        return []

    # Record the end offset and upper-cased title of every <h2> section.
    idx_list = []
    for m in re.finditer(ur"<h2>(.+?)</h2>", body):
        idx_list.append({"idx": m.end(), "name": m.group(1).strip().upper()})
Ejemplo n.º 34
0
def fetch_store_details(data):
    """Parse one store page (address / contact / opening hours) into an entry.

    NOTE(review): the chunk ends after the opening-hours block; the code that
    saves or returns the entry appears to be truncated here.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    # Address block (schema.org itemprop markup).
    start = body.find(ur'<div class="col first" itemprop="address"')
    if start == -1:
        cm.dump('Error in fetching store details: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

    addr = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
    m = re.search(ur'<span itemprop="postalCode">([^<>]+)</span>', addr, re.S)
    if m is not None:
        entry[cm.zip_code] = m.group(1).strip()
    entry[cm.addr_e] = cm.reformat_addr(addr)

    # Phone / fax from the contactPoints block, when present.
    start = body.find(ur'<div class="col" itemprop="contactPoints"')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<p>', ur'</p>')[0]
        m = re.search(ur'<span itemprop="telephone">([^<>]+)</span>', sub, re.S)
        if m is not None:
            entry[cm.tel] = m.group(1).strip()
        m = re.search(ur'<span itemprop="faxNumber">([^<>]+)</span>', sub, re.S)
        if m is not None:
            entry[cm.fax] = m.group(1).strip()

    # Opening hours: join all table cells into a single string.
    start = body.find(ur'<h2>opening hours</h2>')
    if start != -1:
        sub = cm.extract_closure(body[start:], ur'<table\b', ur'</table>')[0]
        tmp = []
        for m in re.findall(ur'<td>(.+?)</td>', sub):
            tmp.append(cm.html2plain(m).strip())
        entry[cm.hours] = ' '.join(tmp)
Ejemplo n.º 35
0
def fetch_indv(data):
    """Index the <h2> section headers inside the page's <article> block.

    NOTE(review): the chunk ends right after tmp is built; the code that
    consumes these section offsets appears to be truncated here.
    """
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    body, start, end = cm.extract_closure(body, ur'<article\b', ur'</article>')
    # Record start/end offsets and the upper-cased title of every <h2> section.
    tmp = []
    for m in re.finditer(ur'<h2>\s*(.+?)\s*</h2>', body):
        tmp.append({'idx1': m.start(), 'idx2': m.end(), 'name': m.group(1).strip().upper()})
Ejemplo n.º 36
0
def fetch_hk(data):
    loc_list = ('Hong Kong', 'Kowloon', 'Macau', 'New Territories')
    url = 'http://levi.com.hk/hk/storelocator'
    store_list = []
    for loc in loc_list:
        param = {'loc': loc}
        try:
            body = cm.get_data(url, param)
        except Exception, e:
            cm.dump('Error in fetching stores: %s' % param, log_name)
            continue

        start = body.find(ur'<div id="addWrapper">')
        if start == -1:
            cm.dump('Error in fetching stores: %s' % param, log_name)
            continue
        sub = cm.extract_closure(body[start:], ur'<ul>', ur'</ul>')[0]
        for s in re.findall(ur'<li>(.+?)</li>', sub, re.S):
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.country_e] = 'MACAU' if loc == 'Macau' else 'HONG KONG'
            entry[cm.city_e] = entry[cm.country_e]

            m = re.search(ur'<div id="addStore">([^<>]+)', s)
            entry[cm.addr_e] = cm.html2plain(m.group(1)) if m else ''

            m = re.search(ur'<div id="addAddress">([^<>]+)', s)
            tmp = cm.html2plain(m.group(1))
            pat = re.compile(ur'business hours?\s*[:\.]?\s*', re.I)
            if re.search(pat, tmp):
                entry[cm.hours] = re.sub(pat, '', tmp).strip()

            m = re.search(ur'<div id="addPhone">([^<>]+)', s)
            tmp = cm.html2plain(m.group(1))
            pat = re.compile(ur'(tel|phone|telephone)?\s*[:\.]?\s*', re.I)
            if re.search(pat, tmp):
                entry[cm.tel] = re.sub(pat, '', tmp).strip()

            gs.field_sense(entry)
            ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
            if ret[1] is not None and entry[cm.province_e] == '':
                entry[cm.province_e] = ret[1]
            if ret[2] is not None and entry[cm.city_e] == '':
                entry[cm.city_e] = ret[2]
            gs.field_sense(entry)

            cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)
Ejemplo n.º 37
0
def fetch_states(data):
    print '(%s/%d) Found country: %s' % (data['brandname_e'], data['brand_id'],
                                         data['country_e'])
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # 处理重定向
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)

    m = re.search('<span class="state">Choose a state/provence</span>', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'<ul\b', r'</ul>')
    if end == 0:
        return []

    state_list = []
    for m in re.findall('<li><a .*?href="(.+?)">(.+?)</a></li>', sub):
        province_e = cm.html2plain(m[1]).strip().upper()
        if data['country_e'] == 'CHINA':
            # 去掉省中间的空格
            province_e = province_e.replace(' ', '')
        ret = gs.look_up(province_e, 2)
        if ret is not None:
            province_e = ret['name_e']
        d = data.copy()
        d['province_e'] = province_e
        d['url'] = data['host'] + m[0]
        state_list.append(d)

    return state_list
Ejemplo n.º 38
0
    def fetch_model(cls, response, spider=None):
        """Extract the product model code from the jsoninit_dejavu blob.

        Returns the upper-cased code, or None when the blob is absent or has
        no usable 'cod10' value.
        """
        sel = Selector(response)

        model = None
        try:
            mt = re.search(r'var\s+jsoninit_dejavu\s*=\s*\{\s*ITEM:',
                           response.body)
            if not mt:
                return
            blob = cm.extract_closure(response.body[mt.regs[0][1]:], '{', '}')[0]
            payload = json.loads(blob)
            model = payload.get('cod10')
        except (TypeError, IndexError):
            pass

        return model.upper() if model else None
Ejemplo n.º 39
0
    def parse_details(self, response):
        """Parse a product page and yield a ProductItem.

        Bails out when no model code can be found. Image URLs come from the
        largeimage:'...' fragments inside the rel attribute of the thumbnail
        links.
        """
        metadata = response.meta['userdata']
        metadata['url'] = response.url
        sel = Selector(response)

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            # Without a model code the item cannot be identified -- skip it.
            return

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        # Pull the quoted URL out of each largeimage fragment, de-duplicated.
        image_urls = []
        for tmp in sel.xpath('//a[@href and @class="switchACss" and @rel]/@rel').extract():
            try:
                idx = tmp.find('largeimage')
                if idx == -1:
                    continue
                image_url = self.process_href(cm.extract_closure(tmp[idx:], "'", "'")[0][1:-1], response.url)
                if image_url not in image_urls:
                    image_urls.append(image_url)
            except (KeyError, ValueError, IndexError):
                continue

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['image_urls'] = image_urls
        item['metadata'] = metadata
        yield item
Ejemplo n.º 40
0
    def fetch_price(cls, response, spider=None):
        """Return {'price': ...} parsed from the jsoninit_dejavu blob.

        Returns an empty dict when no price is found; a discounted price is
        never produced by this variant of the page.
        """
        sel = Selector(response)
        ret = {}

        old_price = None
        new_price = None
        try:
            mt = re.search(r'var\s+jsoninit_dejavu\s*=\s*\{\s*ITEM:',
                           response.body)
            if not mt:
                return
            payload = json.loads(
                cm.extract_closure(response.body[mt.regs[0][1]:], '{', '}')[0])
            old_price = payload.get('price')
        except (TypeError, IndexError):
            pass

        if old_price:
            ret['price'] = old_price
        if new_price:
            ret['price_discount'] = new_price

        return ret
Ejemplo n.º 41
0
    def parse_details_us(self, response):
        """Parse a US product page, fanning out to its other colour variants.

        Unselected colour options yield follow-up Requests; the selected one
        fills metadata['color'] directly. Yields a ProductItem for this page.
        """
        metadata = response.meta['userdata']
        sel = Selector(response)
        metadata['url'] = response.url

        # Look for the other colour variants of this product.
        try:
            idx = response.body.find('var productURLs')
            # NOTE(review): '\{' / '\}' are passed as literal two-character
            # delimiters to extract_closure -- confirm that is what the helper
            # expects (elsewhere plain '{' / '}' are used).
            data = json.loads(cm.extract_closure(response.body[idx:], '\{', '\}')[0].replace("'", '"'))
            for color_key in data:
                tmp = sel.xpath(str.format('//select/option[@value="{0}"]', color_key))
                if not tmp:
                    continue
                color_node = tmp[0]
                # Is this the currently selected colour?
                if not color_node.xpath('@selected'):
                    m = copy.deepcopy(metadata)
                    tmp = color_node.xpath('text()').extract()
                    if tmp:
                        m['color'] = [self.reformat(tmp[0])]
                    yield Request(url=self.process_href(data[color_key], response.url),
                                  callback=self.spider_data['callbacks'][metadata['region']][2],
                                  errback=self.onerr, meta={'userdata': m})
                else:
                    tmp = color_node.xpath('text()').extract()
                    if tmp:
                        metadata['color'] = [self.reformat(tmp[0])]
        except ValueError:
            pass

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            # Without a model code the item cannot be identified -- skip it.
            return

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        # Prefer the data-url attribute; otherwise derive the zoom image name
        # from the thumbnail src ("name.ext" -> "name_zoom.ext").
        image_urls = []
        for img_node in sel.xpath('//div[contains(@class,"slider_selector") or @id="frg_thumb_list"]/ul'
                                  '/li[contains(@id,"productAngle")]//img[@src or @data-url]'):
            tmp = img_node.xpath('@data-url').extract()
            if tmp:
                image_urls.append(self.process_href(tmp[0], response.url))
            else:
                tmp = img_node.xpath('@src').extract()[0]
                a, b = os.path.splitext(tmp)
                image_urls.append(self.process_href(str.format('{0}_zoom{1}', a, b), response.url))

        #image_urls = [self.process_href(val, response.url) for val in
        #              sel.xpath('//div[contains(@class,"slider_selector") or @id="frg_thumb_list"]/ul'
        #                        '/li[contains(@id,"productAngle")]/img[@src and @data-url]/@data-url').extract()]
        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        yield item
Ejemplo n.º 42
0
    def parse_fashion(self, response):
        """Parse a Chanel fashion product page.

        Locates the embedded ``var settings`` JSON blob, then either issues a
        proxied request for the details grid (when ``detailsGridJsonUrl`` is
        present) or parses the cached section data inline via ``parse_json``.
        Yields requests/items; yields nothing on unrecognised urls or a
        malformed settings blob.
        """
        self.log(str.format('PARSE_FASHION: {0}', response.url),
                 level=log.DEBUG)

        # Determine the region from the path segment right after the host.
        # Fixed: the original called mt.group(1) without checking that the
        # regex matched at all.
        region = None
        mt = re.search(r'chanel\.com/([^/]+)/', response.url)
        if mt:
            for reg, seg in self.spider_data['base_url'].items():
                if seg == mt.group(1):
                    region = reg
                    break
        if not region:
            # Fixed: the original logged the copy-pasted, misleading
            # 'NO VAR SETTINGS' message for this unrelated failure.
            self.log(str.format('UNKNOWN REGION: {0}', response.url),
                     log.ERROR)
            return

        metadata = {
            'region': region,
            'brand_id': self.spider_data['brand_id'],
            'url': response.url,
            'tags_mapping': {}
        }

        # Product data is embedded in the page as "var settings = {...}".
        mt = re.search(r'var\s+settings', response.body)
        if not mt:
            self.log(str.format('NO VAR SETTINGS: {0}', response.url),
                     log.ERROR)
            return
        content = cm.extract_closure(response.body[mt.start():], '{', '}')[0]
        try:
            data = json.loads(content)
        except ValueError:
            self.log(
                str.format('FAILED TO LOAD VAR SETTINGS: {0}', response.url),
                log.ERROR)
            return

        # The pricing service endpoint is optional.
        try:
            metadata['pricing_service'] = data['servicesURL']['pricing']
        except KeyError:
            metadata['pricing_service'] = None

        # images
        metadata['image_urls'] = set([])
        if 'detailsGridJsonUrl' in data['sectionCache']:
            temp = data['sectionCache']['detailsGridJsonUrl']
            if re.search(r'^http://', temp):
                url = temp
            else:
                url = str.format('{0}{1}', self.spider_data['hosts'][region],
                                 temp)

            # Fixed: settings.values is subscripted by key, so a missing
            # setting raises KeyError, which `except IndexError` alone
            # could never catch.
            try:
                proxy_enabled = self.crawler.settings.values['PROXY_ENABLED']
            except (KeyError, IndexError):
                proxy_enabled = False
            yield ProxiedRequest(url=url,
                                 meta={'userdata': metadata},
                                 callback=self.parse_json_request,
                                 dont_filter=True,
                                 errback=self.onerr,
                                 proxy_region=metadata['region'],
                                 proxy_enabled=proxy_enabled)
        else:
            for val in self.parse_json(metadata, data['sectionCache']):
                yield val
Ejemplo n.º 43
0
    def parse_details(self, response):
        """Parse a Coach product detail page.

        Category tags come from the referer (fetched lazily through
        ``parse_cat`` on a cache miss); the product data itself lives in the
        inline ``var productJSONObject`` blob.  Returns a :class:`ProductItem`,
        a :class:`Request` (referer cache miss), or ``None`` on failure.
        """
        # Determine the region from the domain.
        region = None
        for tmp in self.spider_data['domains']:
            if self.spider_data['domains'][tmp] in response.url:
                region = tmp
                break
        if not region:
            return

        metadata = {'region': region, 'brand_id': self.spider_data['brand_id'], 'tags_mapping': {}, 'url': response.url}

        # Resolve category info from the referer's cached tag list; on a miss,
        # fetch the category page first and come back here.
        referer = response.request.headers['Referer']
        if referer not in self.url_cat_dict:
            return Request(url=referer, callback=self.parse_cat,
                           meta={'stash': response, 'coach-referer': referer, 'callback': self.parse_details},
                           errback=self.onerr, dont_filter=True)
        tag_list = self.url_cat_dict[referer]
        for tag in tag_list:
            metadata['tags_mapping'][tag['type']] = [{'name': tag['name'], 'title': tag['title']}]

        # Product info is embedded in "var productJSONObject = {...}".
        mt = re.search(r'var\s+productJSONObject\s*=', response.body)
        if not mt:
            return
        try:
            data = json.loads(cm.extract_closure(response.body[mt.end():], "{", "}")[0].replace(r'\"',
                                                                                                '"').replace(r"\'", "'"))
        except(TypeError, IndexError, ValueError):
            return
        if 'style' not in data:
            return
        metadata['model'] = data['style']
        if 'productName' in data:
            metadata['name'] = self.reformat(data['productName'])

        try:
            metadata['color'] = [self.reformat(swatch['color']).lower() for swatch in data['swatchGroup']['swatches']
                                 if 'color' in swatch]
        except KeyError:
            pass

        # Price: take the first swatch that carries a list price.
        # (loop variable renamed from `item`, which shadowed the ProductItem
        # assigned below)
        try:
            for swatch in data['swatchGroup']['swatches']:
                if 'listPrice' in swatch:
                    metadata['price'] = self.reformat(swatch['listPrice'])
                    if 'unitPrice' in swatch:
                        metadata['price_discount'] = self.reformat(swatch['unitPrice'])
                    break
        except KeyError:
            pass

        # Image urls.  Fixed: collect the bare image names first and format
        # them only once collection succeeded -- the original appended raw
        # names straight into image_urls, so a KeyError mid-loop returned a
        # list of unformatted names.  Missing aImages/nImages/mImages keys on
        # a swatch are now skipped instead of aborting everything.
        image_urls = []
        try:
            image_host = 'http://s7d2.scene7.com/is/image/Coach/{0}{1}'
            style_for_images = data['styleForImages']
            image_names = []
            for swatch in data['swatchGroup']['swatches']:
                for subimg in ('aImages', 'nImages', 'mImages'):
                    for img_name in [val['imageName'] for val in swatch.get(subimg, [])]:
                        if img_name not in image_names:
                            image_names.append(img_name)
            image_urls = [str.format(image_host, style_for_images, val) for val in image_names]
        except KeyError:
            pass

        item = ProductItem()
        item['image_urls'] = image_urls
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['metadata'] = metadata
        return item
Ejemplo n.º 44
0
    def parse_details(self, response):
        """Parse a product detail page and yield a :class:`ProductItem`.

        Fills ``metadata`` (name/details/model/description/price/color) via
        the ``fetch_*`` helpers, then reconstructs the image URL list from the
        page's ``og:image`` meta tag combined with the ``jsinit_item`` JSON
        blob.  Yields nothing when no model number can be extracted.
        """
        metadata = response.meta['userdata']
        metadata['url'] = response.url
        sel = Selector(response)

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        # Without a model number the item cannot be keyed; abort.
        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        # image_urls = sel.xpath('//div[@id="itemContent"]//img/@src').extract()

        # Collect image urls.  og:image provides one sample url of the form
        # <hdr>_<n>_<letter><tail>; the other angles share hdr and tail.
        hdr = None
        tail = None
        img0 = sel.xpath(
            '//meta[@property="og:image" and @content]/@content').extract()
        if img0:
            img0 = img0[0]
            mt = re.search(r'(.+)_\d+_\w(\..+)$', img0)
            if mt:
                hdr = mt.group(1)
                tail = mt.group(2)
        # The list of per-angle image stubs is a JSON array following the
        # 'ALTERNATE' marker inside the 'jsinit_item' script block.
        idx = response.body.find('jsinit_item')
        img_item = None
        if idx != -1:
            tmp = response.body[idx:]
            idx = tmp.find('ALTERNATE')
            if idx != -1:
                try:
                    img_item = json.loads(
                        cm.extract_closure(tmp[idx:], r'\[', r'\]')[0])
                except ValueError:
                    pass
        image_urls = []
        if hdr and tail and img_item:
            for item in img_item:
                mt = re.search(r'(\d+)_\w', item)
                if not mt:
                    continue
                start_idx = int(mt.group(1))
                # Enumerate candidate angle indices up to 14; this is
                # best-effort guessing, so some generated urls may not
                # actually exist on the server.
                for idx in xrange(start_idx, 15):
                    tmp = re.sub(r'\d+_(\w)', str.format(r'{0}_\1', idx), item)
                    image_urls.append(str.format('{0}_{1}{2}', hdr, tmp, tail))

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        item['image_urls'] = image_urls
        item['metadata'] = metadata
        yield item
Ejemplo n.º 45
0
def fetch_stores(data):
    print '(%s/%d) Found city: %s' % (data['brandname_e'], data['brand_id'],
                                      data['city_e'])
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # 处理重定向
    m = re.search('<h2>Object moved to <a href="(.+?)">', html)
    if m is not None:
        data['url'] = data['host'] + m.group(1)
        return fetch_countries(data)

    m = re.search('var\s+data\s*=\s*', html)
    if m is None:
        return []
    sub, start, end = cm.extract_closure(html[m.end():], r'\[', r'\]')
    if end == 0:
        return []

    store_list = []
    for s in json.loads(sub):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        name = s['Name']
        if cm.is_chinese(name):
            entry[cm.name_c] = name
        else:
            entry[cm.name_e] = name
        entry[cm.addr_e] = cm.html2plain(s['Street'])
        entry[cm.city_e] = cm.extract_city(data['city_e'])[0]
        entry[cm.country_e] = data['country_e']
        entry[cm.province_e] = data['province_e']
        pat = re.compile(ur'tel[\.: ]*', re.I)
        entry[cm.tel] = re.sub(pat, '', s['Phone']).strip()
        pat = re.compile(ur'fax[\.: ]*', re.I)
        entry[cm.fax] = re.sub(pat, '', s['Fax']).strip()
        entry[cm.email] = s['Email'].strip()
        entry[cm.url] = s['Website'].strip()
        coord = s['LatLng']
        if coord is not None and len(coord) >= 2:
            if coord[0] is not None:
                entry[cm.lat] = string.atof(coord[0])
            if coord[1] is not None:
                entry[cm.lng] = string.atof(coord[1])

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print '(%s/%d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    return store_list
Ejemplo n.º 46
0
    def fetch_price(cls, response, spider=None):
        sel = Selector(response)
        ret = {}

        response.meta['url'] = response.url

        if 'userdata' in response.meta:
            region = response.meta['userdata']['region']
        else:
            region = response.meta['region']

        region_code = '|'.join(cls.spider_data['base_url'][reg]
                               for reg in cls.get_supported_regions())
        watch_code = []
        for r in cls.get_supported_regions():
            if r in cls.spider_data['watch_term']:
                watch_code.extend(cls.spider_data['watch_term'][r])
        watch_code = '|'.join(watch_code)

        old_price = None
        new_price = None

        mt = re.search(
            unicode.format(ur'chanel\.com/({0})/({1})/.+', region_code,
                           watch_code), response.url)

        if mt:  # 对应 parse_watch
            price_url = str.format(
                'http://www-cn.chanel.com/{0}/{1}/collection_product_detail?product_id={2}&maj=price',
                cls.spider_data['base_url'][region],
                cls.spider_data['watch_term'][region][0],
                cls.fetch_model(response))

            return ProxiedRequest(url=price_url,
                                  callback=cls.fetch_price_request_watch,
                                  errback=spider.onerror,
                                  meta=response.meta,
                                  proxy_enabled=True,
                                  proxy_region=region)
        else:
            mt = re.search(
                str.format(r'chanel\.com/({0})/.+\?sku=\d+$', region_code),
                response.url)
            if mt:  # 对应 parse_sku1
                # TODO 这种类型url找不到原来取价格的代码
                pass
            else:
                mt = re.search(
                    str.format(r'chanel\.com/({0})/.+/sku/\d+$', region_code),
                    response.url)
                if mt:  # 对应 parse_sku2
                    temp = sel.xpath(
                        '//div[contains(@class, "product_detail_container")]')
                    if len(temp) > 0:
                        product_name = temp[0]
                        temp = product_name.xpath(
                            './/h3[@class="product_price"]')
                        if len(temp) > 0:
                            old_price = unicodify(temp[0]._root.text)
                else:
                    mt = re.search(
                        str.format(r'chanel\.com/({0})/.+(?<=/)s\.[^/]+\.html',
                                   region_code), response.url)
                    if mt:
                        mt = re.search(r'var\s+settings', response.body)
                        content = cm.extract_closure(
                            response.body[mt.start():], '{', '}')[0]
                        try:
                            data = json.loads(content)
                            if 'detailsGridJsonUrl' in data['sectionCache']:
                                temp = data['sectionCache'][
                                    'detailsGridJsonUrl']
                                if re.search(r'^http://', temp):
                                    url = temp
                                else:
                                    url = str.format(
                                        '{0}{1}',
                                        cls.spider_data['hosts'][region], temp)
                                return ProxiedRequest(
                                    url=url,
                                    meta=response.meta,
                                    callback=cls.
                                    fetch_price_request_fashion_json,
                                    proxy_enabled=True,
                                    proxy_region=region,
                                    dont_filter=True,
                                    errback=spider.onerror)
                            else:
                                return cls.fetch_price_request_fashion(
                                    response.meta, data['sectionCache'],
                                    spider)
                        except (KeyError, TypeError, IndexError):
                            pass
                    else:
                        pass

        if old_price:
            ret['price'] = old_price
        if new_price:
            ret['price_discount'] = new_price

        return ret
Ejemplo n.º 47
0
    def parse_details(self, response):
        """Parse a configurable-product detail page.

        Reads the ``spConfig.init`` / ``ConfProduct.init`` JSON blob embedded
        in the page and, for every product id it lists, yields one
        :class:`ProductItem` built by the local ``func`` closure.
        """
        def func(product_id):
            # Build one item for a single simple-product id, using the
            # enclosing `metadata` (deep-copied per item) and the page-level
            # `data` blob.
            m = copy.deepcopy(metadata)

            # if product_id in data['simpleProductPrices']:
            #     m['price'] = data['simpleProductPrices'][product_id]

            image_url = data['baseImages'][product_id]
            # Try to upgrade to the zoom-size image when the page references
            # it (either verbatim or with escaped slashes inside a script).
            zoom_image_url = re.sub(r'/default/([^/]+)$', r'/zoom/\1',
                                    image_url)
            if zoom_image_url in unicodify(response.body):
                image_url = zoom_image_url
            elif zoom_image_url.replace('/',
                                        r'\/') in unicodify(response.body):
                image_url = zoom_image_url

            # m['description'] = self.reformat(data['descriptions'][product_id])
            # m['name'] = self.reformat(data['names'][product_id])
            # m['model'] = data['skus'][product_id]
            # # TODO this may pick the wrong page url, e.g.: http://usa.hermes.com/jewelry/gold-jewelry/bracelets/configurable-product-104820b-23578.html
            # if product_id in data['links']:
            #     m['url'] = data['links'][product_id]
            # else:
            #     m['url'] = response.url
            #
            # Map every non-color attribute of this product id into
            # tags_mapping (color is handled by fetch_color below).
            for attrib in data['attributes']:
                attrib_name = attrib['code']
                #     if re.search(r'color[\b_]', attrib_name):
                #         attrib_name = 'color'
                #     elif re.search('size_sized', attrib_name):
                #         attrib_name = 'size'

                temp = [
                    unicodify(val['label']).lower()
                    for val in attrib['options']
                    if product_id in val['products']
                ]
                # if attrib_name == 'color':
                #     m['color'] = temp
                # else:
                #     m['tags_mapping'][unicodify(attrib_name).lower()] = \
                #         [{'name': val.lower(), 'title': val} for val in temp]
                if attrib_name != 'color':
                    m['tags_mapping'][unicodify(attrib_name).lower()] = \
                        [{'name': val.lower(), 'title': val} for val in temp]

            # if 'category-1' in m['tags_mapping']:
            #     m['category'] = [val['name'] for val in m['tags_mapping']['category-1']]

            item = ProductItem()
            item['image_urls'] = [image_url]
            item['url'] = m['url']
            item['model'] = m['model']
            item['metadata'] = m
            return item

        metadata = response.meta['userdata']

        metadata['url'] = response.url

        # Without a model number the item cannot be keyed; abort.
        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        # The configurable-product blob follows either marker; abort when
        # neither appears on the page.
        idx = response.body.find('spConfig.init')
        if idx == -1:
            idx = response.body.find('ConfProduct.init')
            if idx == -1:
                return

        body = cm.extract_closure(response.body[idx:], '{', '}')[0]
        data = json.loads(body)
        for val in (func(product_id) for product_id in data['productIds']):
            yield val
Ejemplo n.º 48
0
    def parse_product(self, response):
        """Parse a product detail page and yield a :class:`ProductItem`.

        Image urls are recovered from every inline ``xlarge: [...]`` JSON
        array in the page scripts; those arrays cover all colour variants, so
        the per-colour pages are deliberately not crawled.
        """
        metadata = response.meta['userdata']

        metadata['url'] = response.url

        # Without a model number the item cannot be keyed; abort.
        model = self.fetch_model(response)
        if model:
            metadata['model'] = model
        else:
            return

        ret = self.fetch_price(response)
        if 'price' in ret:
            metadata['price'] = ret['price']
        if 'price_discount' in ret:
            metadata['price_discount'] = ret['price_discount']

        name = self.fetch_name(response)
        if name:
            metadata['name'] = name

        colors = self.fetch_color(response)
        if colors:
            metadata['color'] = colors

        description = self.fetch_description(response)
        if description:
            metadata['description'] = description

        detail = self.fetch_details(response)
        if detail:
            metadata['details'] = detail

        # Scan the body for every "xlarge: [...]" array and collect the
        # image urls found inside each one.
        image_urls = []
        try:
            body = response.body
            pos = 0
            while True:
                mt = re.search(r'xlarge:', body[pos:])
                if not mt:
                    break
                # Fixed: mt.start() is relative to the slice body[pos:]; the
                # original indexed the full body with it (and then advanced
                # with an offset relative to the closure slice), extracting
                # from wrong positions after the first iteration.
                closure_start = pos + mt.start()
                content, _, end = common.extract_closure(body[closure_start:],
                                                         r'\[', r'\]')
                if end == 0:
                    break
                # Resume the scan right after the closure just consumed.
                pos = closure_start + end
                for url in re.findall(r'"url":.*\'(.+)\?.*\'', content):
                    image_urls.append(self.process_href(url, response.url))
        except (TypeError, IndexError):
            pass

        item = ProductItem()
        item['url'] = metadata['url']
        item['model'] = metadata['model']
        if image_urls:
            item['image_urls'] = image_urls
        item['metadata'] = metadata

        yield item