Esempio n. 1
0
def fetch_stores(db, data, logger):
    """
    Fetch the stores of one city and insert them into the database.

    Calls ``cm.post_data`` on ``data['post_url']`` for the city
    ``data['city_id']``, treats every ``a[href]`` element of the returned
    HTML as one store and inserts each resulting entry into
    ``spider_stores.stores``.

    :param db: database handle, passed through to ``cm.insert_record``.
    :param data: dict providing ``post_url``, ``city_id``, ``brand_id``,
        ``brandname_e``, ``brandname_c``, ``country_e`` and ``city_e``.
    :param logger: logger used for progress and error messages.
    :return: list of the store entries found; empty when the request fails
        or yields an empty body.
    """
    url = data['post_url']
    try:
        html = cm.post_data(url, {
            'pid': data['city_id'],
            'lang': 'en',
            'action': 'popola_box_DX'
        })
        if html.strip() == u'':
            logger.error(
                unicode.format(u'Failed to fetch stores for city {0}',
                               data['city_id']))
            return []
        body = pq(html)
    except Exception:
        # Report through the logger (not stdout) like the other failure
        # path, and keep the dump record for later inspection.
        logger.error('Error occurred in fetching stores: %s' % url)
        dump_data = {
            'level': 2,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    store_list = []
    for item in (pq(temp) for temp in body('a[href]')):
        title = item('h3.titleShop')
        if len(title) == 0 or title[0].text is None:
            # A link without a shop title cannot be turned into a store;
            # skip it instead of aborting the whole city.
            logger.warning('Skipping link without store title: %s' %
                           item[0].attrib['href'])
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.url] = item[0].attrib['href']
        entry[cm.name_e] = title[0].text.strip()

        terms = cm.reformat_addr(
            unicode(item('div.txtBoxSingleStore p.lineHeight14'))).split(',')
        # If the last comma-separated term yields a telephone number, store
        # it separately and drop that term from the address.
        tel = cm.extract_tel(terms[-1])
        if tel != '':
            terms = terms[:-1]
            entry[cm.tel] = tel
        entry[cm.addr_e] = u', '.join([v.strip() for v in terms])
        entry['country_e'] = data['country_e']
        entry['city_e'] = data['city_e']
        gs.field_sense(entry)

        logger.info(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]))
        store_list.append(entry)
        cm.insert_record(db, entry, 'spider_stores.stores')

    return store_list
Esempio n. 2
0
    def func(s):
        """
        Turn one raw store record into a store entry and insert it.

        Uses ``data``, ``db`` and ``logger`` from the enclosing scope.
        Records whose id is already in ``data['store_list']`` are skipped,
        and so are records for which no country name can be determined.

        :param s: mapping describing one store (``id``, ``latitude``,
            ``longitude``, ``translations``, ...).
        """
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.native_id] = int(s['id'])
        if entry[cm.native_id] in data['store_list']:
            return

        entry[cm.lat] = float(s['latitude'])
        entry[cm.lng] = float(s['longitude'])
        entry[cm.email] = s['email']
        entry[cm.fax] = s['fax']
        entry[cm.store_class] = ' | '.join((str.format('ISCHANEL:{0}', 'YES' if s['ischanel'] != 0 else 'NO'),
                                            s['postypename']))
        try:
            entry[cm.hours] = ' | '.join(
                map(lambda val: ':'.join((val['day'], val['opening'] if 'opening' in val else '')), s['openinghours']))
        except TypeError:
            # Opening hours are optional: leave the field at its default
            # when the record carries no usable list.
            pass
        entry[cm.tel] = s['phone']

        trans = s['translations'][0]
        entry[cm.addr_e] = cm.html2plain(
            ', '.join(filter(lambda val: val, (trans[key] for key in ('address1', 'address2')))))
        # Convert to plain text first and normalise case afterwards (same
        # order as the province below); upper-casing the raw text would also
        # upper-case any HTML entity names before they are converted.
        entry[cm.city_e] = cm.html2plain(trans['cityname']).strip().upper()
        entry[cm.name_e] = cm.html2plain(trans['name'])
        entry[cm.province_e] = cm.html2plain(trans['statename']).strip().upper()
        entry[cm.store_type] = ', '.join(temp['name'] for temp in trans['products'])
        entry[cm.url] = s['website']
        entry[cm.zip_code] = s['zipcode']

        country_id = s['country_id'] if 'country_id' in s else None
        if country_id and country_id in data['country_map']:
            entry[cm.country_e] = data['country_map'][country_id]
        else:
            # Unknown country id: ask the geocoder, using the coordinates.
            ret = gs.geocode2(latlng=str.format('{0},{1}', entry[cm.lat], entry[cm.lng]), logger=logger)
            country_e = None
            if len(ret) > 0:
                for item in ret[0]['address_components']:
                    if 'country' in item['types']:
                        country_e = item['long_name'].strip().upper()
                        break
            if not country_e:
                # Last resort: prompt the operator (blocks on stdin).
                country_e = raw_input(unicode.format(u'INPUT THE COUNTRY NAME FOR {0} AT {1}, {2}',
                                                     entry[cm.city_e], entry[cm.lat], entry[cm.lng])).decode('utf-8')
            if not country_e:
                # The country name cannot be determined; give up on this record.
                return
            entry[cm.country_e] = country_e
            if country_id:
                # Remember the answer so later stores with this id skip the lookup.
                data['country_map'][country_id] = country_e

        logger.info(('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                 entry[cm.name_e], entry[cm.addr_e],
                                                                 entry[cm.country_e],
                                                                 entry[cm.continent_e])))
        cm.insert_record(db, entry, data['table'])

        data['store_list'].add(entry[cm.native_id])
Esempio n. 3
0
    def func(item):
        """
        Build a store entry from one store element and insert it.

        Uses ``data``, ``db``, ``logger`` and ``url`` from the enclosing
        scope. The entry's native id is an MD5 fingerprint of ``url`` plus
        either the map link or, when there is none, the address.

        :param item: selection holding one store; its ``h6`` is read as the
            name, its first ``p`` as the address block and an optional
            ``a.track_map[href]`` as the map link.
        :return: the inserted entry, or ``None`` when the fingerprint is
            already in ``data['store_list']``.
        """
        def to_bytes(text):
            # md5.update() needs a byte string; unicode text with non-ASCII
            # characters would otherwise fail in the implicit ASCII encoding.
            # ASCII-only input encodes to the same bytes, so fingerprints
            # computed before this change stay valid.
            return text.encode('utf-8') if isinstance(text, unicode) else text

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.name_e] = cm.html2plain(item('h6')[0].text).strip()
        addr_sub = unicode(pq(item('p')[0]))
        addr_list = [
            term.strip() for term in cm.reformat_addr(addr_sub).split(',')
        ]
        # If the last term yields a telephone number, keep it out of the address.
        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)

        temp = item('a.track_map[href]')
        m = hashlib.md5()
        m.update(to_bytes(url))
        if len(temp) > 0:
            map_ref = temp[0].attrib['href']
            m.update(to_bytes(map_ref))
            # Keep the map link's ``q=`` parameter as a geocoding query.
            m_query = re.search(r'q=([^;]+?)&', cm.html2plain(map_ref))
            if m_query:
                query_parm = m_query.group(1).replace('+', ' ')
                entry['geo_query_param'] = query_parm

        else:
            m.update(to_bytes(entry[cm.addr_e]))
        fingerprint = m.hexdigest()
        entry[cm.native_id] = fingerprint
        # NOTE(review): the fingerprint is not added to data['store_list']
        # here; presumably the caller does that with the returned entry.
        if entry[cm.native_id] in data['store_list']:
            return

        entry[cm.country_e] = data['country']
        gs.field_sense(entry)
        # Fill only the fields that are still empty from the address guess.
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        logger.info(
            ('(%s / %d) Found store: %s, %s (%s, %s)' %
             (data['brandname_e'], data['brand_id'], entry[cm.name_e],
              entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])))
        cm.insert_record(db, entry, data['table'])
        return entry
Esempio n. 4
0
def fetch_stores(db, data, logger):
    """
    Collect every store from the Paul & Joe store list page.

    The page is read as a list of country links (``#store_list>li>a``), each
    paired by position with a ``ul`` of stores. Every store is inserted into
    ``spider_stores.stores``.

    :param db: database handle, passed through to ``cm.insert_record``.
    :param data: dict providing ``brand_id``, ``brandname_e`` and
        ``brandname_c``.
    :param logger: logger used for progress messages.
    :return: tuple of all store entries found.
    """
    page = pq(url='http://www.paulandjoe.com/en/ozcms/stores/list/?country_id=&postcode=')

    found = []

    # Country headers and their store lists are matched by position.
    country_links = page('#store_list>li>a')
    country_blocks = page('#store_list>li>ul')
    assert (len(country_links) == len(country_blocks))
    for c_idx, country_link in enumerate(country_links):
        country = country_link.text.strip().upper()
        block = country_blocks[c_idx]
        markers = pq(block)('a.marker-store')
        infos = pq(block)('span.store-infos')
        assert (len(markers) == len(infos))
        for s_idx, marker in enumerate(markers):
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

            # Empty coordinate attributes mean "unknown".
            raw_lat = marker.attrib['data-latitude']
            lat = None
            if raw_lat:
                lat = float(raw_lat)
            raw_lng = marker.attrib['data-longitude']
            lng = None
            if raw_lng:
                lng = float(raw_lng)
            if lat and lng:
                entry[cm.lat] = lat
                entry[cm.lng] = lng

            entry[cm.name_e] = marker.text
            info_html = str(pq(infos[s_idx]))
            entry[cm.addr_e] = cm.reformat_addr(info_html)
            entry[cm.country_e] = country

            gs.field_sense(entry)
            # Fill only the fields that are still empty from the address guess.
            ret = gs.addr_sense(entry[cm.addr_e])
            for pos, field in enumerate((cm.country_e, cm.province_e, cm.city_e)):
                if ret[pos] is not None and entry[field] == '':
                    entry[field] = ret[pos]
            gs.field_sense(entry)

            message = '(%s/%d) Found store: %s, %s (%s, %s)' % (
                data['brandname_e'], data['brand_id'], entry[cm.name_e],
                entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
            logger.info(message)
            found.append(entry)
            cm.insert_record(db, entry, 'spider_stores.stores')

    return tuple(found)
Esempio n. 5
0
def fetch_stores(db, data, logger):
    """
    Extract all stores from the ``var markers = [...]`` JSON array of a page.

    The page at ``data['url']`` is fetched with ``cm.get_data``; the array
    literal assigned to ``markers`` is cut out and parsed as JSON. Every
    element becomes one store entry inserted into ``spider_stores.stores``.

    :param db: database handle, passed through to ``cm.insert_record``.
    :param data: dict providing ``brand_id``, ``brandname_e``,
        ``brandname_c`` and ``url``.
    :param logger: logger used for progress and error messages.
    :return: tuple of the store entries found; empty when the markers array
        cannot be located.
    """
    brand_id, brand_name, url = (data[key]
                                 for key in ('brand_id', 'brandname_c', 'url'))

    body = cm.get_data(url)

    m1 = re.search(r'var\s+markers\s*=\s*\[', body)
    if not m1:
        logger.error(
            unicode.format(u'Error in finding stores for {0}:{1}', brand_id,
                           brand_name))
        return ()

    # Keep the opening '[' and cut right after the first closing ']'.
    body = body[m1.end() - 1:]
    m2 = re.search(r'\]\s*;', body)
    if not m2:
        logger.error(
            unicode.format(u'Error in finding stores for {0}:{1}', brand_id,
                           brand_name))
        return ()
    raw = json.loads(body[:m2.end() - 1])

    store_list = []
    for s in raw:
        # The second argument is the English brand name, as in every other
        # spider; it used to receive the Chinese name by mistake.
        entry = cm.init_store_entry(brand_id, data['brandname_e'],
                                    data['brandname_c'])
        try:
            entry[cm.lat], entry[cm.lng] = (float(s['location'][idx])
                                            for idx in (0, 1))
        except (KeyError, IndexError, ValueError, TypeError):
            pass

        s = s['content']
        try:
            entry[cm.name_e] = cm.html2plain(s['title']).strip()
        except (KeyError, TypeError):
            pass

        # The label is read as "<country>-<city>"; without it the store has
        # no usable location, so skip the record instead of aborting the run.
        try:
            tmp_list = s['analytics_label'].split('-')
            entry[cm.country_e] = tmp_list[0]
            entry[cm.city_e] = cm.extract_city(tmp_list[1])[0]
        except (KeyError, IndexError, TypeError, AttributeError):
            logger.error(
                unicode.format(u'Malformed analytics label for {0}:{1}',
                               brand_id, brand_name))
            continue

        try:
            entry[cm.addr_e] = cm.reformat_addr(s['address']).strip()
        except (KeyError, TypeError, AttributeError):
            pass

        # Optional plain-text fields; a JSON null (None) is treated like a
        # missing key.
        for field, key in ((cm.fax, 'fax'), (cm.tel, 'phone'),
                           (cm.email, 'mail')):
            try:
                entry[field] = s[key].strip()
            except (KeyError, TypeError, AttributeError):
                pass
        try:
            entry[cm.url] = (u'http://en.longchamp.com/store/map' +
                             s['url'].strip())
        except (KeyError, TypeError, AttributeError):
            pass
        try:
            # 'zipcode_town' holds zip code and town together; remove the town.
            entry[cm.zip_code] = cm.html2plain(s['zipcode_town']).replace(
                tmp_list[1], '').strip()
        except (KeyError, TypeError, AttributeError):
            pass

        gs.field_sense(entry)
        # Fill only the fields that are still empty from the address guess.
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        logger.info(
            unicode.format(
                u'{0}:{1} FOUND STORE: {2}, {3}, ({4}, {5}, {6})',
                data['brand_id'], data['brandname_e'],
                *(entry[key] for key in (cm.name_e, cm.addr_e, cm.city_e,
                                         cm.country_e, cm.continent_e))))

        cm.insert_record(db, entry, 'spider_stores.stores')
        store_list.append(entry)

    return tuple(store_list)
Esempio n. 6
0
                entry[cm.fax] = re.sub(pat_fax, '', term)
            elif re.search(pat_email, term):
                entry[cm.email] = re.sub(pat_email, '', term)

    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    logger.info('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]))
    cm.insert_record(db, entry, data['table'])

    return ()


def fetch_cities(db, data, logger):
    url = data['url']
    param = {'IsFooterForm': 'true', 'CurrentCountryID': data['country_code']}
    if data['state_code']:
        param['CurrentRegionID'] = data['state_code']
    try:
        body, data['cookie'] = cm.get_data_cookie(url, param, cookie=data['cookie'])
        q = pq(body)
    except Exception, e:
        # cm.dump('Error in fetching cities: %s, %s' % (url, param), log_name)
        return ()
Esempio n. 7
0
            entry[cm.hours] = ', '.join(hours_list)

            gs.field_sense(entry)
            if entry[cm.addr_e]:
                ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)

            logger.info('(%s / %d) Found store: %s, %s (%s, %s, %s)' %
                        (data['brandname_e'], data['brand_id'],
                         entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e],
                         entry[cm.country_e], entry[cm.continent_e]))
            cm.insert_record(db, entry, 'spider_stores.stores')
            store_list.append(entry)

        except (IndexError, TypeError) as e:
            logger.error(traceback.format_exc())
            continue

    return tuple(store_list)


def fetch(db, data=None, user='******', passwd=''):
    logging.config.fileConfig('ca.cfg')
    logger = logging.getLogger('firenzeLogger')
    logger.info(u'ca STARTED')

    def func(data, level):
Esempio n. 8
0
def fetch_store_details(db, data, logger):
    """
    Fetch one store page, build its entry and insert it.

    Store details are read from the JSON held in the ``data-shop`` attribute
    of the page's ``div.gmap_info_box`` element (its ``table`` member).

    :param db: database handle, passed through to ``cm.insert_record``.
    :param data: dict providing ``store_url``, ``brand_id``,
        ``brandname_e``, ``brandname_c``, ``city`` and ``country``.
    :param logger: logger used for progress and error messages.
    :return: one-element list with the inserted entry, or ``[]`` when the
        page cannot be fetched or carries no usable store data.
    """
    url = data['store_url']
    try:
        body = pq(cm.get_data(url))
    except Exception:
        logger.error('Error in fetching store details: %s' % url)
        return []

    try:
        raw = json.loads(
            body('div.gmap_info_box')[0].attrib['data-shop'])['table']
    except (IndexError, KeyError, ValueError, TypeError):
        # No info box, no data-shop attribute or unparsable JSON.
        logger.error('Error in fetching store details: %s' % url)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])

    entry[cm.store_type] = ', '.join(temp.text.strip()
                                     for temp in body('li.availability li')
                                     if temp.text)

    entry[cm.name_e] = cm.html2plain(raw['name'])
    entry[cm.city_e] = data['city'].strip().upper()
    entry[cm.country_e] = data['country'].strip().upper()
    entry[cm.addr_e] = cm.reformat_addr(raw['address'])

    # The 'phone' field may hold "phone: ... fax: ..." in one string. Each
    # value runs up to the other label or to the end of the text. A trailing
    # lazy group such as 'fax:(.*?)' would always capture the empty string.
    phone_text = raw['phone'] or ''
    m = re.search(r'phone:(.*?)(?=fax:|$)', phone_text, re.I | re.S)
    if m is not None:
        entry[cm.tel] = m.group(1).strip()
    m = re.search(r'fax:(.*?)(?=phone:|$)', phone_text, re.I | re.S)
    if m is not None:
        entry[cm.fax] = m.group(1).strip()

    entry[cm.hours] = raw['hours']
    if raw['lat'] is not None and raw['lat'] != '':
        entry[cm.lat] = float(raw['lat'])
    if raw['lng'] is not None and raw['lng'] != '':
        entry[cm.lng] = float(raw['lng'])
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None:
        entry[cm.province_e] = ret[1]
        gs.field_sense(entry)

    logger.info('(%s / %d) Found store: %s, %s (%s, %s)' %
                (data['brandname_e'], data['brand_id'], entry[cm.name_e],
                 entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]))
    cm.insert_record(db, entry, 'spider_stores.stores')
    return [entry]