예제 #1
0
파일: mido.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    """
    Parse the store list embedded in ``data['html']``.

    Each store is located by the ``<li class="leaf end"><div><u>NAME</u>``
    marker. The name, an optional http link, an optional mailto address and
    the phone / fax / zip code terms of the address text are copied into a
    fresh entry created by ``cm.init_store_entry``.

    NOTE(review): this excerpt stops inside the per-term loop, so the final
    use of ``new_terms`` and ``store_list`` is not visible here.

    :param data: mapping read for the keys ``html``, ``brand_id``,
        ``brandname_e``, ``brandname_c`` and ``city_e``.
    """
    html = data['html']

    store_list = []
    while True:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        # Locate the next store marker; the loop ends when none is left.
        m = re.search(ur'<li class="leaf end"><div><u>(.+?)</u>', html)
        if m is None:
            break
        # Drop everything before the marker so the closure starts at it.
        html = html[m.start():]
        entry[cm.name_e] = m.group(1)

        # presumably `sub` is the whole <li>...</li> element and `end` the
        # offset just past it -- confirm against cm.extract_closure
        sub, start, end = cm.extract_closure(html, ur'<li\b', '</li>')
        html = html[end:]

        # Markup of a single store: cut len(marker) characters from the front
        # and len('</li>') characters from the back.
        sub = sub[len(m.group(0)):-len('</li>')]

        m = re.search(ur'<a href="(http.+?)"', sub)
        if m is not None:
            entry[cm.url] = m.group(1)
        m = re.search(ur'<a href="mailto:(.+?)"', sub)
        if m is not None:
            entry[cm.email] = m.group(1)
        # The address is the text before the first <a ...> tag or </div>.
        m = re.search(ur'(?:<a\b|</div>)', sub)
        if m is not None:
            addr = sub[:m.start()]
        else:
            addr = sub
        # Parse the address field.
        addr = cm.reformat_addr(addr)
        terms = addr.split(',')
        new_terms = []
        for t in terms:
            if re.search(ur'phone', t, re.IGNORECASE) is not None:
                entry[cm.tel] = cm.extract_tel(t)
            elif re.search(ur'fax', t, re.IGNORECASE) is not None:
                entry[cm.fax] = cm.extract_tel(t)
            elif data['city_e'] in t.strip().upper():
                # Zip code: first run of digits in the term that names the city.
                m = re.search(ur'\d+', t)
                if m is not None:
                    entry[cm.zip_code] = m.group(0)
예제 #2
0
파일: kipling.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    """
    Download the stores of one country/city pair and save each of them.

    The response body is decoded as JSON and every element of its ``items``
    list becomes one entry: the address is cleaned with ``cm.reformat_addr``,
    a trailing phone term is moved to the tel field, and empty
    country/province/city fields are filled from ``gs.addr_sense``.

    :param data: mapping read for ``store_url``, ``country``, ``city``,
        ``brand_id``, ``brandname_e`` and ``brandname_c``.
    :return: list of the entries inserted into the ``stores`` table, or
        ``[]`` when the download raises.
    """
    url = data['store_url']
    try:
        body = cm.get_data(url, {'country': data['country'], 'city': data['city']})
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({
            'level': 0,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id'],
        })
        return []

    stores = []
    for item in json.loads(body)['items']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.country_e] = data['country'].strip().upper()

        # The requested "city" is stored as the province for USA entries and
        # as the city for every other country.
        region = cm.extract_city(data['city'])[0]
        region_key = cm.province_e if entry[cm.country_e] == 'USA' else cm.city_e
        entry[region_key] = region
        gs.field_sense(entry)

        # Split the address into trimmed terms; a last term that yields a
        # phone number is removed from the address.
        address = cm.reformat_addr(item['address'].replace(u'\\', ''))
        parts = [part.strip() for part in address.split(',')]
        phone = cm.extract_tel(parts[-1])
        if phone != '':
            entry[cm.tel] = phone
            parts.pop()
        entry[cm.addr_e] = ', '.join(parts)
        entry[cm.store_type] = item['shop_type']

        gs.field_sense(entry)
        # Only fields that are still empty are taken from the sensed address.
        sensed = gs.addr_sense(entry[cm.addr_e])
        for idx, key in enumerate((cm.country_e, cm.province_e, cm.city_e)):
            if sensed[idx] is not None and entry[key] == '':
                entry[key] = sensed[idx]
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        db.insert_record(entry, 'stores')
        stores.append(entry)

    return stores
예제 #3
0
파일: dunhill.py 프로젝트: haizi-zh/firenze
    def get_detailed_store(html, store_cat):
        store_list = []
        start = 0
        while True:
            sub_html, start, end = common.extract_closure(html, ur"<li\b", ur"</li>")
            if end == 0:
                break

            # 得到单个门店的页面代码
            html = html[end:]
            entry = common.init_store_entry(brand_id, brandname_e, brandname_c)

            m = re.findall(ur'<div class="store-title -h3a">(.+?)</div>', sub_html)
            if len(m) > 0:
                entry[common.name_e] = common.reformat_addr(m[0])
            m = re.findall(ur'<div class="store-address">(.+?)</div>', sub_html, re.S)
            if len(m) > 0:
                addr = common.reformat_addr(m[0])
                # 最后一行是否为电话号码?
                terms = addr.split(", ")
                tel = common.extract_tel(terms[-1])
                if tel != "":
                    addr = ", ".join(terms[:-1])
                    entry[common.tel] = tel
                entry[common.addr_e] = addr

            # 获得门店类型
            # store_type = [store_cat]
            type_html, type_start, type_end = common.extract_closure(sub_html, ur'<ul class="service-list">', ur"</ul>")
            if type_end != 0:
                store_type = [m for m in re.findall(ur'<li class="service-item">(.+?)</li>', type_html)]
                store_type.insert(0, store_cat)
                entry[common.store_type] = ", ".join(store_type)
            else:
                entry[common.store_type] = store_cat

            # 获得经纬度
            m = re.findall(ur'data-latitude="(-?\d+\.\d+)"', sub_html)
            if len(m) > 0:
                entry[common.lat] = string.atof(m[0])
            m = re.findall(ur'data-longitude="(-?\d+\.\d+)"', sub_html)
            if len(m) > 0:
                entry[common.lng] = string.atof(m[0])

            entry[common.city_e] = common.extract_city(data[common.city_e])[0]
            entry[common.country_e] = common.reformat_addr(data[common.country_e]).strip().upper()
            gs.field_sense(entry)

            print "%s: Found store: %s, %s (%s, %s, %s)" % (
                brandname_e,
                entry[common.name_e],
                entry[common.addr_e],
                entry[common.city_e],
                entry[common.country_e],
                entry[common.continent_e],
            )
            db.insert_record(entry, "stores")
            store_list.append(entry)

        return store_list
예제 #4
0
파일: mido.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    """
    Parse the store list embedded in ``data['html']``.

    Each store is located by the ``<li class="leaf end"><div><u>NAME</u>``
    marker. The name, an optional http link, an optional mailto address and
    the phone / fax / zip code terms of the address text are copied into a
    fresh entry created by ``cm.init_store_entry``.

    NOTE(review): this excerpt stops inside the per-term loop, so the final
    use of ``new_terms`` and ``store_list`` is not visible here.

    :param data: mapping read for the keys ``html``, ``brand_id``,
        ``brandname_e``, ``brandname_c`` and ``city_e``.
    """
    html = data['html']

    store_list = []
    while True:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        # Locate the next store marker; the loop ends when none is left.
        m = re.search(ur'<li class="leaf end"><div><u>(.+?)</u>', html)
        if m is None:
            break
        # Drop everything before the marker so the closure starts at it.
        html = html[m.start():]
        entry[cm.name_e] = m.group(1)

        # presumably `sub` is the whole <li>...</li> element and `end` the
        # offset just past it -- confirm against cm.extract_closure
        sub, start, end = cm.extract_closure(html, ur'<li\b', '</li>')
        html = html[end:]

        # Markup of a single store: cut len(marker) characters from the front
        # and len('</li>') characters from the back.
        sub = sub[len(m.group(0)):-len('</li>')]

        m = re.search(ur'<a href="(http.+?)"', sub)
        if m is not None:
            entry[cm.url] = m.group(1)
        m = re.search(ur'<a href="mailto:(.+?)"', sub)
        if m is not None:
            entry[cm.email] = m.group(1)
        # The address is the text before the first <a ...> tag or </div>.
        m = re.search(ur'(?:<a\b|</div>)', sub)
        if m is not None:
            addr = sub[:m.start()]
        else:
            addr = sub
        # Parse the address field.
        addr = cm.reformat_addr(addr)
        terms = addr.split(',')
        new_terms = []
        for t in terms:
            if re.search(ur'phone', t, re.IGNORECASE) is not None:
                entry[cm.tel] = cm.extract_tel(t)
            elif re.search(ur'fax', t, re.IGNORECASE) is not None:
                entry[cm.fax] = cm.extract_tel(t)
            elif data['city_e'] in t.strip().upper():
                # Zip code: first run of digits in the term that names the city.
                m = re.search(ur'\d+', t)
                if m is not None:
                    entry[cm.zip_code] = m.group(0)
예제 #5
0
def get_store_details(data):
    """
    Download one store page and turn it into a stored entry.

    The address is the last ``<li>`` of the breadcrumb list, the coordinates
    come from the ``google.maps.LatLng(...)`` call in the page, and phone and
    opening hours come from the ``contact right`` block. Empty province/city
    fields are filled from ``gs.addr_sense``.

    :param data: mapping read for ``url``, ``name``, ``country`` and ``city``.
    :return: the entry inserted into the ``stores`` table; ``[]`` when the
        download raises or the breadcrumb list cannot be extracted.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 1, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    entry[cm.name_e] = data['name']
    entry[cm.url] = data['url']
    start = html.find(ur'<div class="storelocator-breadcrumbs">')
    if start == -1:
        return []
    sub, start, end = cm.extract_closure(html[start:], ur'<ul>', ur'</ul>')
    if end == 0:
        return []

    # The last <li>...</li> of the breadcrumb list holds the address.
    crumbs = re.findall(ur'<li>(.+?)</li>', sub, re.S)
    if crumbs:
        entry[cm.addr_e] = cm.reformat_addr(crumbs[-1])

    # Latitude / longitude; float() replaces the deprecated string.atof.
    m = re.search(ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)', html)
    if m is not None:
        cm.update_entry(entry, {cm.lat: float(m.group(1)), cm.lng: float(m.group(2))})

    m = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S)
    if m is not None:
        contact_sub = m.group(1)
        pat_tel = re.compile(ur'<p class="phone">(.+?)</p>')
        m1 = pat_tel.search(contact_sub)
        if m1:
            entry[cm.tel] = cm.extract_tel(m1.group(1))
            # Remove the phone paragraph so only the hours text remains.
            contact_sub = pat_tel.sub('', contact_sub)
        hours_list = [tmp.strip() for tmp in cm.reformat_addr(contact_sub).split(',')]
        # Drop a leading "opening hours" heading term.
        if 'opening hours' in hours_list[0].lower():
            del hours_list[0]
        entry[cm.hours] = ', '.join(hours_list)

    # Geo
    country = data['country']
    city = data['city']
    cm.update_entry(entry, {cm.country_e: country, cm.city_e: city})
    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
        entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return entry
예제 #6
0
파일: sergio.py 프로젝트: haizi-zh/ofashion
def fetch_stores(db, data, logger):
    """
    Fetch the stores of one city via POST and save them through *db*.

    :param db: handle passed on to ``cm.insert_record``.
    :param data: mapping read for ``post_url``, ``city_id``, ``brand_id``,
        ``brandname_e``, ``brandname_c``, ``country_e`` and ``city_e``.
    :param logger: receives an error for an empty response and one info line
        per store.
    :return: list of stored entries; ``[]`` when the response is empty or
        the request/parse step raises.
    """
    url = data['post_url']
    try:
        form = {'pid': data['city_id'], 'lang': 'en', 'action': 'popola_box_DX'}
        html = cm.post_data(url, form)
        if html.strip() == u'':
            message = unicode.format(u'Failed to fetch stores for city {0}',
                                     data['city_id'])
            logger.error(message)
            return []
        body = pq(html)
    except Exception:
        print 'Error occured in getting city list: %s' % url
        cm.dump({
            'level': 2,
            'time': cm.format_time(),
            'data': {'url': url},
            'brand_id': data['brand_id'],
        })
        return []

    stores = []
    # Every anchor that carries an href is treated as one store.
    for anchor in body('a[href]'):
        item = pq(anchor)
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.url] = item[0].attrib['href']
        entry[cm.name_e] = item('h3.titleShop')[0].text.strip()

        # The serialized address paragraph is cleaned and split on commas; a
        # last term that yields a phone number is moved to the tel field.
        addr_markup = unicode(item('div.txtBoxSingleStore p.lineHeight14'))
        terms = cm.reformat_addr(addr_markup).split(',')
        tel = cm.extract_tel(terms[-1])
        if tel != '':
            entry[cm.tel] = tel
            terms = terms[:-1]
        entry[cm.addr_e] = u', '.join(v.strip() for v in terms)
        entry['country_e'] = data['country_e']
        entry['city_e'] = data['city_e']
        gs.field_sense(entry)

        summary = '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        logger.info(summary)
        stores.append(entry)
        cm.insert_record(db, entry, 'spider_stores.stores')

    return stores
예제 #7
0
def fetch_uk(body, data):
    """
    Parse the store blocks inside the ``<div class="fableft">`` section.

    Each inner ``<div>`` is treated as one store whose ``<p>`` lines form the
    address. A last line that yields a phone number is moved to the tel
    field. Every entry is inserted into the ``stores`` table and appended to
    ``store_list``.

    NOTE(review): the excerpt ends after the loop body; no final ``return``
    is visible here.

    :param body: page markup to search.
    :param data: mapping read for ``name``, ``brand_id``, ``brandname_e``
        and ``brandname_c``.
    :return: ``[]`` when the section cannot be located or extracted.
    """
    start = body.find(u'<div class="fableft">')
    if start == -1:
        print "Error in finding %s stores" % data["name"]
        return []
    body, start, end = cm.extract_closure(body[start:], ur"<div\b", ur"</div>")
    if end == 0:
        print "Error in finding %s stores" % data["name"]
        return []

    store_list = []
    for m in re.findall(ur"<div>\s*(.+?)\s*</div>", body, re.S):
        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])
        entry[cm.country_e] = data["name"]

        # No re.S here, so each <p>...</p> must sit on a single line.
        # NOTE(review): addr_list[-1] raises IndexError when a block has no
        # matching <p> line.
        addr_list = re.findall(ur"<p>\s*(.+?)\s*</p>", m)
        tel = cm.extract_tel(addr_list[-1])
        if tel != "":
            entry[cm.tel] = tel
            del addr_list[-1]

        if data["name"] == "AUSTRALIA":
            country, province, city = gs.addr_sense(", ".join(addr_list), data["name"])
            if city is not None:
                entry[cm.city_e] = city
            if province is not None:
                entry[cm.province_e] = province
        else:
            # Other countries: the second-to-last line is taken as the city
            # and the last line as the zip code.
            city = addr_list[-2].strip().upper()
            entry[cm.city_e] = city
            # presumably level 3 = city and level 1 = country -- confirm
            # against gs.look_up
            ret = gs.look_up(city, 3)
            if ret is not None and ret["country"]["name_e"] == gs.look_up("UK", 1)["name_e"]:
                entry[cm.city_e] = ret["name_e"]
            entry[cm.zip_code] = addr_list[-1].strip().upper()
        entry[cm.addr_e] = ", ".join(addr_list)
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        gs.field_sense(entry)
        # Only fields that are still empty are taken from the sensed address.
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == "":
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == "":
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print "(%s / %d) Found store: %s, %s (%s, %s, %s)" % (
            data["brandname_e"],
            data["brand_id"],
            entry[cm.name_e],
            entry[cm.addr_e],
            entry[cm.city_e],
            entry[cm.country_e],
            entry[cm.continent_e],
        )

        db.insert_record(entry, "stores")
        store_list.append(entry)
예제 #8
0
    def get_detailed_store(html, store_cat):
        store_list = []
        start = 0
        while True:
            sub_html, start, end = common.extract_closure(html, ur'<li\b', ur'</li>')
            if end == 0:
                break

            # 得到单个门店的页面代码
            html = html[end:]
            entry = common.init_store_entry(brand_id, brandname_e, brandname_c)

            m = re.findall(ur'<div class="store-title -h3a">(.+?)</div>', sub_html)
            if len(m) > 0:
                entry[common.name_e] = common.reformat_addr(m[0])
            m = re.findall(ur'<div class="store-address">(.+?)</div>', sub_html, re.S)
            if len(m) > 0:
                addr = common.reformat_addr(m[0])
                # 最后一行是否为电话号码?
                terms = addr.split(', ')
                tel = common.extract_tel(terms[-1])
                if tel != '':
                    addr = ', '.join(terms[:-1])
                    entry[common.tel] = tel
                entry[common.addr_e] = addr

            # 获得门店类型
            # store_type = [store_cat]
            type_html, type_start, type_end = common.extract_closure(sub_html, ur'<ul class="service-list">',
                                                                     ur'</ul>')
            if type_end != 0:
                store_type = [m for m in re.findall(ur'<li class="service-item">(.+?)</li>',
                                                    type_html)]
                store_type.insert(0, store_cat)
                entry[common.store_type] = ', '.join(store_type)
            else:
                entry[common.store_type] = store_cat

            # 获得经纬度
            m = re.findall(ur'data-latitude="(-?\d+\.\d+)"', sub_html)
            if len(m) > 0:
                entry[common.lat] = string.atof(m[0])
            m = re.findall(ur'data-longitude="(-?\d+\.\d+)"', sub_html)
            if len(m) > 0:
                entry[common.lng] = string.atof(m[0])

            entry[common.city_e] = common.extract_city(data[common.city_e])[0]
            entry[common.country_e] = common.reformat_addr(data[common.country_e]).strip().upper()
            gs.field_sense(entry)

            print '%s: Found store: %s, %s (%s, %s, %s)' % (
                brandname_e, entry[common.name_e], entry[common.addr_e], entry[common.city_e], entry[common.country_e],
                entry[common.continent_e])
            db.insert_record(entry, 'stores')
            store_list.append(entry)

        return store_list
예제 #9
0
def fetch_stores(data):
    """
    Fetch the stores of one city via POST and record them.

    Every ``<a href="...">`` in the response opens one store block, from
    which the name (``<h3 class="titleShop">``) and the address (first
    ``<p>`` body) are read.

    NOTE(review): the excerpt ends inside the loop; no final ``return`` is
    visible here.

    :param data: mapping read for ``post_url``, ``city_id``, ``brand_id``,
        ``brandname_e``, ``brandname_c``, ``country_e`` and ``city_e``.
    :return: ``[]`` when the request raises.
    """
    url = data['post_url']
    try:
        html = cm.post_data(url, {
            'pid': data['city_id'],
            'lang': 'en',
            'action': 'popola_box_DX'
        })
    except Exception:
        print 'Error occured in getting city list: %s' % url
        dump_data = {
            'level': 2,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<a href="(.+?)".*?>', html):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.url] = m.group(1)
        # Extract the anchor element starting at this match.
        store_html, start, end = cm.extract_closure(html[m.start():],
                                                    ur'<a href', ur'</a>')
        # Anchors for which extract_closure reports end == 0 are skipped.
        if end == 0:
            continue
        m1 = re.findall(ur'<h3 class="titleShop">(.+?)</h3>', store_html, re.S)
        if len(m1) > 0:
            entry[cm.name_e] = m1[0].strip()
        # The first paragraph body is the address; a last term that yields a
        # phone number is moved to the tel field.
        m1 = re.findall(ur'<p\b.*?>(.+?)(?:</p>|</div>)', store_html, re.S)
        if len(m1) > 0:
            terms = cm.reformat_addr(m1[0]).split(',')
            tel = cm.extract_tel(terms[-1])
            if tel != '':
                terms = terms[:-1]
                entry[cm.tel] = tel
            entry[cm.addr_e] = ', '.join([v.strip() for v in terms])

        entry['country_e'] = data['country_e']
        entry['city_e'] = data['city_e']
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')
예제 #10
0
파일: ysl.py 프로젝트: haizi-zh/firenze
    def f(m):
        store_name = m[0].strip()
        addr_str = m[1].strip()

        spl = addr_str.split('<br/>')
        store_type = cm.html2plain(spl[0].strip())

        store_addr = spl[1].strip()
        hour_idx = 2
        store_tel = ''
        for i in xrange(2, len(spl)):
            # If this is not a phone number:
            tel = cm.extract_tel(spl[i])
            if tel == '':
                store_addr += ', ' + spl[i]
                hour_idx = i + 1
            else:
                store_tel = spl[i].strip()
                hour_idx = i + 1
                break

        if hour_idx < len(spl):
            store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip()
        else:
            store_hour = ''

        # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]]))
        store_addr = cm.reformat_addr(store_addr)

        store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        cm.update_entry(store_entry,
                        {cm.continent_e: opt[cm.continent_e].strip().upper(), cm.city_e: opt[cm.city_e].strip().upper(),
                         cm.country_e: opt[cm.country_e].strip().upper(),
                         cm.name_e: cm.name_e, cm.addr_e: store_addr, cm.store_type: store_type, cm.hours: store_hour,
                         cm.tel: store_tel})
        if opt.has_key(cm.province_e):
            store_entry[cm.province_e] = opt[cm.province_e]
        else:
            store_entry[cm.province_e] = ''
        store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0]

        gs.field_sense(store_entry)
        ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e])
        if ret[1] is not None and store_entry[cm.province_e] == '':
            store_entry[cm.province_e] = ret[1]
        if ret[2] is not None and store_entry[cm.city_e] == '':
            store_entry[cm.city_e] = ret[2]
        gs.field_sense(store_entry)

        print '%s Found store: %s, %s (%s, %s)' % (
            brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e], store_entry[cm.country_e],
            store_entry[cm.continent_e])
        db.insert_record(store_entry, 'stores')

        return store_entry
예제 #11
0
def proc_store(sub, data):
    """
    Build a store entry from the markup of a single store.

    Reads the ``itemprop`` fields (name, address, postal-code, locality,
    tel), a "Fax" line and, when a "Find on a map" link is present, downloads
    that page to read coordinates from a Google Maps ``daddr`` link.

    NOTE(review): the excerpt ends inside the ``except`` handler; the rest
    of the function is not visible here.

    :param sub: markup of one store.
    :param data: mapping read for ``brand_id``, ``brandname_e``,
        ``brandname_c``, ``country`` and ``host``.
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.country_e] = data['country']
    # The itemprop="name" text is stored under cm.store_class.
    m1 = re.search(ur'<strong class="name" itemprop="name">([^<>]+)</strong>',
                   sub)
    if m1 is not None:
        entry[cm.store_class] = m1.group(1).strip()

    m1 = re.search(ur'<span itemprop="address"', sub)
    if m1 is not None:
        # Work on the address <span> element only.
        addr_sub = cm.extract_closure(sub[m1.start():], ur'<span\b',
                                      ur'</span>')[0]
        m2 = re.search(ur'<span itemprop="postal-code">([^<>]+)</span>',
                       addr_sub, re.S)
        if m2 is not None:
            entry[cm.zip_code] = m2.group(1).strip()
        m2 = re.search(ur'<span itemprop="locality">([^<>]+)</span>', addr_sub,
                       re.S)
        if m2 is not None:
            entry[cm.city_e] = cm.html2plain(m2.group(1)).strip().upper()
        entry[cm.addr_e] = cm.reformat_addr(addr_sub)

    m2 = re.search(ur'<span itemprop="tel">([^<>]+)</span>', sub, re.S)
    if m2 is not None:
        entry[cm.tel] = m2.group(1).strip()

    # No re.S here, so the fax number must sit on the same line as "Fax".
    m2 = re.search(ur'Fax\b(.+?)</p>', sub)
    if m2 is not None:
        entry[cm.fax] = cm.extract_tel(m2.group(1))

    m2 = re.search(
        ur'<a href="([^"]+)"[^<>]+itemprop="url"\s*>\s*Find on a map\s*</a>',
        sub)
    if m2 is not None:
        geo_url = data['host'] + urllib.quote(m2.group(1).encode('utf-8'))
        # NOTE(review): 'brepairs' is listed twice; the resulting dict holds
        # a single 'brepairs': True item.
        param = {
            'brepairs': True,
            'restrictedtemplate': 2,
            'bretailers': True,
            'bshops': True,
            'brepairs': True
        }
        try:
            geo_body = cm.get_data(geo_url, param)
            # Coordinates are read from the "daddr=<lat>,<lng>" query value.
            m3 = re.search(
                ur'maps\.google\.com/maps\?daddr\s*=\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)',
                geo_body)
            if m3 is not None:
                entry[cm.lat] = string.atof(m3.group(1))
                entry[cm.lng] = string.atof(m3.group(2))
        except Exception, e:
            # A failed geo lookup is logged; the handler does not re-raise.
            cm.dump('Error in fetching geo info: %s, %s' % (geo_url, param),
                    log_name)
예제 #12
0
def fetch_stores(data):
    """
    POST a city query and parse each ``<ul>`` of the response as one store.

    The ``li.first`` item is the store name and the remaining ``<li>`` items
    form the address; a last item that yields a phone number is moved to the
    tel field. Each entry is appended to ``store_list`` and inserted into the
    ``stores`` table.

    NOTE(review): the excerpt ends in the outer ``except`` handler; no final
    ``return`` is visible here.

    :param data: mapping read for ``post_shops``, ``city_e``, ``brand_id``,
        ``brandname_e``, ``brandname_c`` and the ``cm.country_e`` /
        ``cm.city_e`` keys.
    :return: ``[]`` when the POST request raises.
    """
    url = data['post_shops']
    param = {'city': data['city_e'], 'paulandjoe_women': 0, 'paulandjoe_man': 0,
             'paulandjoe_sister': 0, 'paulandjoe_little': 0, 'paulandjoe_beauty': 0}
    try:
        html = cm.post_data(url, param)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    try:
        for store in (pq(tmp) for tmp in pq(html)('ul')):
            try:
                entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
                entry[cm.name_e] = cm.html2plain(store('li.first')[0].text).strip()
                entry[cm.country_e] = data[cm.country_e]
                entry[cm.city_e] = data[cm.city_e]

                # Collect the non-empty cleaned texts of the remaining items.
                addr_list = []
                for term in (cm.reformat_addr(unicode(pq(tmp))) for tmp in store('li[class!="first"]')):
                    if term != '':
                        addr_list.append(term)
                # An empty addr_list raises IndexError here, which the inner
                # handler below turns into "skip this store".
                tel = cm.extract_tel(addr_list[-1])
                if tel != '':
                    entry[cm.tel] = tel
                    del addr_list[-1]
                entry[cm.addr_e] = ', '.join(addr_list)

                gs.field_sense(entry)
                # Only fields that are still empty are taken from the sensed
                # address.
                ret = gs.addr_sense(entry[cm.addr_e])
                if ret[0] is not None and entry[cm.country_e] == '':
                    entry[cm.country_e] = ret[0]
                if ret[1] is not None and entry[cm.province_e] == '':
                    entry[cm.province_e] = ret[1]
                if ret[2] is not None and entry[cm.city_e] == '':
                    entry[cm.city_e] = ret[2]
                gs.field_sense(entry)
                print '(%s/%d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e])
                store_list.append(entry)
                db.insert_record(entry, 'stores')
            except (IndexError, TypeError) as e:
                # A store that cannot be parsed is logged and skipped.
                cm.dump(u'Error in parsing %s, %s' % (url, param), log_name)
                print traceback.format_exc()
                continue
    except Exception, e:
        # Any other failure ends the loop; only the traceback is printed.
        print traceback.format_exc()
예제 #13
0
def fetch_uk(body, data):
    """
    Parse the store blocks inside the ``<div class="fableft">`` section.

    Each inner ``<div>`` is treated as one store whose ``<p>`` lines form the
    address. A last line that yields a phone number is moved to the tel
    field. Every entry is inserted into the ``stores`` table and appended to
    ``store_list``.

    NOTE(review): the excerpt ends after the loop body; no final ``return``
    is visible here.

    :param body: page markup to search.
    :param data: mapping read for ``name``, ``brand_id``, ``brandname_e``
        and ``brandname_c``.
    :return: ``[]`` when the section cannot be located or extracted.
    """
    start = body.find(u'<div class="fableft">')
    if start == -1:
        print 'Error in finding %s stores' % data['name']
        return []
    body, start, end = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')
    if end == 0:
        print 'Error in finding %s stores' % data['name']
        return []

    store_list = []
    for m in re.findall(ur'<div>\s*(.+?)\s*</div>', body, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['name']

        # No re.S here, so each <p>...</p> must sit on a single line.
        # NOTE(review): addr_list[-1] raises IndexError when a block has no
        # matching <p> line.
        addr_list = re.findall(ur'<p>\s*(.+?)\s*</p>', m)
        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]

        if data['name'] == 'AUSTRALIA':
            country, province, city = gs.addr_sense(', '.join(addr_list), data['name'])
            if city is not None:
                entry[cm.city_e] = city
            if province is not None:
                entry[cm.province_e] = province
        else:
            # Other countries: the second-to-last line is taken as the city
            # and the last line as the zip code.
            city = addr_list[-2].strip().upper()
            entry[cm.city_e] = city
            # presumably level 3 = city and level 1 = country -- confirm
            # against gs.look_up
            ret = gs.look_up(city, 3)
            if ret is not None and ret['country']['name_e'] == gs.look_up('UK', 1)['name_e']:
                entry[cm.city_e] = ret['name_e']
            entry[cm.zip_code] = addr_list[-1].strip().upper()
        entry[cm.addr_e] = ', '.join(addr_list)
        entry[cm.city_e] = cm.extract_city(entry[cm.city_e])[0]

        gs.field_sense(entry)
        # Only fields that are still empty are taken from the sensed address.
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s, %s)' % (data['brandname_e'], data['brand_id'],
                                                              entry[cm.name_e], entry[cm.addr_e], entry[cm.city_e],
                                                              entry[cm.country_e], entry[cm.continent_e])

        db.insert_record(entry, 'stores')
        store_list.append(entry)
예제 #14
0
def fetch_stores(data):
    url = data['store_url']
    param = {'myid': data['key'], 'idioma': 'in'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []

    store_list = []
    for s in json.loads(body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.city_e] = cm.extract_city(data['city'])[0]
        entry[cm.name_e] = cm.reformat_addr(s['title'])

        m = re.search(ur'(.+?)-\s*<', s['key'])
        addr_list = [entry[cm.name_e]]
        if m is not None:
            m1 = re.search(ur'-+', m.group(1))
            if m1 is not None:
                tmp = [m.group(1)[:m1.start()], m.group(1)[m1.end():]]
            else:
                tmp = [m.group(1)]
            if len(tmp) > 1:
                entry[cm.tel] = cm.extract_tel(tmp[1])
            m1 = re.search(ur'\d{4,}', tmp[0])
            if m1 is not None:
                entry[cm.zip_code] = m1.group()
            addr_list.append(tmp[0].strip())
        entry[cm.addr_e] = ', '.join(addr_list)

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
            log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return store_list
예제 #15
0
def fetch_stores(data):
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    code = data['country_code']
    if gs.look_up(code, 1) is None:
        entry[cm.country_e] = cm.html2plain(data['country']).strip().upper()
    else:
        entry[cm.country_e] = code
    entry[cm.name_e] = data['store_name']
    entry[cm.city_e] = cm.extract_city(data['city'])[0]
    entry[cm.lat] = data['lat'] if data['lat'] is not None else ''
    entry[cm.lng] = data['lng'] if data['lng'] is not None else ''

    m = re.search(ur'data-boutique\s*=\s*"%s"' % data['store_id'], data['content'])
    sub = data['content'][m.end():]

    m1 = re.search(ur'<li class="isDistributeur[^<>]+>(.+?)</li>', sub)
    if m1 is not None:
        entry[cm.store_class] = cm.reformat_addr(m1.group(1))

    m1 = re.search(ur'<li class="place-title[^<>]+>(.+?)</li>', sub, re.S)
    if m1 is not None:
        entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

    m1 = re.search(ur'<li class="contacts[^<>]+>(.+?)</li>', sub, re.S)
    if m1 is not None:
        m2 = re.search(ur'<a class="popupLaunch" href="([^"]+)"', m1.group(1))
        if m2:
            entry = fetch_details(data, m2.group(1), entry)

        m2 = re.search(ur'<p>(.+?)</p>', m1.group(1), re.S)
        if m2:
            ct_list = tuple(tmp.strip() for tmp in cm.reformat_addr(m2.group(1)).split(','))
            entry[cm.tel] = cm.extract_tel(ct_list[0])
            if len(ct_list) > 1:
                entry[cm.email] = ct_list[1].strip()

    gs.field_sense(entry)
    ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    if ret[1] is not None and entry[cm.province_e] == '':
        entry[cm.province_e] = ret[1]
    if ret[2] is not None and entry[cm.city_e] == '':
        entry[cm.city_e] = ret[2]
    gs.field_sense(entry)

    cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                        entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                        entry[cm.continent_e]), log_name)
    db.insert_record(entry, 'stores')

    return tuple(entry)
예제 #16
0
    def func(item):
        """Turn one parsed store node into a saved entry.

        Reads the name from the first ``h6`` and the address from the first
        ``p``; a trailing term recognised as a phone number is moved to the
        tel field. An MD5 fingerprint of the page URL plus the map link (or
        the address when there is no map link) becomes the native id.

        :param item: callable selector for the store node.
        :return: the inserted entry, or None when its fingerprint is already
            in ``data['store_list']``.
        """
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.name_e] = cm.html2plain(item('h6')[0].text).strip()

        addr_html = unicode(pq(item('p')[0]))
        pieces = [piece.strip()
                  for piece in cm.reformat_addr(addr_html).split(',')]
        phone = cm.extract_tel(pieces[-1])
        if phone != '':
            entry[cm.tel] = phone
            pieces.pop()
        entry[cm.addr_e] = ', '.join(pieces)

        map_links = item('a.track_map[href]')
        digest = hashlib.md5()
        digest.update(url)
        if len(map_links) == 0:
            # No map link: the address itself identifies the store.
            digest.update(entry[cm.addr_e])
        else:
            href = map_links[0].attrib['href']
            digest.update(href)
            query = re.search(r'q=([^;]+?)&', cm.html2plain(href))
            if query is not None:
                entry['geo_query_param'] = query.group(1).replace('+', ' ')
        entry[cm.native_id] = digest.hexdigest()
        if entry[cm.native_id] in data['store_list']:
            # Already known store: nothing to do.
            return

        entry[cm.country_e] = data['country']
        gs.field_sense(entry)
        sensed = gs.addr_sense(entry[cm.addr_e])
        # Only fill in the geo fields that are still empty.
        for idx, field in enumerate((cm.country_e, cm.province_e, cm.city_e)):
            if sensed[idx] is not None and entry[field] == '':
                entry[field] = sensed[idx]
        gs.field_sense(entry)

        message = '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        logger.info(message)
        cm.insert_record(db, entry, data['table'])
        return entry
예제 #17
0
파일: kipling.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    """Fetch the stores of one country/city pair and save them.

    The JSON reply of ``data['store_url']`` is expected to carry an 'items'
    list; each item supplies an address (with an optional trailing phone
    term) and a shop type.

    :param data: dict read for 'store_url', 'country', 'city', 'brand_id',
        'brandname_e' and 'brandname_c'.
    :return: list of the inserted entries; [] when the request fails.
    """
    url = data['store_url']
    try:
        body = cm.get_data(url, {'country': data['country'], 'city': data['city']})
    except Exception:
        print 'Error occured: %s' % url
        cm.dump({'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']})
        return []

    results = []
    for item in json.loads(body)['items']:
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country'].strip().upper()
        place = cm.extract_city(data['city'])[0]
        # For the USA the "city" value is stored as the province instead.
        region_field = cm.province_e if entry[cm.country_e] == 'USA' else cm.city_e
        entry[region_field] = place
        gs.field_sense(entry)

        address = cm.reformat_addr(item['address'].replace(u'\\', ''))
        parts = [piece.strip() for piece in address.split(',')]
        phone = cm.extract_tel(parts[-1])
        if phone != '':
            # The last term was a phone number, not part of the address.
            entry[cm.tel] = phone
            parts.pop()
        entry[cm.addr_e] = ', '.join(parts)
        entry[cm.store_type] = item['shop_type']

        gs.field_sense(entry)
        sensed = gs.addr_sense(entry[cm.addr_e])
        # Only fill in the geo fields that are still empty.
        for idx, field in enumerate((cm.country_e, cm.province_e, cm.city_e)):
            if sensed[idx] is not None and entry[field] == '':
                entry[field] = sensed[idx]
        gs.field_sense(entry)

        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        db.insert_record(entry, 'stores')
        results.append(entry)

    return results
예제 #18
0
파일: mango.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    url = data['store_url']
    param = {'myid': data['key'], 'idioma': 'in'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching countries: %s, %s' % (url, param), log_name)
        return []

    store_list = []
    for s in json.loads(body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.city_e] = cm.extract_city(data['city'])[0]
        entry[cm.name_e] = cm.reformat_addr(s['title'])

        m = re.search(ur'(.+?)-\s*<', s['key'])
        addr_list = [entry[cm.name_e]]
        if m is not None:
            m1 = re.search(ur'-+', m.group(1))
            if m1 is not None:
                tmp = [m.group(1)[:m1.start()], m.group(1)[m1.end():]]
            else:
                tmp = [m.group(1)]
            if len(tmp) > 1:
                entry[cm.tel] = cm.extract_tel(tmp[1])
            m1 = re.search(ur'\d{4,}', tmp[0])
            if m1 is not None:
                entry[cm.zip_code] = m1.group()
            addr_list.append(tmp[0].strip())
        entry[cm.addr_e] = ', '.join(addr_list)

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)

    return store_list
예제 #19
0
파일: sergio.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    """
    Fetch the stores of one city and save them.

    Posts the city id to ``data["post_url"]``, then treats every
    ``<a href=...>`` element of the reply as one store: the link becomes the
    store URL, the ``titleShop`` heading the name and the first paragraph
    the address (with an optional trailing phone term).

    :param data: dict read for "post_url", "city_id", "brand_id",
        "brandname_e", "brandname_c", "country_e" and "city_e".
    :return: list of the inserted entries; [] when the request fails.
    """
    url = data["post_url"]
    try:
        html = cm.post_data(url, {"pid": data["city_id"], "lang": "en", "action": "popola_box_DX"})
    except Exception:
        print "Error occured in getting city list: %s" % url
        dump_data = {"level": 2, "time": cm.format_time(), "data": {"url": url}, "brand_id": data["brand_id"]}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.finditer(ur'<a href="(.+?)".*?>', html):
        store_html, start, end = cm.extract_closure(html[m.start() :], ur"<a href", ur"</a>")
        if end == 0:
            # presumably extract_closure reports "no closing tag" this way;
            # such links are skipped -- TODO confirm against cm.
            continue

        entry = cm.init_store_entry(data["brand_id"], data["brandname_e"], data["brandname_c"])
        entry[cm.url] = m.group(1)

        names = re.findall(ur'<h3 class="titleShop">(.+?)</h3>', store_html, re.S)
        if len(names) > 0:
            entry[cm.name_e] = names[0].strip()

        paragraphs = re.findall(ur"<p\b.*?>(.+?)(?:</p>|</div>)", store_html, re.S)
        if len(paragraphs) > 0:
            terms = cm.reformat_addr(paragraphs[0]).split(",")
            tel = cm.extract_tel(terms[-1])
            if tel != "":
                # The last term was a phone number, not part of the address.
                terms = terms[:-1]
                entry[cm.tel] = tel
            entry[cm.addr_e] = ", ".join([v.strip() for v in terms])

        entry["country_e"] = data["country_e"]
        entry["city_e"] = data["city_e"]
        gs.field_sense(entry)

        print "(%s / %d) Found store: %s, %s (%s, %s)" % (
            data["brandname_e"],
            data["brand_id"],
            entry[cm.name_e],
            entry[cm.addr_e],
            entry[cm.country_e],
            entry[cm.continent_e],
        )
        store_list.append(entry)
        db.insert_record(entry, "stores")

    # The failure path returns a list, so the success path must as well.
    return store_list
예제 #20
0
파일: folli.py 프로젝트: haizi-zh/firenze
def fetch_store_details(data):
    """Fetch the detail page of one store and save the store.

    Posts country code and city to ``data['url']`` and parses the
    ``store_locator`` block of the reply for the address, an optional
    trailing phone term, the title heading and the map coordinates.

    :param data: dict read for 'url', 'country_code', 'city', 'brand_id',
        'brandname_e' and 'brandname_c'.
    :return: one-element list with the inserted entry; [] when the request
        fails or the ``store_locator`` block is missing.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {'cCode': data['country_code'], 'city': data['city'], 'postsearch': 1})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    start = body.find('<div class="store_locator')
    if start == -1:
        print 'Failed processing %s' % url
        return []
    sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    # Without an address paragraph the title is searched in the whole block
    # (previously this case crashed on m.end() with m being None).
    addr_text = sub
    m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S)
    if m is not None:
        addr_list = cm.reformat_addr(m.group(1)).split(', ')
        ret = cm.extract_tel(addr_list[-1])
        if ret != '':
            # The last term was a phone number, not part of the address.
            entry[cm.tel] = ret
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)
        addr_text = sub[m.end():]

    m = re.search(ur'<div class="title locator">', addr_text)
    if m is not None:
        tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S)
        if m1 is not None:
            # NOTE(review): this overwrites the address parsed above with the
            # title heading; it looks like the store name (cm.name_e) was
            # meant -- confirm before changing.
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

    m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)', body, re.S)
    if m is not None:
        entry[cm.lat] = float(m.group(1))
        entry[cm.lng] = float(m.group(2))

    entry[cm.country_e] = data['country_code']
    entry[cm.city_e] = data['city']
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                      entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                      entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
예제 #21
0
def fetch_stores(data):
    # country=Greece&city=ATHENS&adutl=+01&kids=+02&undercolor=+06&togetmap=mapdata
    url = data['data_url']
    param = {'country': data['country'], 'city': data['city'], 'adutl': ' 01', 'kids': ' 02',
             'undercolor': ' 06', 'togetmap': 'mapdata'}
    try:
        body = cm.get_data(url, param)
    except Exception:
        cm.dump('Error in fetching stores: %s, %s' % (url, param), 'benetton_log.txt', False)
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    store_list = []
    for m in re.findall(ur'<marker (.+?)>', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        m1 = re.search(ur'name=\\"(.+?)\\"', m)
        if m1 is not None:
            entry[cm.name_e] = cm.html2plain(m1.group(1).strip().replace(u'\\', ''))
        m1 = re.search(ur'address=\\"(.+?)\\"', m)
        if m1 is not None:
            addr = cm.reformat_addr(cm.html2plain(m1.group(1)).replace(u'\\', ''))
            tel = cm.extract_tel(addr)
            if tel != '':
                entry[cm.tel] = tel
                addr = addr.replace(tel, '')
            entry[cm.addr_e] = cm.reformat_addr(addr)

        m1 = re.search(ur'lat=\\"(.+?)\\"', m)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))

        m1 = re.search(ur'lng=\\"(.+?)\\"', m)
        if m1 is not None:
            entry[cm.lng] = string.atof(m1.group(1))

        entry[cm.country_e] = data['country'].strip().upper()
        entry[cm.city_e] = cm.extract_city(data['city'])[0]
        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), 'benetton_log.txt', False)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
예제 #22
0
def proc_store(sub, data):
    """Parse one store's HTML fragment into a store entry.

    Reads the itemprop-tagged name, address, postal code, locality and
    phone from ``sub``, the fax from a "Fax ... </p>" run, and, when a
    "Find on a map" link is present, downloads that page to read the
    coordinates.

    :param sub: HTML text of a single store.
    :param data: dict read for 'brand_id', 'brandname_e', 'brandname_c',
        'country' and 'host'.

    NOTE(review): the visible body neither returns nor stores ``entry``;
    the function looks cut off after the except clause -- confirm against
    the full file.
    """
    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.country_e] = data['country']
    # The itemprop="name" text is stored as the store class, not the name.
    m1 = re.search(ur'<strong class="name" itemprop="name">([^<>]+)</strong>', sub)
    if m1 is not None:
        entry[cm.store_class] = m1.group(1).strip()

    m1 = re.search(ur'<span itemprop="address"', sub)
    if m1 is not None:
        # Take the whole address <span> including its nested spans.
        addr_sub = cm.extract_closure(sub[m1.start():], ur'<span\b', ur'</span>')[0]
        m2 = re.search(ur'<span itemprop="postal-code">([^<>]+)</span>', addr_sub, re.S)
        if m2 is not None:
            entry[cm.zip_code] = m2.group(1).strip()
        m2 = re.search(ur'<span itemprop="locality">([^<>]+)</span>', addr_sub, re.S)
        if m2 is not None:
            entry[cm.city_e] = cm.html2plain(m2.group(1)).strip().upper()
        entry[cm.addr_e] = cm.reformat_addr(addr_sub)

    m2 = re.search(ur'<span itemprop="tel">([^<>]+)</span>', sub, re.S)
    if m2 is not None:
        entry[cm.tel] = m2.group(1).strip()

    m2 = re.search(ur'Fax\b(.+?)</p>', sub)
    if m2 is not None:
        entry[cm.fax] = cm.extract_tel(m2.group(1))

    m2 = re.search(ur'<a href="([^"]+)"[^<>]+itemprop="url"\s*>\s*Find on a map\s*</a>', sub)
    if m2 is not None:
        # The link is relative to data['host']; quote it before requesting.
        geo_url = data['host'] + urllib.quote(m2.group(1).encode('utf-8'))
        # NOTE(review): 'brepairs' appears twice in this literal; the
        # duplicate has no effect.
        param = {'brepairs': True, 'restrictedtemplate': 2, 'bretailers': True, 'bshops': True, 'brepairs': True}
        try:
            geo_body = cm.get_data(geo_url, param)
            # Coordinates are read from the Google Maps directions link.
            m3 = re.search(ur'maps\.google\.com/maps\?daddr\s*=\s*(-?\d+\.\d+)\s*,\s*(-?\d+\.\d+)', geo_body)
            if m3 is not None:
                entry[cm.lat] = string.atof(m3.group(1))
                entry[cm.lng] = string.atof(m3.group(2))
        except Exception, e:
            # Best effort: a failed geo lookup is only logged.
            cm.dump('Error in fetching geo info: %s, %s' % (geo_url, param), log_name)
예제 #23
0
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for s in re.findall(ur'<div class="store_wrapper">(.+?)</div>', body, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country']

        m = re.search(ur'<h2>(.+?)</h2>', s)
        if m is not None:
            entry[cm.name_e] = cm.html2plain(m.group(1))

        m = re.search(ur'<p>(.+?)</p>', s, re.S)
        if m is not None:
            addr_list = [tmp.strip() for tmp in cm.reformat_addr(m.group(1)).split(',')]
            tel = cm.extract_tel(re.sub(re.compile('^\s*t\s*(\.|:)\s*', re.I), '', addr_list[-1]))
            if tel != '':
                if entry[cm.country_e] == 'CHINA':
                    if len(re.findall(r'\d', tel)) > 6:
                        entry[cm.tel] = tel
                        del addr_list[-1]
                else:
                    entry[cm.tel] = tel
                    del addr_list[-1]
            entry[cm.addr_e] = ', '.join(addr_list)

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
예제 #24
0
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))

    start = body.find(ur'<div class="box-testuale-right">')
    if start == -1:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
    m = re.search(ur'<div class="box-adress-store">(.+?)</div>', sub, re.S)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    entry[cm.addr_e] = cm.reformat_addr(m.group(1))
    m = re.search(ur'<h4>(.+?)</h4>', sub)
    if m is not None and 't:' in m.group(1).lower():
        entry[cm.tel] = cm.extract_tel(m.group(1))
    m = re.search(ur'<div class="box-open-store">(.+?)</div>', body, re.S)
    if m is not None:
        entry[cm.hours] = cm.reformat_addr(m.group(1))

    ret = None
    if entry[cm.lat] != '' and entry[cm.lng] != '':
        ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
    if ret is None:
        ret = gs.geocode(', '.join((entry[cm.addr_e], data['zone'])))
    if ret is not None:
        city = ''
        province = ''
        country = ''
        zip_code = ''
        tmp = ret[0]['address_components']
예제 #25
0
                    entry[cm.lng] = string.atof(s['latlong']['lng'])
                break

        store_sub = cm.extract_closure(sub[m.start():], ur'<li\b', ur'</li>')[0]

        m1 = re.search(ur'<div class="storelocator-item-title">([^<>]+)</div>', store_sub)
        if m1 is not None:
            entry[cm.name_e] = cm.html2plain(m1.group(1)).strip()

        m1 = re.search(ur'<div class="storelocator-item-address">([^<>]+)</div>', store_sub)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1)).strip()

        m1 = re.search(ur'<div class="storelocator-item-phone">([^<>]+)</div>', store_sub)
        if m1 is not None:
            entry[cm.tel] = cm.extract_tel(m1.group(1))

        m1 = re.search(ur'<div class="storelocator-item-fax">([^<>]+)</div>', store_sub)
        if m1 is not None:
            entry[cm.fax] = cm.extract_tel(m1.group(1))

        m1 = re.search(ur'<div class="storelocator-item-email">([^<>]+)</div>', store_sub)
        if m1 is not None:
            entry[cm.email] = cm.extract_email(m1.group(1))

        m1 = re.search(ur'<div class="storelocator-item-hours">([^<>]+)</div>', store_sub)
        if m1 is not None:
            entry[cm.hours] = m1.group(1).strip()

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
예제 #26
0
    for i in xrange(len(tmp) - 1):
        sub_list.append({
            'content': body1[tmp[i]['idx2']:tmp[i + 1]['idx1']],
            'name': tmp[i]['name']
        })

    for sub in sub_list:
        for m in re.findall(ur'<p>(.+?)</p>', sub['content'], re.S):
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                        data['brandname_c'])
            entry[cm.country_e] = 'UNITED KINGDOM'
            entry[cm.city_e] = sub['name']

            addr_list = cm.reformat_addr(m).split(', ')
            entry[cm.addr_e] = ', '.join(addr_list[:-1])
            entry[cm.tel] = cm.extract_tel(addr_list[-1])
            gs.field_sense(entry)
            print '(%s / %d) Found store: %s, %s (%s, %s)' % (
                data['brandname_e'], data['brand_id'], entry[cm.name_e],
                entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
            db.insert_record(entry, 'stores')

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    entry[cm.country_e] = 'UNITED KINGDOM'
    entry[cm.city_e] = u'EDINBURGH'
    entry[cm.addr_e] = u'OCEAN DRIVE, LEITH, EDINBURGH'
    entry[cm.tel] = u'0131 554 8622'

    for m in re.findall(ur'<p>(.+?)</p>', body3, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
예제 #27
0
        cm.dump('Error in fetching countries: %s' % url, log_name)
        return []
    body = cm.extract_closure(body[start + 6:], ur'\[', ur'\]')[0]

    store_list = []
    for m in re.finditer(ur'<div class="store ', body):
        s = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0]
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

        m1 = re.search(ur'<h6>([^<>]+)</h6>', s)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1).strip()

        addr_sub = cm.extract_closure(s, ur'<p>', ur'</p>')[0]
        addr_list = [term.strip() for term in cm.reformat_addr(addr_sub).split(',')]
        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)

        m1 = re.search(ur'll=(-?\d+\.\d+),(-?\d+\.\d+)', addr_sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
예제 #28
0
            cm.dump('Error in parsing %s' % m.group(1), log_name)
            continue
        sub1 = cm.extract_closure(sub[start:], ur'<div\b', ur'</div>')[0]

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.store_type] = store_type

        m1 = re.search(ur'<p class="store-item-name">(.+?)</p>', sub1, re.S)
        if m1 is not None:
            entry[cm.name_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'<p class="store-item-adress">(.+?)</p>', sub1, re.S)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

        entry[cm.tel] = cm.extract_tel(sub1)
        ret = gs.look_up(data['country_code'], 1)
        if ret is not None:
            entry[cm.country_e] = ret['name_e']
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None:
            entry[cm.province_e] = ret[1]
        if ret[2] is not None:
            entry[cm.city_e] = ret[2]
        else:
            entry[cm.city_e] = data['city'].strip().upper()

        if entry[cm.name_e] in latlng_map:
            tmp = latlng_map[entry[cm.name_e]]
            entry[cm.lat] = tmp['lat']
            entry[cm.lng] = tmp['lng']
예제 #29
0
    store_list = []
    for m in re.finditer(ur'<div class="store ', body):
        s = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0]
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])

        m1 = re.search(ur'<h6>([^<>]+)</h6>', s)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1).strip()

        addr_sub = cm.extract_closure(s, ur'<p>', ur'</p>')[0]
        addr_list = [
            term.strip() for term in cm.reformat_addr(addr_sub).split(',')
        ]
        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)

        m1 = re.search(ur'll=(-?\d+\.\d+),(-?\d+\.\d+)', addr_sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e])
        if ret[0] is not None and entry[cm.country_e] == '':
            entry[cm.country_e] = ret[0]
        if ret[1] is not None and entry[cm.province_e] == '':
예제 #30
0
        m1 = re.search(ur'<div class="storelocator-item-title">([^<>]+)</div>',
                       store_sub)
        if m1 is not None:
            entry[cm.name_e] = cm.html2plain(m1.group(1)).strip()

        m1 = re.search(
            ur'<div class="storelocator-item-address">([^<>]+)</div>',
            store_sub)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1)).strip()

        m1 = re.search(ur'<div class="storelocator-item-phone">([^<>]+)</div>',
                       store_sub)
        if m1 is not None:
            entry[cm.tel] = cm.extract_tel(m1.group(1))

        m1 = re.search(ur'<div class="storelocator-item-fax">([^<>]+)</div>',
                       store_sub)
        if m1 is not None:
            entry[cm.fax] = cm.extract_tel(m1.group(1))

        m1 = re.search(ur'<div class="storelocator-item-email">([^<>]+)</div>',
                       store_sub)
        if m1 is not None:
            entry[cm.email] = cm.extract_email(m1.group(1))

        m1 = re.search(ur'<div class="storelocator-item-hours">([^<>]+)</div>',
                       store_sub)
        if m1 is not None:
            entry[cm.hours] = m1.group(1).strip()
예제 #31
0
        if start == -1:
            cm.dump('Error in parsing %s' % m.group(1), log_name)
            continue
        sub1 = cm.extract_closure(sub[start:], ur'<div\b', ur'</div>')[0]

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.store_type] = store_type

        m1 = re.search(ur'<p class="store-item-name">(.+?)</p>', sub1, re.S)
        if m1 is not None:
            entry[cm.name_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'<p class="store-item-adress">(.+?)</p>', sub1, re.S)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

        entry[cm.tel] = cm.extract_tel(sub1)
        ret = gs.look_up(data['country_code'], 1)
        if ret is not None:
            entry[cm.country_e] = ret['name_e']
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None:
            entry[cm.province_e] = ret[1]
        if ret[2] is not None:
            entry[cm.city_e] = ret[2]
        else:
            entry[cm.city_e] = data['city'].strip().upper()

        if entry[cm.name_e] in latlng_map:
            tmp = latlng_map[entry[cm.name_e]]
            entry[cm.lat] = tmp['lat']
            entry[cm.lng] = tmp['lng']
예제 #32
0
def fetch_stores(data):
    """Fetch the boutiques of one city and save them.

    Posts the country and city ids and parses the ``boutique_store`` list
    of the reply. Each ``<li>`` holds ``<br />``-separated lines: a first
    line without digits is the store name, a line recognised as a phone
    number is the phone, and every other line belongs to the address.

    NOTE(review): ``url``, ``brand_id``, ``brandname_e`` and ``brandname_c``
    are not defined in this function; presumably they are module-level
    names -- confirm.

    :param data: dict read for 'country', 'country_id', 'city' and
        'city_id'.
    :return: list of the inserted entries; [] when the request fails or the
        reply has no ``boutique_store`` list.
    """
    country = data['country']
    country_id = data['country_id']
    city = data['city']
    city_id = data['city_id']

    try:
        html = cm.post_data(url, {'country_id': country_id, 'city_id': city_id})
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 2, 'time': cm.format_time(), 'data': data, 'brand_id': brand_id}
        cm.dump(dump_data)
        return []

    start = html.find('class="boutique_store"')
    if start == -1:
        return []
    end = html.find('</ul>', start)
    html = html[start:end]

    # Example item:
    # <li><h6>Paris</h6><p>36 Avenue Montaigne<br />+33 1 47 20 04 45<br />France</p></li>
    stores = []
    for m in re.finditer(r'<li><h6>([^<>]+)</h6><p>(.*?)</p></li>', html):
        store_item = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        # Append a terminator so the last line is matched as well.
        content = m.group(2) + r'<br />'
        addr = ''
        for idx, m1 in enumerate(re.finditer(r'(.*?)<br\s*?/>', content)):
            line = m1.group(1)
            if idx == 0 and re.match(ur'.*?\d+', line) is None:
                # A first line without digits is the store name; it is kept
                # in the address text too.
                store_item[cm.name_e] = cm.reformat_addr(line)
                addr += line + '\r\n'
                continue
            # Is this line a phone number?
            tel_str = cm.extract_tel(line)
            if tel_str != '':
                store_item[cm.tel] = tel_str
            else:
                addr += line + '\r\n'

        store_item[cm.addr_e] = cm.reformat_addr(addr)
        store_item[cm.city_e] = city
        store_item[cm.country_e] = country
        gs.field_sense(store_item)

        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, store_item[cm.name_e], store_item[cm.addr_e], store_item[cm.country_e],
            store_item[cm.continent_e])
        db.insert_record(store_item, 'stores')
        stores.append(store_item)

    # The failure paths return a list, so the success path must as well.
    return stores
예제 #33
0
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))

    start = body.find(ur'<div class="box-testuale-right">')
    if start == -1:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]
    m = re.search(ur'<div class="box-adress-store">(.+?)</div>', sub, re.S)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []
    entry[cm.addr_e] = cm.reformat_addr(m.group(1))
    m = re.search(ur'<h4>(.+?)</h4>', sub)
    if m is not None and 't:' in m.group(1).lower():
        entry[cm.tel] = cm.extract_tel(m.group(1))
    m = re.search(ur'<div class="box-open-store">(.+?)</div>', body, re.S)
    if m is not None:
        entry[cm.hours] = cm.reformat_addr(m.group(1))

    ret = None
    if entry[cm.lat] != '' and entry[cm.lng] != '':
        ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
    if ret is None:
        ret = gs.geocode(', '.join((entry[cm.addr_e], data['zone'])))
    if ret is not None:
        city = ''
        province = ''
        country = ''
        zip_code = ''
        tmp = ret[0]['address_components']
예제 #34
0
파일: max_co.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<div\s+class\s*=\s*"storeItem"', body):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

        sub = cm.extract_closure(body[m.end():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<div class="bubbleInfo">(.+?)</div>', sub)
        if m1 is not None:
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))
        m1 = re.search(ur'lat="(-?\d+\.\d+)"', sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
        m1 = re.search(ur'lng="(-?\d+\.\d+)"', sub)
        if m1 is not None:
            entry[cm.lng] = string.atof(m1.group(1))
        m1 = re.search(ur'<span>\s*Tel:\s*([^<>]+)</span>', sub)
        if m1 is not None:
            entry[cm.tel] = m1.group(1).strip()
        m1 = re.search(ur'http://maps\.google\.com/maps\?q=([^&"]+)', sub)
        if m1 is None:
            continue
        ret = gs.geocode(latlng=m1.group(1))
        if ret is None:
            tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')]
            if 'MAX' in tmp[0]:
                del tmp[0]
            if cm.extract_tel(tmp[-1])!='':
                del tmp[-1]
            if len(tmp) > 0:
                ret = gs.geocode(', '.join(tmp))
        if ret is not None:
            city = ''
            province = ''
            country = ''
            zip_code = ''
            tmp = ret[0]['address_components']
            for v in tmp:
                if 'locality' in v['types']:
                    city = v['long_name'].strip().upper()
                elif 'administrative_area_level_1' in v['types']:
                    province = v['long_name'].strip().upper()
                elif 'country' in v['types']:
                    country = v['long_name'].strip().upper()
                elif 'postal_code' in v['types']:
                    zip_code = v['long_name'].strip()
            entry[cm.country_e] = country
            entry[cm.province_e] = province
            entry[cm.city_e] = city
            entry[cm.zip_code] = zip_code
            gs.field_sense(entry)
            cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                                entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                                entry[cm.continent_e]), log_name)
            db.insert_record(entry, 'stores')
            store_list.append(entry)
        else:
            cm.dump('Error in fetching stores: latlng=%s, addr=%s' % (m1.group(1), entry[cm.addr_e]), log_name)
            continue
예제 #35
0
파일: agnesb.py 프로젝트: haizi-zh/firenze
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])

    addr_sub, info_sub = m.group(1).split('Practical Info')
    m = re.search(ur'<h2>(.+?)</h2>', addr_sub)
    if m is not None:
        entry[cm.name_e] = cm.html2plain(m.group(1))
    addr_list = []
    for term in re.findall(ur'<p>(.+?)</p>', addr_sub):
        tmp = cm.reformat_addr(term)
        if 'tel' in tmp.lower():
            tel = cm.extract_tel(tmp)
            if tel != '':
                entry[cm.tel] = tel
        elif 'fax' in tmp.lower():
            fax = cm.extract_tel(tmp)
            if fax != '':
                entry[cm.fax] = fax
        elif tmp != '':
            addr_list.append(tmp)
    entry[cm.addr_e] = ', '.join(addr_list)

    for term in (tmp.strip() for tmp in cm.reformat_addr(info_sub).split(',')):
        if '@' in term and '.' in term:
            entry[cm.email] = term
        elif 'www.' in term or '.com' in term or '.cn' in term:
            entry[cm.url] = term
예제 #36
0
파일: fcuk.py 프로젝트: haizi-zh/firenze
    for m in re.finditer(ur'<h3>\s*(.+?)\s*</h3>', body1):
        tmp.append({'idx1': m.start(), 'idx2': m.end(), 'name': m.group(1).strip().upper()})
    tmp.append({'idx1': -1})
    sub_list = []
    for i in xrange(len(tmp) - 1):
        sub_list.append({'content': body1[tmp[i]['idx2']:tmp[i + 1]['idx1']], 'name': tmp[i]['name']})

    for sub in sub_list:
        for m in re.findall(ur'<p>(.+?)</p>', sub['content'], re.S):
            entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
            entry[cm.country_e] = 'UNITED KINGDOM'
            entry[cm.city_e] = sub['name']

            addr_list = cm.reformat_addr(m).split(', ')
            entry[cm.addr_e] = ', '.join(addr_list[:-1])
            entry[cm.tel] = cm.extract_tel(addr_list[-1])
            gs.field_sense(entry)
            print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                              entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                              entry[cm.continent_e])
            db.insert_record(entry, 'stores')

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
    entry[cm.country_e] = 'UNITED KINGDOM'
    entry[cm.city_e] = u'EDINBURGH'
    entry[cm.addr_e] = u'OCEAN DRIVE, LEITH, EDINBURGH'
    entry[cm.tel] = u'0131 554 8622'

    for m in re.findall(ur'<p>(.+?)</p>', body3, re.S):
        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = 'UNITED KINGDOM'
예제 #37
0
        m = re.search(ur'<span class="street-address">(.+?)</span>', sub, re.S)
        if m is not None:
            entry[cm.addr_e] = cm.reformat_addr(m.group(1))

        m = re.search(ur'<span class="postal-code">([^<>]+)</span>', sub, re.S)
        if m is not None:
            entry[cm.zip_code] = m.group(1).strip()

        m = re.search(ur'maps\.google\.com/\?q=(-?\d+\.?\d*),(-?\d+\.?\d*)', sub)
        if m is not None:
            entry[cm.lat] = string.atof(m.group(1))
            entry[cm.lng] = string.atof(m.group(2))

        for m in re.findall(ur'<div class="tel">(.+?)</div>', sub):
            if 'voice' in m:
                entry[cm.tel] = cm.extract_tel(cm.reformat_addr(m).replace('t.', ''))
            elif 'fax' in m:
                entry[cm.fax] = cm.extract_tel(cm.reformat_addr(m).replace('f.', ''))

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
예제 #38
0
파일: ysl.py 프로젝트: haizi-zh/firenze
    def f(m):
        store_name = m[0].strip()
        addr_str = m[1].strip()

        spl = addr_str.split('<br/>')
        store_type = cm.html2plain(spl[0].strip())

        store_addr = spl[1].strip()
        hour_idx = 2
        store_tel = ''
        for i in xrange(2, len(spl)):
            # If this is not a phone number:
            tel = cm.extract_tel(spl[i])
            if tel == '':
                store_addr += ', ' + spl[i]
                hour_idx = i + 1
            else:
                store_tel = spl[i].strip()
                hour_idx = i + 1
                break

        if hour_idx < len(spl):
            store_hour = cm.html2plain(', '.join(spl[hour_idx:])).strip()
        else:
            store_hour = ''

        # store_addr = cm.reformat_addr('\r\n'.join([val.strip() for val in spl[1:-3]]))
        store_addr = cm.reformat_addr(store_addr)

        store_entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        cm.update_entry(
            store_entry, {
                cm.continent_e: opt[cm.continent_e].strip().upper(),
                cm.city_e: opt[cm.city_e].strip().upper(),
                cm.country_e: opt[cm.country_e].strip().upper(),
                cm.name_e: cm.name_e,
                cm.addr_e: store_addr,
                cm.store_type: store_type,
                cm.hours: store_hour,
                cm.tel: store_tel
            })
        if opt.has_key(cm.province_e):
            store_entry[cm.province_e] = opt[cm.province_e]
        else:
            store_entry[cm.province_e] = ''
        store_entry[cm.city_e] = cm.extract_city(store_entry[cm.city_e])[0]

        gs.field_sense(store_entry)
        ret = gs.addr_sense(store_entry[cm.addr_e], store_entry[cm.country_e])
        if ret[1] is not None and store_entry[cm.province_e] == '':
            store_entry[cm.province_e] = ret[1]
        if ret[2] is not None and store_entry[cm.city_e] == '':
            store_entry[cm.city_e] = ret[2]
        gs.field_sense(store_entry)

        print '%s Found store: %s, %s (%s, %s)' % (
            brandname_e, store_entry[cm.name_e], store_entry[cm.addr_e],
            store_entry[cm.country_e], store_entry[cm.continent_e])
        db.insert_record(store_entry, 'stores')

        return store_entry
예제 #39
0
파일: furla.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    """Scrape the stores of one city page and insert them into the 'stores' table.

    The page URL is ``data['store_url']`` followed by ``data['city_id']``.
    Stores lacking a name block or an address block are skipped.

    :param data: dict with 'store_url', 'city_id', 'brand_id', 'brandname_e',
        'brandname_c' and the cm.country_e / cm.continent_e / cm.city_e keys.
    :return: list of inserted store entries ([] if the page cannot be fetched).
    """
    url = '%s%d/' % (data['store_url'], data['city_id'])
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {'level': 0, 'time': cm.format_time(), 'data': {'url': url}, 'brand_id': data['brand_id']}
        cm.dump(dump_data)
        return []

    # Compiled once: strips a leading "tel", "Tel:", "TEL. " label.
    tel_pat = re.compile(ur'^tel[\.: ]+', re.I)

    store_list = []
    for m in re.finditer(ur'<div class="store">', html):
        store_sub, _, se = cm.extract_closure(html[m.start():], ur'<div\b', ur'</div')
        # The builtin `set` used to be compared here (always False); the
        # closure end offset is what the other extract_closure checks test.
        if se == 0:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

        start = store_sub.find('<div class="store_name">')
        if start == -1:
            continue
        sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b', ur'</div>')
        if end == 0:
            continue
        m1 = re.search(ur'<p>(.+?)</p>', sub)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1).strip()

        start = store_sub.find('<div class="store_address">')
        if start == -1:
            continue
        sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b', ur'</div>')
        if end == 0:
            continue
        m1 = re.search(ur'<p>(.+?)</p>', sub, re.S)
        if m1 is not None:
            # Comma-separated pieces: the one labelled "tel" is the phone
            # number, the rest make up the address.
            tmp = []
            for term in cm.reformat_addr(m1.group(1)).split(','):
                term = term.strip()
                if re.search(tel_pat, term) is not None:
                    entry[cm.tel] = cm.extract_tel(re.sub(tel_pat, '', term))
                else:
                    tmp.append(term)
            entry[cm.addr_e] = ', '.join(tmp)

        m1 = re.search(ur'<input\s.+?name="latitude"\s+value="(.+?)"\s*/>', store_sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
        m1 = re.search(ur'<input\s.+?name="longitude"\s+value="(.+?)"\s*/>', store_sub)
        if m1 is not None:
            entry[cm.lng] = string.atof(m1.group(1))

        entry[cm.country_e] = data[cm.country_e]
        entry[cm.continent_e] = data[cm.continent_e]
        entry[cm.city_e] = data[cm.city_e]
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                          entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                          entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # Return the collected entries so the success path matches the list
    # returned on the error path.
    return store_list
예제 #40
0
        entry[cm.country_e] = data['country']
        entry[cm.province_c] = data['province']
        ret = gs.look_up(data['province'], 2)
        if ret is not None:
            entry[cm.province_e] = ret['name_e']
        entry[cm.city_c] = city
        ret = gs.look_up(city, 3)
        if ret is not None:
            entry[cm.city_e] = ret['name_e']

        entry[cm.name_e] = cm.reformat_addr(m.group(1))

        m1 = re.search(ur'<span id="m_stlist"[^<>]*>(.+?)</span>', body[m.end():])
        if m1 is not None:
            addr_list = cm.reformat_addr(m1.group(1)).split(',')
            tel = cm.extract_tel(addr_list[-1]).strip()
            if tel != '':
                del addr_list[-1]
                entry[cm.tel] = tel
            entry[cm.addr_e] = ', '.join([tmp.strip() for tmp in addr_list])

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == '':
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == '':
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
예제 #41
0
파일: maxmara.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<div class="searchResult[^"]*"', body):
        if 'intro' in m.group():
            continue

        sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<div id=[^<>]+>(.+?)</div>', sub)
        if m1 is None:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.city_e] = data['city']

        addr_list = [tmp.strip() for tmp in cm.reformat_addr(m1.group(1)).split(',')]
        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]
        else:
            m1 = re.search(ur'Tel:([^<>]+)', sub)
            if m1 is not None:
                entry[cm.tel] = cm.extract_tel(m1.group(1))
        entry[cm.addr_e] = ', '.join(addr_list)

        m1 = re.search(ur"show_map\('(-?\d+\.\d+)'\s*,\s*'(-?\d+\.\d+)'", sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))

        start = sub.find(ur'Opening hours:')
        if start != -1:
            entry[cm.hours] = cm.extract_closure(sub[start:], ur'<p>', ur'</p>')[0].strip()

        ret = None
        if entry[cm.lat]!='' and entry[cm.lng]!='':
            ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
        if ret is None:
            tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')]
            if 'Max Mara' in tmp[0]:
                del tmp[0]
            if len(tmp) > 0:
                ret = gs.geocode(', '.join(tmp))
        if ret is not None:
            city = ''
            province = ''
            country = ''
            zip_code = ''
            tmp = ret[0]['address_components']
            for v in tmp:
                if 'locality' in v['types']:
                    city = v['long_name'].strip().upper()
                elif 'administrative_area_level_1' in v['types']:
                    province = v['long_name'].strip().upper()
                elif 'country' in v['types']:
                    country = v['long_name'].strip().upper()
                elif 'postal_code' in v['types']:
                    zip_code = v['long_name'].strip()
            entry[cm.country_e] = country
            entry[cm.province_e] = province
            entry[cm.city_e] = city
            entry[cm.zip_code] = zip_code

        gs.field_sense(entry)
        cm.dump('(%s / %d) Found store: %s, %s (%s, %s)' % (data['brandname_e'], data['brand_id'],
                                                            entry[cm.name_e], entry[cm.addr_e], entry[cm.country_e],
                                                            entry[cm.continent_e]), log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
예제 #42
0
 for m in re.findall(ur'<p class="boutique-info-cadre-tel">(.*)</p>', s,
                     re.S):
     if len(m.strip()) == 0:
         break
     for m1 in re.findall(ur'<span itemprop="telephone">(.*?)</span>',
                          m):
         if len(m1.strip()) > 0:
             entry[cm.tel] = m1.strip()
         break
     for m1 in re.findall(ur'<span itemprop="faxNumber">(.*?)</span>',
                          m):
         if len(m1.strip()) > 0:
             entry[cm.fax] = m1.strip()
         break
     if entry[cm.tel] == '' and entry[cm.fax] == '':
         entry[cm.tel] = cm.extract_tel(m.strip())
 for m in re.findall(
         ur'<p class="boutique-info-cadre-horaires">(.*?)</p>', s,
         re.S):
     if len(m.strip()) > 0:
         entry[cm.hours] = m.strip()
     break
 for m in re.findall(
         ur'<p class="boutique-info-cadre-adresse".*?>(.*?)</p>', s,
         re.S):
     if len(m.strip()) == 0:
         break
     street_addr = ''
     zip_code = ''
     city = ''
     country = ''
예제 #43
0
파일: folli.py 프로젝트: haizi-zh/firenze
def fetch_store_details(data):
    """Fetch one store's detail page via POST and insert it into the 'stores' table.

    :param data: dict with 'url', 'country_code', 'city', 'brand_id',
        'brandname_e' and 'brandname_c'.
    :return: one-element list holding the stored entry, or [] when the request
        fails or the page has no store-locator block.
    """
    url = data['url']
    try:
        body = cm.post_data(url, {
            'cCode': data['country_code'],
            'city': data['city'],
            'postsearch': 1
        })
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    start = body.find('<div class="store_locator')
    if start == -1:
        print 'Failed processing %s' % url
        return []
    sub = cm.extract_closure(body[start:], ur'<div\b', ur'</div>')[0]

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                data['brandname_c'])
    m = re.search(ur'<p><span class="bold">Address</span>(.+?)</p>', sub, re.S)
    if m is not None:
        # A trailing telephone number is split off from the address.
        addr_list = cm.reformat_addr(m.group(1)).split(', ')
        ret = cm.extract_tel(addr_list[-1])
        if ret != '':
            entry[cm.tel] = ret
            del addr_list[-1]
        entry[cm.addr_e] = ', '.join(addr_list)
        addr_text = sub[m.end():]
    else:
        # No "Address" paragraph: search the whole block instead of calling
        # m.end() on None, which raised AttributeError.
        addr_text = sub

    m = re.search(ur'<div class="title locator">', addr_text)
    if m is not None:
        tmp = cm.extract_closure(addr_text[m.start():], ur'<div\b',
                                 ur'</div>')[0]
        m1 = re.search(ur'<h2>(.+?)</h2>', tmp, re.S)
        if m1 is not None:
            # The locator title, when present, replaces the parsed address.
            entry[cm.addr_e] = cm.reformat_addr(m1.group(1))

    m = re.search(ur'google.maps.LatLng\(\s*(-?\d+\.\d+),\s*(-?\d+\.\d+)',
                  body, re.S)
    if m is not None:
        entry[cm.lat] = string.atof(m.group(1))
        entry[cm.lng] = string.atof(m.group(2))

    entry[cm.country_e] = data['country_code']
    entry[cm.city_e] = data['city']
    gs.field_sense(entry)
    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        data['brandname_e'], data['brand_id'], entry[cm.name_e],
        entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return [entry]
예제 #44
0
파일: furla.py 프로젝트: haizi-zh/firenze
def fetch_stores(data):
    """Collect all stores shown on a single city page and persist them.

    The URL is built from ``data['store_url']`` and ``data['city_id']``. A
    store is dropped when its name or address ``<div>`` cannot be located.

    :param data: dict with 'store_url', 'city_id', 'brand_id', 'brandname_e',
        'brandname_c' and the cm.country_e / cm.continent_e / cm.city_e keys.
    :return: list of inserted store entries ([] if the page cannot be fetched).
    """
    url = '%s%d/' % (data['store_url'], data['city_id'])
    try:
        html = cm.get_data(url)
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 0,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': data['brand_id']
        }
        cm.dump(dump_data)
        return []

    # Leading telephone label ("tel", "Tel:", "TEL. " ...), compiled once.
    tel_pat = re.compile(ur'^tel[\.: ]+', re.I)

    store_list = []
    for m in re.finditer(ur'<div class="store">', html):
        store_sub, _, se = cm.extract_closure(html[m.start():], ur'<div\b',
                                              ur'</div')
        # Test the closure end offset; the previous `set == 0` compared the
        # builtin type and could never be true.
        if se == 0:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])

        start = store_sub.find('<div class="store_name">')
        if start == -1:
            continue
        sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b',
                                             ur'</div>')
        if end == 0:
            continue
        m1 = re.search(ur'<p>(.+?)</p>', sub)
        if m1 is not None:
            entry[cm.name_e] = m1.group(1).strip()

        start = store_sub.find('<div class="store_address">')
        if start == -1:
            continue
        sub, start, end = cm.extract_closure(store_sub[start:], ur'<div\b',
                                             ur'</div>')
        if end == 0:
            continue
        m1 = re.search(ur'<p>(.+?)</p>', sub, re.S)
        if m1 is not None:
            # Split the paragraph on commas; the "tel"-labelled piece becomes
            # the phone number, everything else is the address.
            tmp = []
            for term in cm.reformat_addr(m1.group(1)).split(','):
                term = term.strip()
                if re.search(tel_pat, term) is not None:
                    entry[cm.tel] = cm.extract_tel(re.sub(tel_pat, '', term))
                else:
                    tmp.append(term)
            entry[cm.addr_e] = ', '.join(tmp)

        m1 = re.search(ur'<input\s.+?name="latitude"\s+value="(.+?)"\s*/>',
                       store_sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
        m1 = re.search(ur'<input\s.+?name="longitude"\s+value="(.+?)"\s*/>',
                       store_sub)
        if m1 is not None:
            entry[cm.lng] = string.atof(m1.group(1))

        entry[cm.country_e] = data[cm.country_e]
        entry[cm.continent_e] = data[cm.continent_e]
        entry[cm.city_e] = data[cm.city_e]
        gs.field_sense(entry)
        print '(%s / %d) Found store: %s, %s (%s, %s)' % (
            data['brandname_e'], data['brand_id'], entry[cm.name_e],
            entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e])
        store_list.append(entry)
        db.insert_record(entry, 'stores')

    # Return the collected entries so the success path matches the list
    # returned on the error path.
    return store_list
예제 #45
0
def fetch_stores(data):
    """POST a country/city query and store every boutique in the response.

    ``url``, ``brand_id``, ``brandname_e`` and ``brandname_c`` are read from
    the enclosing module scope.

    :param data: dict with 'country', 'country_id', 'city' and 'city_id'.
    :return: list of inserted store entries; [] when the request fails or the
        response has no ``boutique_store`` list.
    """
    country = data['country']
    country_id = data['country_id']
    city = data['city']
    city_id = data['city_id']

    try:
        html = cm.post_data(url, {
            'country_id': country_id,
            'city_id': city_id
        })
    except Exception:
        print 'Error occured: %s' % url
        dump_data = {
            'level': 2,
            'time': cm.format_time(),
            'data': data,
            'brand_id': brand_id
        }
        cm.dump(dump_data)
        return []

    start = html.find('class="boutique_store"')
    if start == -1:
        return []
    end = html.find('</ul>', start)
    html = html[start:end]

    # Sample item:
    # <li><h6>Paris</h6><p>36 Avenue Montaigne<br />+33 1 47 20 04 45<br />France</p></li>
    stores = []
    for m in re.finditer(r'<li><h6>([^<>]+)</h6><p>(.*?)</p></li>', html):
        store_item = cm.init_store_entry(brand_id, brandname_e, brandname_c)
        # Append a terminator so the last line is matched by the regex too.
        content = m.group(2) + r'<br />'
        addr = ''
        for idx, m1 in enumerate(re.finditer(r'(.*?)<br\s*?/>', content), 1):
            line = m1.group(1)
            # A first line without any digit is taken as the store name (and
            # is kept as part of the address as well).
            if idx == 1 and re.match(ur'.*?\d+', line) is None:
                store_item[cm.name_e] = cm.reformat_addr(line)
                addr += line + '\r\n'
            else:
                # Is this line a telephone number?
                tel_str = cm.extract_tel(line)
                if tel_str != '':
                    store_item[cm.tel] = tel_str
                else:
                    addr += line + '\r\n'

        store_item[cm.addr_e] = cm.reformat_addr(addr)
        store_item[cm.city_e] = city
        store_item[cm.country_e] = country
        gs.field_sense(store_item)

        # term = cm.geo_translate(country)
        # if len(term) == 0:
        #     print 'Error in geo translating: %s' % country
        # else:
        #     store_item[cm.continent_c] = term[cm.continent_c]
        #     store_item[cm.continent_e] = term[cm.continent_e]
        #     store_item[cm.country_c] = term[cm.country_c]
        #     store_item[cm.country_e] = term[cm.country_e]
        # store_item[cm.brandname_e] = brandname_e
        # store_item[cm.brandname_c] = brandname_c
        # cm.chn_check(store_item)
        print '%s: Found store: %s, %s (%s, %s)' % (
            brandname_e, store_item[cm.name_e], store_item[cm.addr_e],
            store_item[cm.country_e], store_item[cm.continent_e])
        db.insert_record(store_item, 'stores')
        stores.append(store_item)

    # Return the collected entries so the success path matches the list
    # returned on the error paths.
    return stores
예제 #46
0
        entry[cm.country_e] = data["country"]
        entry[cm.province_c] = data["province"]
        ret = gs.look_up(data["province"], 2)
        if ret is not None:
            entry[cm.province_e] = ret["name_e"]
        entry[cm.city_c] = city
        ret = gs.look_up(city, 3)
        if ret is not None:
            entry[cm.city_e] = ret["name_e"]

        entry[cm.name_e] = cm.reformat_addr(m.group(1))

        m1 = re.search(ur'<span id="m_stlist"[^<>]*>(.+?)</span>', body[m.end() :])
        if m1 is not None:
            addr_list = cm.reformat_addr(m1.group(1)).split(",")
            tel = cm.extract_tel(addr_list[-1]).strip()
            if tel != "":
                del addr_list[-1]
                entry[cm.tel] = tel
            entry[cm.addr_e] = ", ".join([tmp.strip() for tmp in addr_list])

        gs.field_sense(entry)
        ret = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
        if ret[1] is not None and entry[cm.province_e] == "":
            entry[cm.province_e] = ret[1]
        if ret[2] is not None and entry[cm.city_e] == "":
            entry[cm.city_e] = ret[2]
        gs.field_sense(entry)

        cm.dump(
            "(%s / %d) Found store: %s, %s (%s, %s)"
예제 #47
0
    m3 = re.search(ur'<div id="mallhotel">([^<>]+)', body)
    val = cm.html2plain(m3.group(1)).strip() if m3 else ''
    if val != '':
        addr_list.append(val)
    m3 = re.search(ur'<div id="address1">([^<>]+)', body)
    val = cm.html2plain(m3.group(1)).strip() if m3 else ''
    if val != '':
        addr_list.append(val)
    m3 = re.search(ur'<div id="address2">([^<>]+)', body)
    val = cm.html2plain(m3.group(1)).strip() if m3 else ''
    if val != '':
        addr_list.append(val)
    entry[cm.addr_e] = ', '.join(addr_list)

    m = re.search(ur'<div id="phone">([^<>]+)</div>', body)
    entry[cm.tel] = cm.extract_tel(m.group(1)) if m else ''
    m = re.search(ur'<div id="fax">([^<>]+)</div>', body)
    entry[cm.fax] = cm.extract_tel(m.group(1)) if m else ''
    m = re.search(ur'<div id="email">([^<>]+)</div>', body)
    entry[cm.email] = m.group(1).strip() if m else ''

    m = re.search(ur'<div id="opening">', body)
    if m:
        hours_list = []
        for m in re.findall(ur'<li>([^<>]+)', cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0]):
            if m.strip() != '':
                hours_list.append(m.strip())
        entry[cm.hours] = ', '.join(hours_list)

    m = re.search(ur'<div id="products">', body)
    if m:
예제 #48
0
def fetch_stores(data):
    url = data['url']
    try:
        body = cm.get_data(url)
    except Exception:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    store_list = []
    for m in re.finditer(ur'<div class="searchResult[^"]*"', body):
        if 'intro' in m.group():
            continue

        sub = cm.extract_closure(body[m.start():], ur'<div\b', ur'</div>')[0]
        m1 = re.search(ur'<div id=[^<>]+>(.+?)</div>', sub)
        if m1 is None:
            continue

        entry = cm.init_store_entry(data['brand_id'], data['brandname_e'],
                                    data['brandname_c'])
        entry[cm.country_e] = data['country']
        entry[cm.city_e] = data['city']

        addr_list = [
            tmp.strip() for tmp in cm.reformat_addr(m1.group(1)).split(',')
        ]
        tel = cm.extract_tel(addr_list[-1])
        if tel != '':
            entry[cm.tel] = tel
            del addr_list[-1]
        else:
            m1 = re.search(ur'Tel:([^<>]+)', sub)
            if m1 is not None:
                entry[cm.tel] = cm.extract_tel(m1.group(1))
        entry[cm.addr_e] = ', '.join(addr_list)

        m1 = re.search(ur"show_map\('(-?\d+\.\d+)'\s*,\s*'(-?\d+\.\d+)'", sub)
        if m1 is not None:
            entry[cm.lat] = string.atof(m1.group(1))
            entry[cm.lng] = string.atof(m1.group(2))

        start = sub.find(ur'Opening hours:')
        if start != -1:
            entry[cm.hours] = cm.extract_closure(sub[start:], ur'<p>',
                                                 ur'</p>')[0].strip()

        ret = None
        if entry[cm.lat] != '' and entry[cm.lng] != '':
            ret = gs.geocode(latlng='%f,%f' % (entry[cm.lat], entry[cm.lng]))
        if ret is None:
            tmp = [tmp1.strip() for tmp1 in entry[cm.addr_e].split(',')]
            if 'Max Mara' in tmp[0]:
                del tmp[0]
            if len(tmp) > 0:
                ret = gs.geocode(', '.join(tmp))
        if ret is not None:
            city = ''
            province = ''
            country = ''
            zip_code = ''
            tmp = ret[0]['address_components']
            for v in tmp:
                if 'locality' in v['types']:
                    city = v['long_name'].strip().upper()
                elif 'administrative_area_level_1' in v['types']:
                    province = v['long_name'].strip().upper()
                elif 'country' in v['types']:
                    country = v['long_name'].strip().upper()
                elif 'postal_code' in v['types']:
                    zip_code = v['long_name'].strip()
            entry[cm.country_e] = country
            entry[cm.province_e] = province
            entry[cm.city_e] = city
            entry[cm.zip_code] = zip_code

        gs.field_sense(entry)
        cm.dump(
            '(%s / %d) Found store: %s, %s (%s, %s)' %
            (data['brandname_e'], data['brand_id'], entry[cm.name_e],
             entry[cm.addr_e], entry[cm.country_e], entry[cm.continent_e]),
            log_name)
        db.insert_record(entry, 'stores')
        store_list.append(entry)
예제 #49
0
    store_list = []
    for m in sub_list:
        city_id = m['city_id']
        sub_html = m['html']
        entry = common.init_store_entry(brand_id, brandname_e, brandname_c)
        for m1 in re.findall(ur'<div class="store-desc">(.+?)</div>', sub_html, re.S):
            entry[common.name_e] = common.reformat_addr(m1)
            break

        for m1 in re.findall(ur'<div class="store-terminal">(.+?)</div>', sub_html, re.S):
            entry[common.addr_e] = common.reformat_addr(m1)
            break

        for m1 in re.findall(ur'<div class="store-tel">(.+?)</div>', sub_html, re.S):
            entry[common.tel] = common.extract_tel(m1)
            break

        for m1 in re.findall(ur'<div class="store-opening-hour">\s*?(?:Opening Hours:)?(.+?)</div>', sub_html,
                             re.S):
            entry[common.hours] = common.reformat_addr(m1)
            break

        m1 = re.findall(ur'href="/(.+?)" title="View on map"', sub_html)
        if len(m1) > 0:
            entry[common.url] = host + '/' + m1[0]
            lat, lng = get_coordinates(entry[common.url])
            common.update_entry(entry, {common.lat: lat, common.lng: lng})

        # geo
        city_e = cities[city_id]['name'].strip()
예제 #50
0
     if len(m.strip()) >= 0:
         entry[cm.store_type] = m.strip()
     break
 for m in re.findall(ur'<p class="boutique-info-cadre-tel">(.*)</p>', s, re.S):
     if len(m.strip()) == 0:
         break
     for m1 in re.findall(ur'<span itemprop="telephone">(.*?)</span>', m):
         if len(m1.strip()) > 0:
             entry[cm.tel] = m1.strip()
         break
     for m1 in re.findall(ur'<span itemprop="faxNumber">(.*?)</span>', m):
         if len(m1.strip()) > 0:
             entry[cm.fax] = m1.strip()
         break
     if entry[cm.tel] == '' and entry[cm.fax] == '':
         entry[cm.tel] = cm.extract_tel(m.strip())
 for m in re.findall(ur'<p class="boutique-info-cadre-horaires">(.*?)</p>', s, re.S):
     if len(m.strip()) > 0:
         entry[cm.hours] = m.strip()
     break
 for m in re.findall(ur'<p class="boutique-info-cadre-adresse".*?>(.*?)</p>', s, re.S):
     if len(m.strip()) == 0:
         break
     street_addr = ''
     zip_code = ''
     city = ''
     country = ''
     for m1 in re.findall(ur'<span itemprop="streetAddress">(.*?)</span>', m, re.S):
         if len(m1.strip()) > 0:
             street_addr = cm.reformat_addr(m1)
         break
예제 #51
0
def get_store_details(data):
    """
    Download a single store page, turn it into a store entry and save it.

    :param data: dict; the keys read here are 'url', 'name', 'country'
        and 'city'.
    :return: the populated entry once it has been passed to
        db.insert_record, or an empty list when the page cannot be
        fetched or the breadcrumb block is not found.
    """
    url = data['url']
    try:
        html = cm.get_data(url)
    except Exception:
        # Report the failed URL and bail out with an empty result.
        print 'Error occured: %s' % url
        cm.dump({
            'level': 1,
            'time': cm.format_time(),
            'data': {
                'url': url
            },
            'brand_id': brand_id
        })
        return []

    entry = cm.init_store_entry(brand_id, brandname_e, brandname_c)
    entry[cm.name_e] = data['name']
    entry[cm.url] = data['url']

    # The address is taken from the last <li>...</li> of the breadcrumb list.
    crumb_pos = html.find(ur'<div class="storelocator-breadcrumbs">')
    if crumb_pos == -1:
        return []
    crumbs, _, crumb_end = cm.extract_closure(html[crumb_pos:], ur'<ul>',
                                              ur'</ul>')
    if crumb_end == 0:
        return []
    items = re.findall(ur'<li>(.+?)</li>', crumbs, re.S)
    if items:
        entry[cm.addr_e] = cm.reformat_addr(items[-1])

    # Latitude and longitude come from the first google.maps.LatLng call.
    coords = re.findall(
        ur'position: new google.maps.LatLng\((-?\d+\.\d+).*?(-?\d+\.\d+)\)',
        html)
    if coords:
        lat_text, lng_text = coords[0]
        cm.update_entry(entry, {
            cm.lat: string.atof(lat_text),
            cm.lng: string.atof(lng_text)
        })

    # Contact box: phone number first, whatever remains is the opening hours.
    contact = re.search(ur'<div class="contact right">(.+?)</div>', html, re.S)
    if contact is not None:
        contact_html = contact.group(1)
        phone_pat = re.compile(ur'<p class="phone">(.+?)</p>')
        phone = phone_pat.search(contact_html)
        if phone is not None:
            entry[cm.tel] = cm.extract_tel(phone.group(1))
            # Remove the phone paragraph so it does not end up in the hours.
            contact_html = phone_pat.sub('', contact_html)
        hours = [
            part.strip()
            for part in cm.reformat_addr(contact_html).split(',')
        ]
        # Drop a leading "Opening hours" caption.
        if 'opening hours' in hours[0].lower():
            hours = hours[1:]
        entry[cm.hours] = ', '.join(hours)

    # Geo
    country_name = data['country']
    city_name = data['city']
    cm.update_entry(entry, {cm.country_e: country_name, cm.city_e: city_name})
    gs.field_sense(entry)
    sensed = gs.addr_sense(entry[cm.addr_e], entry[cm.country_e])
    # Fill province (index 1) and city (index 2) only where still empty.
    for idx, field in ((1, cm.province_e), (2, cm.city_e)):
        if sensed[idx] is not None and entry[field] == '':
            entry[field] = sensed[idx]
    gs.field_sense(entry)

    print '(%s / %d) Found store: %s, %s (%s, %s)' % (
        brandname_e, brand_id, entry[cm.name_e], entry[cm.addr_e],
        entry[cm.country_e], entry[cm.continent_e])
    db.insert_record(entry, 'stores')
    return entry
예제 #52
0
파일: agnesb.py 프로젝트: haizi-zh/firenze
    m = re.search(ur'<div id="coordonnees"[^<>]*>(.+?)</div>', body, re.S)
    if m is None:
        cm.dump('Error in fetching stores: %s' % url, log_name)
        return []

    entry = cm.init_store_entry(data['brand_id'], data['brandname_e'], data['brandname_c'])

    addr_sub, info_sub = m.group(1).split('Practical Info')
    m = re.search(ur'<h2>(.+?)</h2>', addr_sub)
    if m is not None:
        entry[cm.name_e] = cm.html2plain(m.group(1))
    addr_list = []
    for term in re.findall(ur'<p>(.+?)</p>', addr_sub):
        tmp = cm.reformat_addr(term)
        if 'tel' in tmp.lower():
            tel = cm.extract_tel(tmp)
            if tel != '':
                entry[cm.tel] = tel
        elif 'fax' in tmp.lower():
            fax = cm.extract_tel(tmp)
            if fax != '':
                entry[cm.fax] = fax
        elif tmp != '':
            addr_list.append(tmp)
    entry[cm.addr_e] = ', '.join(addr_list)

    for term in (tmp.strip() for tmp in cm.reformat_addr(info_sub).split(',')):
        if '@' in term and '.' in term:
            entry[cm.email] = term
        elif 'www.' in term or '.com' in term or '.cn' in term:
            entry[cm.url] = term