Exemple #1
0
def get_image_path(brand_id):
    """
    Build the image storage directories for a brand.

    The brand's English name is looked up with ``cm.fetch_brand_by_id`` and
    normalised with ``cm.norm_brand_name``. Both directories live under
    ``<base_path>/images/<brand_id>_<brand_name>/``.

    :param brand_id: identifier handed to ``cm.fetch_brand_by_id``.
    :return: dict with the keys ``'full'`` and ``'thumb'``, each mapping to
        the path of the sub-directory of the same name.
    """
    brand_record = cm.fetch_brand_by_id(brand_id)
    normalized_name = cm.norm_brand_name(brand_record['brandname_e'])
    # Both variants share the same per-brand root; only the leaf differs.
    brand_root = os.path.join(
        base_path, 'images', '{0}_{1}'.format(brand_id, normalized_name))
    return {kind: os.path.join(brand_root, kind) for kind in ('full', 'thumb')}
Exemple #2
0
def update_tags_mapping(brand_id, region, tag_raw, tag_name, serialize=True):
    """
    Update the tags_mapping table.

    Depending on the region, a mapping is maintained between a tag's raw form
    (as it appears in the page source) and the form used for display.

    :param brand_id: brand identifier; key of the in-memory ``tags_mapping``
        cache and part of the mapping file name.
    :param region: region code; lower-cased before it is used in the file
        name.
    :param tag_raw: raw tag; a ``unicode`` value is encoded as UTF-8.
    :param tag_name: display name of the tag; a ``unicode`` value is encoded
        as UTF-8.
    :param serialize: whether to write the updated mapping to the data file.
    """

    def _as_utf8(value):
        # Only unicode objects are encoded; anything else passes through.
        return value.encode('utf-8') if isinstance(value, unicode) else value

    brand_record = cm.fetch_brand_by_id(brand_id)
    brand_name = cm.norm_brand_name(brand_record['brandname_e'])
    data_dir = get_data_path(brand_id)
    region = region.lower()
    mapping_file = '{0}_{1}_{2}_tags_mapping.json'.format(
        brand_id, brand_name, region)
    fname = os.path.normpath(os.path.join(data_dir, mapping_file))

    # NOTE(review): the cache is keyed by brand_id only, while the file name
    # also contains the region -- once a brand is cached, calls for another
    # region reuse (and serialize) the same dict. Confirm this is intended.
    if brand_id in tags_mapping:
        data = tags_mapping[brand_id]
    else:
        try:
            with open(fname, 'r') as f:
                data = json.load(f, encoding='utf-8')
        except (ValueError, IOError):
            # Missing, unreadable or malformed file: start with an empty map.
            data = {}
        tags_mapping[brand_id] = data

    key = _as_utf8(tag_raw)
    data[key] = _as_utf8(tag_name)

    if not serialize:
        return
    cm.make_sure_path_exists(data_dir)
    with open(fname, 'w') as f:
        json.dump(data, f, ensure_ascii=False, encoding='utf-8')
Exemple #3
0
def fetch_products(region, category, gender, refresh_post_data=False):
    """
    Fetch product information.

    The filter combinations (POST payloads) for ``category``/``gender`` are
    cached in a JSON file inside the brand's data directory. They are fetched
    with ``fetch_filter`` when that file is missing or when
    ``refresh_post_data`` is true, and loaded from the file otherwise. Every
    combination not yet marked as processed is then paged through, and
    ``fetch_product_details`` is called once per distinct product URL. After
    each combination the file is rewritten with its ``processed`` flag set.

    ``brand_id`` is not a parameter; it is read from the enclosing scope.

    :param region: region code; selects the host in ``hosts["data_host"]``.
    :param category: category identifier sent as the form handler's pageId.
    :param gender: gender value; sent in the POST data and appended to the
        host URL.
    :param refresh_post_data: rebuild the cached filter combinations even if
        the cache file exists.
    """
    page_id_key = ("/vuitton/ecommerce/commerce/catalog/"
                   "FindProductsFormHandler.pageId")
    gender_key = ("/vuitton/ecommerce/commerce/catalog/"
                  "FindProductsFormHandler.gender")
    page_number_key = ("/vuitton/ecommerce/commerce/catalog/"
                       "FindProductsFormHandler.pageNumber")

    # Locate the file holding the filter information.
    brand_record = cm.fetch_brand_by_id(brand_id)
    brand_name = cm.norm_brand_name(brand_record['brandname_e'])
    data_dir = get_data_path(brand_id)
    cm.make_sure_path_exists(data_dir)
    cache_name = '{0}_{1}_{2}_{3}_{4}.json'.format(
        brand_id, brand_name, category.replace('/', '_'), gender, region)
    fname = os.path.normpath(os.path.join(data_dir, cache_name))

    if os.path.isfile(fname) and not refresh_post_data:
        with open(fname, 'r') as f:
            filter_combinations = json.load(f, encoding='utf-8')
    else:
        logger.info(
            'Fetch filter set for {0}, {1}'.format(category,
                                                   gender).decode('utf-8'))
        seed = {
            'post_data': basic_query.copy(),
            'tags': {},
            'processed': False
        }
        filter_combinations = fetch_filter(region, category, gender, 0, seed)

        # Add one more combination that carries only category and gender.
        post_data = basic_query.copy()
        post_data[page_id_key] = category
        post_data[gender_key] = gender
        filter_combinations.append({
            'post_data': post_data,
            'processed': False,
            'tags': {}
        })
        with open(fname, 'w') as f:
            json.dump(filter_combinations, f)

    processed_urls = set()
    for filter_data in filter_combinations:
        # Skip POST data that has already been processed.
        if filter_data['processed']:
            continue

        tags = filter_data['tags']
        tags['brand_id'] = 10226
        tags['brandname_e'] = 'Louis Vuitton'
        tags['brandname_c'] = '路易威登'
        tags['category'] = category
        tags['gender'] = gender
        tags['region'] = region

        page = 1
        while True:
            filter_data['post_data'][page_number_key] = page
            endpoint = hosts["data_host"][region] + gender
            response = cm.retry_helper(
                lambda val: cm.post_data(
                    url=val, data=filter_data['post_data'], client="iPad"),
                param=endpoint,
                logger=logger,
                except_class=(URLError, socket.timeout),
                retry_delay=10)
            if response is None:
                # NOTE(review): the same page is requested again; if the
                # helper keeps returning None this never terminates.
                continue

            # Extract the list of products on this page.
            product_list = pq(response['body'])('li[data-url]')
            if len(product_list) == 0:
                break
            page += 1

            for item in product_list:
                url = item.attrib['data-url']
                # Ignore URLs without a trailing dash-free segment and URLs
                # that were already handled during this run.
                if re.search(r'[^-]+$', url) is None or url in processed_urls:
                    continue
                processed_urls.add(url)
                fetch_product_details(region, url, filter_data)

        # Persist progress so finished combinations are skipped next time.
        filter_data['processed'] = True
        with open(fname, 'w') as f:
            json.dump(filter_combinations, f)
Exemple #4
0
def fetch_image(body, model, refetch=False):
    """
    Download the images referenced by a product detail page to disk.

    For every ``<img>`` matched by ``#productSheetSlideshow ul.bigs li img``
    the "tablet" / "productMain" rendition named in the page's ``RENDITIONS``
    script variable is downloaded and written to the brand's ``full`` image
    directory. The file name is the SHA-1 hex digest of the rendition URL plus
    the extension reported by the downloader. Nothing is written to a
    database here; the caller receives the metadata instead.

    ``brand_id``, ``hosts`` and ``logger`` are read from the enclosing scope.

    :param body: HTML of the product detail page.
    :param model: product model; currently unused.
    :param refetch: whether to force re-downloading; currently unused, every
        image is always downloaded.
    :return: list of ``['True', info]`` entries, one per stored image, where
        ``info`` has the keys ``'checksum'`` (MD5 hex digest of the image
        bytes), ``'url'`` (rendition URL) and ``'path'``
        (``'full/<file name>'``).
    """
    image_dirs = get_image_path(brand_id)
    image_dir = image_dirs['full']
    cm.make_sure_path_exists(image_dir)
    cm.make_sure_path_exists(image_dirs['thumb'])

    results = []
    image_nodes = pq(body)('#productSheetSlideshow ul.bigs li img')

    # The rendition id belongs to the page, not to an individual image, so it
    # is looked up once; without it no rendition URL can be built.
    mt = re.search(
        r'RENDITIONS\["tablet"\]\["productMain"\]\s*=\s*\'([^\']+)\'', body)
    if not mt:
        return results
    jcr = mt.group(1)

    for img_node in image_nodes:
        attrib = img_node.attrib
        # Prefer the lazy-loading attribute and fall back to the plain src.
        src = attrib['data-src'] if 'data-src' in attrib else (
            attrib['src'] if 'src' in attrib else '')
        if not src:
            # Nothing to download for an image without a source.
            continue

        base_name = os.path.splitext(os.path.split(src)[1])[0]
        # Relative sources are resolved against the image host; absolute
        # http/https sources are used unchanged.
        if re.search(r'^https?://', src) is None:
            url = hosts['image_host'] + src
        else:
            url = src
        # Skip URLs that end in '/' (no final path component).
        if re.search(r'([^/]+$)', url) is None:
            continue
        url_thumb = u'{0}/jcr:content/renditions/{1}_{2}.jpg'.format(
            url, base_name, jcr)

        # NOTE(review): the previous code called fetch_image(url_thumb,
        # logger), i.e. this very function, whose list result cannot be
        # indexed with 'body'. The downloader is assumed to be
        # cm.fetch_image returning a dict with 'body' and 'image_ext' --
        # confirm against the cm module.
        response = cm.fetch_image(url_thumb, logger)
        if response is None or len(response['body']) == 0:
            continue

        # Write the image file.
        buf = response['body']
        fname = '{0}.{1}'.format(
            hashlib.sha1(url_thumb.encode('utf-8')).hexdigest(),
            response['image_ext'])
        full_name = os.path.normpath(os.path.join(image_dir, fname))
        with open(full_name, 'wb') as f:
            f.write(buf)

        results.append([
            'True', {
                'checksum': hashlib.md5(buf).hexdigest(),
                'url': url_thumb,
                'path': 'full/{0}'.format(fname)
            }
        ])

    return results
Exemple #5
0
def get_data_path(brand_id):
    """
    Return the data directory of a brand.

    The directory is ``<base_path>/data/<brand_id>_<brand_name>``, normalised
    with ``os.path.normpath``. The brand name is the ``'brandname_e'`` field
    of ``cm.fetch_brand_by_id(brand_id)`` passed through
    ``cm.norm_brand_name``.

    :param brand_id: identifier handed to ``cm.fetch_brand_by_id``.
    :return: normalised path of the brand's data directory.
    """
    brand_record = cm.fetch_brand_by_id(brand_id)
    folder = '{0}_{1}'.format(
        brand_id, cm.norm_brand_name(brand_record['brandname_e']))
    data_dir = os.path.join(base_path, 'data', folder)
    return os.path.normpath(data_dir)