Beispiel #1
0
def get_monthly_sales_promotion_page_source(item_id,
                                            use_proxy=False,
                                            proxies=None):
    """
    Fetch the raw response of the mobile item-detail endpoint for one item.

    :param item_id: item ID, substituted into the URL-encoded ``itemNumId``
        field of the ``data`` query parameter
    :param use_proxy: forwarded unchanged to ``get_page_source``
    :param proxies: forwarded unchanged to ``get_page_source``
    :return: whatever ``get_page_source`` returns for the built URL
    """
    # The query string is already percent-encoded JSON: {"itemNumId":"<id>"}.
    detail_api = (
        'https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/'
        '?data=%7B%22itemNumId%22%3A%22{}%22%7D'
    )
    return get_page_source(detail_api.format(item_id), use_proxy, proxies)
Beispiel #2
0
def get_specs_additional_urls_page_source(page_source,
                                          use_proxy=False,
                                          proxies=None):
    """
    Fetch the page referenced by the first "httpsDescUrl" entry of an item page.

    :param page_source: item page source that is searched for "httpsDescUrl"
    :param use_proxy: whether to use a proxy (forwarded to ``get_page_source``)
    :param proxies: proxies (forwarded to ``get_page_source``)
    :return: result of ``get_page_source`` for the referenced URL, or ``''``
        when the item page contains no "httpsDescUrl" entry
    """
    match = re.search(r'"httpsDescUrl":"(.*?)",', page_source)
    if match is None:
        # Nothing to follow; no request is made.
        return ''
    # The captured value is prefixed with the scheme before fetching.
    return get_page_source('https:' + match.group(1), use_proxy, proxies)
Beispiel #3
0
def get_accumulated_reviews(item_id, use_proxy=False, proxies=None):
    """
    Look up the accumulated review count ("rateTotal") of an item.

    :param item_id: item ID; any value ``str.format`` can render (str or int)
    :param use_proxy: whether to use a proxy (forwarded to ``get_page_source``)
    :param proxies: proxies (forwarded to ``get_page_source``)
    :return: digits of the first "rateTotal" value as a string, or ``''`` when
        the request yields nothing or the field is absent
    """
    # format() rather than '+' so a non-string item_id (e.g. an int) does not
    # raise TypeError; this matches how the sibling URL builders treat item_id.
    accumulated_reviews_url = (
        'https://dsr-rate.tmall.com/list_dsr_info.htm?itemId={}'.format(item_id)
    )
    page_source = get_page_source(accumulated_reviews_url, use_proxy, proxies)
    if not page_source:
        return ''
    match = re.search(r'"rateTotal":(\d+)', page_source)
    return match.group(1) if match else ''
Beispiel #4
0
def get_favorites(page_source):
    """
    Extract the popularity counter of an item via its "apiBeans" counter URL.

    The item page is searched for an "apiBeans" value (the text between
    ``"apiBeans":"`` and ``","idsMod"``). That value, prefixed with ``https:``
    and suffixed with ``&callback=json``, is fetched with ``get_page_source``.
    The last comma-separated piece of the URL is then used as the key whose
    numeric value is read from the response.

    :param page_source: item page source
    :return: the counter digits as a string, or ``''`` when the URL, the
        response, or the key is missing
    """
    favorites = ''
    url_match = re.search(r'"apiBeans":"(.*)","idsMod"', page_source)
    if url_match is None:
        return favorites

    api_beans = 'https:' + url_match.group(1)
    # The key looked up in the response is the last comma-separated URL piece.
    counter_key = api_beans.split(',')[-1]

    counter_source = get_page_source(api_beans + '&callback=json')
    if not counter_source:
        return favorites

    # Escape the key: it comes from page content and may contain regex
    # metacharacters that would otherwise mis-match or raise re.error.
    value_match = re.search(
        r'"{}":(\d+)'.format(re.escape(counter_key)), counter_source)
    if value_match:
        favorites = value_match.group(1)
    return favorites
Beispiel #5
0
def main_detail(category, category_id, item_id, title, api):
    """
    Crawl one item detail page and assemble its data row.

    Obtains a proxy from ``give_me_proxy``, fetches ``api``, runs the
    individual extractors, appends the resulting row to the module-level
    ``detail_all`` list, logs success, prints the row and returns it.

    :param category: category value, stored as the first column
    :param category_id: category ID, stored as the last column
    :param item_id: item ID passed to the per-item requests
    :param title: item title (stored twice in the row, at index 2 and 13)
    :param api: URL of the item detail page
    :return: the assembled row (list), or ``None`` when the detail page could
        not be fetched
    """
    proxy, used = give_me_proxy()
    if proxy:
        logger.info(proxy.get('https') + ' is used for ' + api)

    page_source = get_page_source(api, use_proxy=used, proxies=proxy)
    if not page_source:
        return None

    proxy_kwargs = {'use_proxy': used, 'proxies': proxy}

    # Extractor order is kept as-is because several of them issue requests.
    colors = get_colors(get_tree(page_source))

    tag_source = get_tag_page_source(item_id, **proxy_kwargs)
    accumulated_reviews = get_accumulated_reviews(item_id, **proxy_kwargs)
    sales_promo_source = get_monthly_sales_promotion_page_source(
        item_id, **proxy_kwargs)
    specs_urls_source = get_specs_additional_urls_page_source(
        page_source, **proxy_kwargs)

    # Fields read from the detail page itself.
    inventory = get_inventory(page_source)
    favorites = get_favorites(page_source)
    list_price = get_list_price(page_source)
    description = get_description(page_source)
    product_skus = get_product_skus(page_source)
    seller_region = get_seller_region(page_source)
    seller_nickname = get_seller_nickname(page_source)

    # Fields read from the secondary pages fetched above.
    tag = get_tag(tag_source)
    tag_count = get_tag_count(tag_source)
    promotion = get_promotion(sales_promo_source)
    monthly_sales = get_monthly_sales(sales_promo_source)
    specs = get_specs(specs_urls_source)
    additional_urls = get_additional_urls(specs_urls_source)

    data = [
        category, item_id, title,
        inventory, monthly_sales, favorites, list_price, promotion,
        accumulated_reviews, tag_count, tag,
        colors, specs,
        title, description, api,
        seller_nickname, seller_region,
        additional_urls, product_skus,
        category_id,
    ]
    detail_all.append(data)
    logger.info('crawl {} succeed'.format(api))
    print(data)
    return data
Beispiel #6
0
def get_tag_page_source(item_id, use_proxy=False, proxies=None):
    """
    Fetch the tag-cloud listing (``listTagClouds.htm``) for one item.

    :param item_id: item ID placed in the ``itemId`` query parameter
    :param use_proxy: forwarded unchanged to ``get_page_source``
    :param proxies: forwarded unchanged to ``get_page_source``
    :return: whatever ``get_page_source`` returns for the built URL
    """
    query = 'itemId={}&isAll=true'.format(item_id)
    return get_page_source('https://rate.tmall.com/listTagClouds.htm?' + query,
                           use_proxy, proxies)