Example #1
0
def crawl_goods_by_price_section(category=None):
    """Crawl every result page for ``category`` and collect the parsed items.

    The root URL is requested first to read ``data.total_page`` and
    ``data.total_count``. Pages 1..total_page are then requested one by one
    and each entry of ``data.items`` is handed to ``collect_item``.

    Args:
        category: Forwarded unchanged to ``goods_section_root_url`` and
            ``goods_section_page_url``.

    Returns:
        A list with every non-``None`` result of ``collect_item``. The list is
        empty when the root request returns ``None``; pages whose request
        returns ``None`` are skipped.
    """
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = requester.get_json_dict(root_url)
    collected = []

    # Without a root response there is no page count to iterate over.
    if root_json is None:
        return collected

    total_page = root_json['data']['total_page']
    total_count = root_json['data']['total_count']
    log.info('Totally {} items of {} pages to crawl.'.format(
        total_count, total_page))

    for page_num in range(1, total_page + 1):
        log.info('Page {} / {}'.format(page_num, total_page))
        page_url = goods_section_page_url(category, page_num)
        page_json = requester.get_json_dict(page_url)
        if page_json is None:
            continue

        # Parse lazily, keeping only the entries collect_item accepted.
        parsed = (collect_item(raw) for raw in page_json['data']['items'])
        collected.extend(entry for entry in parsed if entry is not None)

    return collected
Example #2
0
def crawl_only_price_section():
    """Crawl all result pages, enrich the items and return them tabulated.

    The root URL supplies ``data.total_page`` / ``data.total_count``; each
    page's ``data.items`` entries are passed to ``collect_item``. The
    collected list is always handed to ``enrich_item_with_price_history`` and
    then to ``persist_util.tabulate`` -- even when nothing was collected
    because the root request returned ``None``.

    Returns:
        Whatever ``persist_util.tabulate`` returns for the collected items.
    """
    root_url = goods_section_root_url()
    log.info('GET: {}'.format(root_url))

    root_json = requester.get_json_dict(root_url)
    gathered = []

    if root_json is not None:
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(
            total_count, total_page))

        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(page_num)
            page_json = requester.get_json_dict(page_url)
            if page_json is None:
                # No response for this page: move on to the next one.
                continue

            parsed = (collect_item(raw) for raw in page_json['data']['items'])
            gathered.extend(entry for entry in parsed if entry is not None)

    enrich_item_with_price_history(gathered)
    return persist_util.tabulate(gathered)
Example #3
0
def crawl_goods_by_price_section(category=None):
    """Crawl every result page for ``category`` and collect the parsed items.

    The root URL is requested first to learn ``data.total_count``; pages are
    then requested with the maximum page size and every entry of
    ``data.items`` is handed to ``collect_item``.

    Args:
        category: Forwarded unchanged to ``goods_section_root_url`` and
            ``goods_section_page_url``.

    Returns:
        A list with every non-``None`` result of ``collect_item``. The list is
        empty when the root request returns ``None`` or when the root payload
        lacks ``total_page`` / ``total_count``.

    Raises:
        SystemExit: With exit status 1 when the root payload has no ``data``
            key (the logged hint points at an invalid buff cookie).
    """
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = get_json_dict(root_url, buff_cookies)

    category_items = []

    if root_json is None:
        return category_items

    if 'data' not in root_json:
        log.error('Error happens:')
        log.error(root_json)
        if 'error' in root_json:
            # format() instead of '+': the error field may not be a string,
            # and a TypeError here would hide the actual failure.
            log.error('Error field: {}'.format(root_json['error']))
        log.error('Please paste correct buff cookie to config, current cookie:{}'.format(BUFF_COOKIE))
        # Same effect as exit(1), but does not rely on the `site` module
        # having injected the interactive `exit` helper.
        raise SystemExit(1)

    root_data = root_json['data']
    if ('total_page' not in root_data) or ('total_count' not in root_data):
        log.error("No specific page and count info for root page. Please check buff data structure.")
        # Previously execution fell through and died with a KeyError on the
        # lookups below; there is nothing to crawl without these fields.
        return category_items

    total_page = root_data['total_page']
    total_count = root_data['total_count']

    # Original author's note: buff accepts a page_size parameter (default 20
    # items per page, at most 80). Using 80 cuts the number of requests to a
    # quarter. Not configurable for now; hard-coded here.
    use_max_page_size = True
    max_page_size = 80
    default_page_size = 20

    # Recompute the page count for the larger page size.
    if use_max_page_size:
        total_page = math.ceil(total_count / max_page_size)

    # Loop-invariant, so resolve it once.
    page_size = max_page_size if use_max_page_size else default_page_size

    log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))
    # get each page
    for page_num in range(1, total_page + 1):
        log.info('Page {} / {}'.format(page_num, total_page))
        page_url = goods_section_page_url(category, page_num, page_size=page_size)
        page_json = get_json_dict(page_url, buff_cookies)
        if (page_json is None) or ('data' not in page_json) or ('items' not in page_json['data']):
            log.warn("No specific data for page {}. Skip this page.".format(page_url))
            continue

        # items on this page
        for item in page_json['data']['items']:
            csgo_item = collect_item(item)
            if csgo_item is not None:
                category_items.append(csgo_item)

    return category_items
Example #4
0
def crawl_goods_by_price_section(category=None):
    """Crawl every result page for ``category`` and collect the parsed items.

    The root URL is requested first to read ``data.total_page`` and
    ``data.total_count``; pages 1..total_page are then requested and every
    entry of ``data.items`` is handed to ``collect_item``.

    Args:
        category: Forwarded unchanged to ``goods_section_root_url`` and
            ``goods_section_page_url``.

    Returns:
        A list with every non-``None`` result of ``collect_item``. The list is
        empty when the root request returns ``None`` or when the root payload
        lacks ``total_page`` / ``total_count``.

    Raises:
        SystemExit: With exit status 1 when the root payload has no ``data``
            key (the logged hint points at an invalid buff cookie).
    """
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = get_json_dict(root_url, buff_cookies)

    category_items = []

    if root_json is None:
        return category_items

    if 'data' not in root_json:
        log.error('Error happens:')
        log.error(root_json)
        if 'error' in root_json:
            # format() instead of '+': the error field may not be a string,
            # and a TypeError here would hide the actual failure.
            log.error('Error field: {}'.format(root_json['error']))
        log.error(
            'Please paste correct buff cookie to config, current cookie:{}'
            .format(BUFF_COOKIE))
        # Same effect as exit(1), but does not rely on the `site` module
        # having injected the interactive `exit` helper.
        raise SystemExit(1)

    root_data = root_json['data']
    if ('total_page' not in root_data) or ('total_count' not in root_data):
        log.error(
            "No specific page and count info for root page. Please check buff data structure."
        )
        # Previously execution fell through and died with a KeyError on the
        # lookups below; there is nothing to crawl without these fields.
        return category_items

    total_page = root_data['total_page']
    total_count = root_data['total_count']
    log.info('Totally {} items of {} pages to crawl.'.format(
        total_count, total_page))

    # get each page
    for page_num in range(1, total_page + 1):
        log.info('Page {} / {}'.format(page_num, total_page))
        page_url = goods_section_page_url(category, page_num)
        page_json = get_json_dict(page_url, buff_cookies)
        if (page_json is None) or ('data' not in page_json) or (
                'items' not in page_json['data']):
            log.warn(
                "No specific data for page {}. Skip this page.".format(
                    page_url))
            continue

        # items on this page
        for item in page_json['data']['items']:
            csgo_item = collect_item(item)
            if csgo_item is not None:
                category_items.append(csgo_item)

    return category_items
Example #5
0
def crawl_item_history_price(index, item, total_price_number):
    """Fetch the steam price history of ``item`` and store the recent prices.

    The response's ``prices`` list is walked from its last entry backwards.
    Each three-element entry contributes ``float(entry[1])`` repeated
    ``int(entry[2])`` times. The walk stops after the first entry whose date
    is more than ``days`` days old (that entry is still counted). ``days`` is
    the age in days of the first ``prices`` entry, capped at 7, or 0 when the
    list is empty.

    Args:
        index: Position of this item, used only in the log line.
        item: Object providing ``name`` and ``set_history_prices``; also
            passed to ``steam_price_history_url``.
        total_price_number: Total number of items, used only in the log line.

    Returns:
        None. Nothing is stored or summarised when the response is ``None``
        or has no ``prices`` key.
    """
    history_prices = []

    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = get_json_dict(steam_price_url, steam_cookies, True)

    # key existence check
    if steam_history_prices is None or 'prices' not in steam_history_prices:
        return

    def age_in_days(stamp):
        # Whole days between today and a timestamp in the response's format.
        today = datetime.today().date()
        return (today - datetime.strptime(stamp, '%b %d %Y %H: +0').date()).days

    raw_price_history = steam_history_prices['prices']
    days = min(age_in_days(raw_price_history[0][0]), 7) if len(raw_price_history) > 0 else 0

    for entry in reversed(raw_price_history):
        if len(entry) == 3:
            # One price per counted occurrence in this entry.
            history_prices.extend(
                float(entry[1]) for _ in range(0, int(entry[2])))
        if age_in_days(entry[0]) > days:
            break

    # set history price if exist
    if history_prices:
        item.set_history_prices(history_prices, days)

    log.info(
        'totally {} pieces of price history in {} days for {}\n'.format(
            len(history_prices), days, item.name))
Example #6
0
def crawl_goods_by_price_section(category=None):
    """Crawl every result page for ``category`` and collect the parsed items.

    The root URL supplies ``data.total_page`` / ``data.total_count``; each
    page's ``data.items`` entries are passed to ``collect_item``.

    Args:
        category: Forwarded unchanged to ``goods_section_root_url`` and
            ``goods_section_page_url``.

    Returns:
        A list with every non-``None`` result of ``collect_item``; empty when
        the root request returns ``None``.

    Raises:
        SystemExit: With exit status 1 when the root payload has no ``data``
            key.
    """
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = requester.get_json_dict(root_url)
    gathered = []

    if root_json is None:
        return gathered

    if 'data' not in root_json:
        # Dump what the site returned, then stop the program.
        log.info('Error happens!')
        log.info('网站返回信息:')
        log.info(root_json)
        if 'error' in root_json:
            log.info('错误为: ' + root_json['error'])
        log.info('如果是登录问题,请先在浏览器登录buff,再粘贴正确的cookie到程序中。当前粘贴的cookie为:' +
                 COOKIE)
        sys.exit(1)

    total_page = root_json['data']['total_page']
    total_count = root_json['data']['total_count']
    log.info('Totally {} items of {} pages to crawl.'.format(
        total_count, total_page))

    for page_num in range(1, total_page + 1):
        log.info('Page {} / {}'.format(page_num, total_page))
        page_url = goods_section_page_url(category, page_num)
        page_json = requester.get_json_dict(page_url)
        if page_json is None:
            continue

        parsed = (collect_item(raw) for raw in page_json['data']['items'])
        gathered.extend(entry for entry in parsed if entry is not None)

    return gathered
Example #7
0
def collect_single_category(category):
    """Collect the parsed items of one category across all of its pages.

    Args:
        category: Forwarded to ``category_root_url`` and ``category_page_url``
            and used in log messages.

    Returns:
        A list with every non-``None`` result of ``collect_item``. Empty when
        the root request returns ``None``; pages whose request returns
        ``None`` are logged as errors and skipped.
    """
    harvested = []

    category_url = category_root_url(category)
    log.info("GET({}): {}".format(category, category_url))
    category_json = requester.get_json_dict(category_url)

    # No root response: nothing to iterate.
    if category_json is None:
        return harvested

    overview = category_json['data']
    total_page = overview['total_page']
    total_count = overview['total_count']

    for page_num in range(1, total_page + 1):
        url = category_page_url(page_num, category)
        page_json = requester.get_json_dict(url)

        if page_json is not None:
            page_data = page_json['data']
            log.info("GET({} page {}/{}, item {}/{}): {}".format(
                category, page_num, total_page, page_data['page_size'],
                total_count, url))

            parsed = (collect_item(raw) for raw in page_data['items'])
            harvested.extend(entry for entry in parsed if entry is not None)
        else:
            log.error('Timeout for page {} of {}. SKIP'.format(
                page_num, category))

    log.info("Finish parsing category {}. Total effective items: {}\n".format(
        category, len(harvested)))
    return harvested
def crawl_item_history_price(index, item, total_price_number, proxy):
    """Fetch the price history of ``item`` through ``proxy`` and store it.

    Every two-element entry of ``data.price_history`` contributes
    ``float(entry[1]) * DOLLAR_TO_CNY``. When at least one price was found it
    is stored via ``item.set_history_prices`` together with ``data.days``.

    Args:
        index: Position of this item, used only in the log line.
        item: Object providing ``id``, ``name`` and ``set_history_prices``.
        total_price_number: Total number of items, used only in the log line.
        proxy: Forwarded to ``requester.get_json_dict``.

    Returns:
        None. Nothing is stored or summarised when the request returns
        ``None``.
    """
    # The price API URL is derived from item.id.
    steam_price_url = steam_price_history_url(item.id)
    log.info('GET {} 的steam价格信息 处理序列 第{}个/共{}个 : steam对应价格api接口 {}'.format(
        item.name, index, total_price_number, steam_price_url))
    steam_history_prices = requester.get_json_dict(steam_price_url, proxy)

    # Sample response recorded by the original author:
    # {
    #   "code": "OK",
    #   "data": {
    #     "currency": "\u4eba\u6c11\u5e01",
    #     "currency_symbol": "\u00a5",
    #     "days": 7,
    #     "price_history": [
    #                         [
    #                             1587834000000,
    #                             180.94
    #                         ],
    #                      ],
    #     "price_type": "Steam\u4ef7\u683c",
    #     "steam_price_currency": "\u5143"
    #   },
    #   "msg": null
    # }
    if steam_history_prices is None:
        return

    payload = steam_history_prices['data']
    days = payload['days']
    # Keep only well-formed [timestamp, price] entries.
    history_prices = [
        float(entry[1]) * DOLLAR_TO_CNY
        for entry in payload['price_history']
        if len(entry) == 2
    ]

    # set history price if exist
    if history_prices:
        # Original author's note: the item class derives further statistics
        # (e.g. the average price) when the history is set.
        item.set_history_prices(history_prices, days)

    log.info('{} 在最近 {} 天里有共 {} 件交易记录 \n'.format(
        item.name, days, len(history_prices)))
def crawl_item_history_price(index, item, total_price_number):
    """Fetch the steam price history of ``item`` and delegate its processing.

    When the response contains a ``prices`` key, ``key_existence_check`` is
    called with the item, an initially empty price list and the response; its
    return value is logged as the number of days covered.

    Args:
        index: Position of this item, used only in the log line.
        item: Object providing ``name``; also passed to
            ``steam_price_history_url`` and ``key_existence_check``.
        total_price_number: Total number of items, used only in the log line.

    Returns:
        None.
    """
    collected_prices = []

    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))

    # Original author's note: for synchronous crawling, is_steam_request was
    # introduced to shorten the crawl interval for the steam market.
    steam_history_prices = get_json_dict(
        steam_price_url, steam_cookies, is_steam_request=1)

    # key existence check
    if steam_history_prices is None or 'prices' not in steam_history_prices:
        return

    # presumably key_existence_check fills collected_prices in place -- the
    # length logged below relies on that; confirm against its definition.
    days = key_existence_check(item, collected_prices, steam_history_prices)

    log.info(
        'totally {} pieces of price history in {} days for {}\n'.format(
            len(collected_prices), days, item.name))
Example #10
0
def crawl_item_history_price(index, item, total_price_number):
    """Fetch the price history of ``item`` and store it on the item.

    Every two-element entry of ``data.price_history`` contributes
    ``float(entry[1]) * DOLLAR_TO_CNY``. When at least one price was found it
    is stored via ``item.set_history_prices`` together with ``data.days``.

    Args:
        index: Position of this item, used only in the log line.
        item: Object providing ``id``, ``name`` and ``set_history_prices``.
        total_price_number: Total number of items, used only in the log line.

    Returns:
        None. Nothing is stored or summarised when the request returns
        ``None``.
    """
    steam_price_url = steam_price_history_url(item.id)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = requester.get_json_dict(steam_price_url)

    if steam_history_prices is None:
        return

    payload = steam_history_prices['data']
    days = payload['days']
    # Keep only well-formed two-element entries.
    history_prices = [
        float(entry[1]) * DOLLAR_TO_CNY
        for entry in payload['price_history']
        if len(entry) == 2
    ]

    # set history price if exist
    if history_prices:
        item.set_history_prices(history_prices, days)

    log.info(
        'totally {} pieces of price history in {} days for {}\n'.format(
            len(history_prices), days, item.name))
Example #11
0
async def crawl_goods_by_price_section(category=None):
    """Crawl every result page for ``category`` and fetch price histories.

    The root URL is requested first to learn ``data.total_count``; pages are
    then requested with the maximum page size. Every entry of ``data.items``
    is handed to ``collect_item``; for each accepted item an
    ``async_crawl_item_history_price`` coroutine is scheduled, and all
    coroutines of a page are awaited together before the next page is
    requested.

    Args:
        category: Forwarded unchanged to ``goods_section_root_url`` and
            ``goods_section_page_url``.

    Returns:
        The list of non-``None`` ``collect_item`` results. ``None`` when the
        root payload has no ``data`` key (unchanged from before). An empty
        list when the root request returns ``None`` or the payload lacks
        ``total_page`` / ``total_count``.
    """
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = get_json_dict(root_url, config.BUFF_COOKIE)
    category_items = []

    # A failed root request used to crash with TypeError on the membership
    # test below; treat it as "nothing to crawl" instead.
    if root_json is None:
        log.error('No response for root page {}. Nothing to crawl.'.format(
            root_url))
        return category_items

    if 'data' not in root_json:
        log.error('Error happens:')
        log.error(root_json)
        if 'error' in root_json:
            # format() instead of '+': the error field may not be a string.
            log.error('Error field: {}'.format(root_json['error']))
        log.error(
            'Please paste correct buff cookie to config, current cookie:' +
            str(config.BUFF_COOKIE))
        return None

    root_data = root_json['data']
    if ('total_page' not in root_data) or ('total_count' not in root_data):
        log.error(
            "No specific page and count info for root page. Please check buff data structure."
        )
        # Previously execution fell through and died with a KeyError below.
        return category_items

    total_page = root_data['total_page']
    total_count = root_data['total_count']

    # Original author's note: buff accepts a page_size parameter (default 20
    # items per page, at most 80). Using 80 cuts the number of requests to a
    # quarter. Not configurable for now; hard-coded here.
    use_max_page_size = True
    max_page_size = 80
    default_page_size = 20

    # Recompute the page count for the larger page size.
    if use_max_page_size:
        total_page = math.ceil(total_count / max_page_size)

    # Loop-invariant, so resolve it once.
    page_size = max_page_size if use_max_page_size else default_page_size

    log.info('Totally {} items of {} pages to crawl.'.format(
        total_count, total_page))

    # Build the connector only after validation so that the early returns
    # above never leave an unclosed connector behind; from here on the
    # session owns it and closes it on exit.
    timeout = aiohttp.ClientTimeout(total=30 * 60)
    if config.PROXY:
        # use socks
        connector = ProxyConnector.from_url(config.PROXY, limit=5)
    else:
        connector = aiohttp.TCPConnector(limit=5)

    async with aiohttp.ClientSession(cookies=config.STEAM_COOKIE,
                                     headers=get_headers(),
                                     connector=connector,
                                     timeout=timeout) as session:
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(category,
                                              page_num,
                                              page_size=page_size)
            page_json = get_json_dict(page_url, config.BUFF_COOKIE)
            if (page_json is None) or ('data' not in page_json) or (
                    'items' not in page_json['data']):
                log.warn(
                    "No specific data for page {}. Skip this page.".format(
                        page_url))
                continue

            # One batch of history-price coroutines per page.
            tasks = []
            for item in page_json['data']['items']:
                csgo_item = collect_item(item)
                if csgo_item is None:
                    continue
                category_items.append(csgo_item)
                try:
                    tasks.append(
                        async_crawl_item_history_price(
                            len(category_items), csgo_item, session))
                except Exception:
                    log.error(traceback.format_exc())

            stamp = time.time()
            try:
                await asyncio.gather(*tasks)
            except Exception:
                # Best effort: a failing history request must not abort the
                # crawl of the remaining pages.
                log.error(traceback.format_exc())
            if not exist(page_url):
                await timer.async_sleep_awhile(0, time.time() - stamp)
    return category_items
Example #12
0
def crawl_goods_by_price_section(category=None):
    """Crawl every result page for ``category`` and collect the parsed items.

    Original author's description: searches prices for goods of the given
    category using the JSON returned by buff's own API; with
    ``category=None`` the search covers all items.

    Requests rotate through ``proxies``: the root request uses the first
    proxy and each page request takes the next one, wrapping around.

    Args:
        category: Forwarded unchanged to ``goods_section_root_url`` and
            ``goods_section_page_url``.

    Returns:
        A list with every non-``None`` result of ``collect_item``; empty when
        the root request returns ``None``.

    Raises:
        SystemExit: With exit status 1 when the root payload has no ``data``
            key.
    """
    # One shared iterator for the whole crawl. Calling next(cycle(proxies))
    # each time built a fresh cycle and therefore always returned the first
    # proxy, so no rotation ever happened.
    proxy_pool = cycle(proxies)

    # Original author's notes: category corresponds to the item name; the
    # root URL honours the configured min/max price bounds and is the page=0
    # API endpoint. Because of a buff bug the total result count has to be
    # fetched with a very large number, which is why this URL differs from
    # goods_section_page_url.
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = requester.get_json_dict(root_url, next(proxy_pool))
    # Basic shape of the JSON, as recorded by the original author:
    # {
    #   "code": "OK",
    #   "data": {
    #     "items": [],
    #     "page_num": 6,
    #     "page_size": 20,
    #     "total_count": 0,
    #     "total_page": 0
    #   },
    #   "msg": null
    # }

    category_items = []

    if root_json is None:
        return category_items

    if 'data' not in root_json:
        # No data in the response: report what came back and stop.
        log.info('Error happens!')
        log.info('网站返回信息:')
        log.info(root_json)
        if 'error' in root_json:
            # format() instead of '+': the error field may not be a string.
            log.info('错误为: {}'.format(root_json['error']))
        log.info('如果是登录问题,请先在浏览器登录buff,再粘贴正确的cookie到程序中。当前粘贴的cookie为:{}'.format(COOKIE))
        sys.exit(1)

    total_page = root_json['data']['total_page']
    total_count = root_json['data']['total_count']
    if total_count == 0:
        log.info('{} 没有符合要求的item!'.format(category))
    else:
        log.info('共有{}件物品item满足爬取条件,一共{}页'.format(total_count, total_page))

    # get each page
    for page_num in range(1, total_page + 1):
        log.info('第 {} 页 / 共 {} 页'.format(page_num, total_page))
        page_url = goods_section_page_url(category, page_num)
        page_json = requester.get_json_dict(page_url, next(proxy_pool))
        if page_json is None:
            continue

        # items on this page
        for item in page_json['data']['items']:
            # collect_item's result is kept only when it is not None.
            csgo_item = collect_item(item)
            if csgo_item is not None:
                category_items.append(csgo_item)

    return category_items