def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))
    root_json = requester.get_json_dict(root_url)

    category_items = []
    if root_json is not None:
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))

        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(category, page_num)
            page_json = requester.get_json_dict(page_url)
            if page_json is not None:
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)

    return category_items

def crawl_only_price_section():
    root_url = goods_section_root_url()
    log.info('GET: {}'.format(root_url))
    root_json = requester.get_json_dict(root_url)

    csgo_items = []
    if root_json is not None:
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))

        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(page_num)
            page_json = requester.get_json_dict(page_url)
            if page_json is not None:
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        csgo_items.append(csgo_item)

    enrich_item_with_price_history(csgo_items)
    return persist_util.tabulate(csgo_items)

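# Hedged sketch, not the project's actual code: enrich_item_with_price_history is
# called above but not defined in this section. Assuming it simply walks the
# collected items and delegates to a per-item price crawler such as
# crawl_item_history_price (one variant of which appears later in this section).
def enrich_item_with_price_history(csgo_items):
    total_price_number = len(csgo_items)
    for index, item in enumerate(csgo_items, start=1):
        crawl_item_history_price(index, item, total_price_number)
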
def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))
    root_json = get_json_dict(root_url, buff_cookies)

    category_items = []
    if root_json is not None:
        if 'data' not in root_json:
            log.error('Error happens:')
            log.error(root_json)
            if 'error' in root_json:
                log.error('Error field: ' + root_json['error'])
            log.error('Please paste correct buff cookie to config, current cookie:' + BUFF_COOKIE)
            exit(1)
        if ('total_page' not in root_json['data']) or ('total_count' not in root_json['data']):
            log.error("No specific page and count info for root page. Please check buff data structure.")
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']

        # buff has a page_size parameter: 20 items per page by default, 80 at most.
        # Using 80 cuts the number of requests sent to buff to a quarter of the default.
        # Not exposed as a config option for now; hard-coded here.
        use_max_page_size = True
        max_page_size = 80
        default_page_size = 20
        # recompute the page count when requesting 80 items per page
        if use_max_page_size:
            total_page = math.ceil(total_count / max_page_size)
        log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))

        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(
                category, page_num,
                page_size=max_page_size if use_max_page_size else default_page_size
            )
            page_json = get_json_dict(page_url, buff_cookies)
            if (page_json is not None) and ('data' in page_json) and ('items' in page_json['data']):
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)
            else:
                log.warn("No specific data for page {}. Skip this page.".format(page_url))

    return category_items

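# Hedged sketch, for illustration only: one possible shape of goods_section_page_url
# accepting the page_size keyword used above. GOODS_SECTION_API and the query
# parameter names are assumptions, not the project's confirmed endpoint.
def goods_section_page_url(category, page_num, page_size=20):
    url = GOODS_SECTION_API + '?page_num={}&page_size={}'.format(page_num, page_size)
    if category is not None:
        url += '&category={}'.format(category)
    return url
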
def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))
    root_json = get_json_dict(root_url, buff_cookies)

    category_items = []
    if root_json is not None:
        if 'data' not in root_json:
            log.error('Error happens:')
            log.error(root_json)
            if 'error' in root_json:
                log.error('Error field: ' + root_json['error'])
            log.error('Please paste correct buff cookie to config, current cookie:' + BUFF_COOKIE)
            exit(1)
        if ('total_page' not in root_json['data']) or ('total_count' not in root_json['data']):
            log.error("No specific page and count info for root page. Please check buff data structure.")
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))

        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(category, page_num)
            page_json = get_json_dict(page_url, buff_cookies)
            if (page_json is not None) and ('data' in page_json) and ('items' in page_json['data']):
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)
            else:
                log.warn("No specific data for page {}. Skip this page.".format(page_url))

    return category_items

def crawl_item_history_price(index, item, total_price_number):
    history_prices = []
    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = get_json_dict(steam_price_url, steam_cookies, True)

    # key existence check
    if (steam_history_prices is not None) and ('prices' in steam_history_prices):
        raw_price_history = steam_history_prices['prices']
        if len(raw_price_history) > 0:
            days = min((datetime.today().date() - datetime.strptime(
                raw_price_history[0][0], '%b %d %Y %H: +0').date()).days, 7)
        else:
            days = 0
        for pair in reversed(raw_price_history):
            if len(pair) == 3:
                for i in range(0, int(pair[2])):
                    history_prices.append(float(pair[1]))
            if (datetime.today().date() - datetime.strptime(
                    pair[0], '%b %d %Y %H: +0').date()).days > days:
                break

        # set history price if exist
        if len(history_prices) != 0:
            item.set_history_prices(history_prices, days)
        log.info('totally {} pieces of price history in {} days for {}\n'.format(
            len(history_prices), days, item.name))

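# Illustrative example only: the '%b %d %Y %H: +0' pattern used above assumes Steam
# returns price-history entries shaped like [date_string, median_price, volume].
# The sample entry below is made up; it only demonstrates how one entry is parsed.
from datetime import datetime

sample_pair = ['Apr 25 2020 01: +0', 1.23, '5']
sample_day = datetime.strptime(sample_pair[0], '%b %d %Y %H: +0').date()
age_in_days = (datetime.today().date() - sample_day).days
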
def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))
    root_json = requester.get_json_dict(root_url)

    category_items = []
    if root_json is not None:
        if 'data' not in root_json:
            log.info('Error happens!')
            log.info('Response from the site:')
            log.info(root_json)
            if 'error' in root_json:
                log.info('Error: ' + root_json['error'])
            log.info('If this is a login problem, log in to buff in the browser first, '
                     'then paste the correct cookie into the program. Current cookie: ' + COOKIE)
            sys.exit(1)
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))

        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(category, page_num)
            page_json = requester.get_json_dict(page_url)
            if page_json is not None:
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)

    return category_items

def collect_single_category(category):
    csgo_category_item = []
    category_url = category_root_url(category)
    log.info("GET({}): {}".format(category, category_url))
    category_json = requester.get_json_dict(category_url)

    # return if request timeout
    if category_json is None:
        return csgo_category_item

    total_page = category_json['data']['total_page']
    total_count = category_json['data']['total_count']
    for page_num in range(1, total_page + 1):
        url = category_page_url(page_num, category)
        page_items = requester.get_json_dict(url)

        # skip this page if request timeout
        if page_items is None:
            log.error('Timeout for page {} of {}. SKIP'.format(page_num, category))
            continue

        current_count = page_items['data']['page_size']
        log.info("GET({} page {}/{}, item {}/{}): {}".format(
            category, page_num, total_page, current_count, total_count, url))
        items = page_items['data']['items']
        for item in items:
            csgo_item = collect_item(item)
            if csgo_item is not None:
                csgo_category_item.append(csgo_item)

    log.info("Finish parsing category {}. Total effective items: {}\n".format(
        category, len(csgo_category_item)))
    return csgo_category_item

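# Hedged sketch of collect_item as it is used throughout this section: build an
# item object from one entry of data['items'], or return None when the entry should
# be skipped. The CsgoItem constructor and the field names used here ('name',
# 'sell_min_price') are assumptions for illustration, not confirmed API fields.
def collect_item(item):
    try:
        return CsgoItem(item['name'], float(item['sell_min_price']))
    except (KeyError, ValueError):
        return None
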
def crawl_item_history_price(index, item, total_price_number, proxy):
    history_prices = []
    item_id = item.id
    steam_price_url = steam_price_history_url(item_id)  # resolve the steam price API endpoint from item.id
    log.info('GET steam price info for {} ({}/{}): steam price API endpoint {}'.format(
        item.name, index, total_price_number, steam_price_url))
    steam_history_prices = requester.get_json_dict(steam_price_url, proxy)
    """
    The JSON format is as follows:
    {
        "code": "OK",
        "data": {
            "currency": "\u4eba\u6c11\u5e01",
            "currency_symbol": "\u00a5",
            "days": 7,
            "price_history": [
                [
                    1587834000000,
                    180.94
                ],
            ],
            "price_type": "Steam\u4ef7\u683c",
            "steam_price_currency": "\u5143"
        },
        "msg": null
    }
    """
    if steam_history_prices is not None:
        days = steam_history_prices['data']['days']
        raw_price_history = steam_history_prices['data']['price_history']
        for pair in raw_price_history:
            if len(pair) == 2:
                history_prices.append(float(pair[1]) * DOLLAR_TO_CNY)  # collect the history price list

        # set history price if exist
        if len(history_prices) != 0:
            # the item class also derives further stats, such as the average price
            item.set_history_prices(history_prices, days)
        log.info('{} in the last {} days has {} transaction records in total\n'.format(
            item.name, days, len(history_prices)))

def crawl_item_history_price(index, item, total_price_number):
    history_prices = []
    steam_price_url = steam_price_history_url(item)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    # (in synchronous crawling, is_steam_request reduces the crawl interval against the steam market)
    steam_history_prices = get_json_dict(steam_price_url, steam_cookies, is_steam_request=1)

    # key existence check
    if (steam_history_prices is not None) and ('prices' in steam_history_prices):
        days = key_existence_check(item, history_prices, steam_history_prices)
        log.info('totally {} pieces of price history in {} days for {}\n'.format(
            len(history_prices), days, item.name))

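# Hedged sketch of the key_existence_check helper referenced above. Its name and
# signature come from the call site; the body mirrors the inline parsing in the
# earlier synchronous variant (cap the window at 7 days, repeat each price by its
# sold volume, set the item's history prices, return the window length in days).
# It assumes the same module-level `from datetime import datetime` as the code above.
def key_existence_check(item, history_prices, steam_history_prices):
    raw_price_history = steam_history_prices['prices']
    if len(raw_price_history) > 0:
        days = min((datetime.today().date() - datetime.strptime(
            raw_price_history[0][0], '%b %d %Y %H: +0').date()).days, 7)
    else:
        days = 0
    for pair in reversed(raw_price_history):
        if len(pair) == 3:
            for _ in range(int(pair[2])):
                history_prices.append(float(pair[1]))
        if (datetime.today().date() - datetime.strptime(
                pair[0], '%b %d %Y %H: +0').date()).days > days:
            break
    if len(history_prices) != 0:
        item.set_history_prices(history_prices, days)
    return days
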
def crawl_item_history_price(index, item, total_price_number):
    history_prices = []
    item_id = item.id
    steam_price_url = steam_price_history_url(item_id)
    log.info('GET steam history price {}/{} for ({}): {}'.format(
        index, total_price_number, item.name, steam_price_url))
    steam_history_prices = requester.get_json_dict(steam_price_url)

    if steam_history_prices is not None:
        days = steam_history_prices['data']['days']
        raw_price_history = steam_history_prices['data']['price_history']
        for pair in raw_price_history:
            if len(pair) == 2:
                history_prices.append(float(pair[1]) * DOLLAR_TO_CNY)

        # set history price if exist
        if len(history_prices) != 0:
            item.set_history_prices(history_prices, days)
        log.info('totally {} pieces of price history in {} days for {}\n'.format(
            len(history_prices), days, item.name))

async def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))
    root_json = get_json_dict(root_url, config.BUFF_COOKIE)

    category_items = []
    tasks = []
    timeout = aiohttp.ClientTimeout(total=30 * 60)
    if config.PROXY:
        # use socks
        connector = ProxyConnector.from_url(config.PROXY, limit=5)
    else:
        connector = aiohttp.TCPConnector(limit=5)

    if 'data' not in root_json:
        log.error('Error happens:')
        log.error(root_json)
        if 'error' in root_json:
            log.error('Error field: ' + root_json['error'])
        log.error('Please paste correct buff cookie to config, current cookie:' + str(config.BUFF_COOKIE))
        return None
    if ('total_page' not in root_json['data']) or ('total_count' not in root_json['data']):
        log.error("No specific page and count info for root page. Please check buff data structure.")

    total_page = root_json['data']['total_page']
    total_count = root_json['data']['total_count']

    # buff has a page_size parameter: 20 items per page by default, 80 at most.
    # Using 80 cuts the number of requests sent to buff to a quarter of the default.
    # Not exposed as a config option for now; hard-coded here.
    use_max_page_size = True
    max_page_size = 80
    default_page_size = 20
    # recompute the page count when requesting 80 items per page
    if use_max_page_size:
        total_page = math.ceil(total_count / max_page_size)
    log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))

    async with aiohttp.ClientSession(cookies=config.STEAM_COOKIE, headers=get_headers(),
                                     connector=connector, timeout=timeout) as session:
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(
                category, page_num,
                page_size=max_page_size if use_max_page_size else default_page_size)
            page_json = get_json_dict(page_url, config.BUFF_COOKIE)
            if (page_json is not None) and ('data' in page_json) and ('items' in page_json['data']):
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)
                        try:
                            tasks.append(async_crawl_item_history_price(
                                len(category_items), category_items[-1], session))
                        except Exception as e:
                            log.error(traceback.format_exc())
                stamp = time.time()
                try:
                    await asyncio.gather(*tasks)
                except Exception as e:
                    log.error(traceback.format_exc())
                tasks = []
                if not exist(page_url):
                    await timer.async_sleep_awhile(0, time.time() - stamp)
            else:
                log.warn("No specific data for page {}. Skip this page.".format(page_url))

    return category_items

def crawl_goods_by_price_section(category=None):
    """
    Search prices for goods of the given category, using the JSON returned
    by buff's own API. If category is None, search across all items.
    """
    proxy_pool = cycle(proxies)  # build the proxy pool once so next() actually rotates
    proxy = next(proxy_pool)
    root_url = goods_section_root_url(category)
    # category is the item type name.
    # Based on the configured min/max price bounds, get the root of the search,
    # i.e. the API endpoint for page = 0.
    # Due to a buff bug, the total result count has to be requested with a very
    # large number, which is why this differs from goods_section_page_url.
    log.info('GET: {}'.format(root_url))
    root_json = requester.get_json_dict(root_url, proxy)  # fetch the JSON from the API endpoint
    """
    The basic JSON format is as follows:
    {
        "code": "OK",
        "data": {
            "items": [],
            "page_num": 6,
            "page_size": 20,
            "total_count": 0,
            "total_page": 0
        },
        "msg": null
    }
    """
    category_items = []
    if root_json is not None:
        if 'data' not in root_json:
            # report an error if the JSON carries no data
            log.info('Error happens!')
            log.info('Response from the site:')
            log.info(root_json)
            if 'error' in root_json:
                # the response carries an error field
                log.info('Error: ' + root_json['error'])
            log.info('If this is a login problem, log in to buff in the browser first, '
                     'then paste the correct cookie into the program. Current cookie: ' + COOKIE)
            sys.exit(1)
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        if total_count == 0:
            log.info('{}: no item meets the requirements!'.format(category))
        else:
            log.info('{} items meet the crawl criteria, {} pages in total'.format(total_count, total_page))

        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(category, page_num)  # search results for this page
            proxy = next(proxy_pool)  # rotate to the next proxy
            page_json = requester.get_json_dict(page_url, proxy)  # fetch this page's JSON
            if page_json is not None:
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item: collect_item extracts the item's price, name, etc. from the JSON;
                    # csgo_item is a dedicated class holding these fields
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)  # keep qualifying items

    return category_items
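
# Illustrative usage only: the entry point below is a placeholder, not part of the
# crawler code above, and assumes the module-level log and proxies setup already exists.
if __name__ == '__main__':
    crawled_items = crawl_goods_by_price_section()  # None means crawl every category
    log.info('Crawled {} items in total.'.format(len(crawled_items)))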